/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *                 ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *                            bytes_may_use as the ENOSPC accounting is done
 *                            elsewhere
 */
enum {
	RESERVE_FREE = 0,
	RESERVE_ALLOC = 1,
	RESERVE_ALLOC_NO_ACCOUNT = 2,
};

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
				       u64 num_bytes, int reserve,
				       int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}
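/*
 * add_excluded_extent() marks a range with EXTENT_UPTODATE in both
 * freed_extents trees and free_excluded_extents() clears the whole block
 * group range again once caching has finished.  Ranges excluded this way
 * (for example the superblock mirrors handled by exclude_super_stripes()
 * below) are skipped by the caching code and never added as free space.
 */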
static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(root, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_root *root,
				struct btrfs_block_group_cache *block_group)
{
	u64 start = block_group->key.objectid;
	u64 len = block_group->key.offset;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		root->nodesize : root->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * this is only called by cache_block_group, since we could have freed
 * extents we need to check the pinned_extents for any extents that can't
 * be used yet since their free space will be released as soon as the
 * transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
		       struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}
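/*
 * For example (illustrative numbers): if the block group covers [0, 1M)
 * and a single pinned/excluded extent occupies [256K, 320K), the loop above
 * adds [0, 256K) to the free space cache, skips the pinned range, and the
 * final branch adds the [320K, 1M) tail, returning 960K as total_added.
 */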
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(extent_root, block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->tree_root->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}
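/*
 * caching_thread() below is the work item queued by cache_block_group().
 * It fills in the block group's in-memory free space either from the free
 * space tree (when the FREE_SPACE_TREE compat_ro bit is set) or by walking
 * the extent tree with load_extent_tree_free() above.
 */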
static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(extent_root, block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(extent_root, block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(fs_info->extent_root, block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}
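/*
 * cache_block_group() below moves a block group through its caching
 * states: from BTRFS_CACHE_NO to BTRFS_CACHE_FAST while it tries to load
 * the on-disk space cache, then either straight to BTRFS_CACHE_FINISHED
 * (fast load succeeded), back to BTRFS_CACHE_NO for a load_cache_only
 * request that could not be satisfied, or to BTRFS_CACHE_STARTED once the
 * caching_thread() above has been queued, which in turn ends in
 * BTRFS_CACHE_FINISHED or BTRFS_CACHE_ERROR.
 */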
static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
		    btrfs_should_fragment_free_space(fs_info->extent_root,
						     cache)) {
			u64 bytes_used;

			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			bytes_used = cache->key.offset -
				btrfs_block_group_used(&cache->item);
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			fragment_free_space(fs_info->extent_root, cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);

		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wakeup any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->commit_root_sem);
	atomic_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}
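/*
 * As with btrfs_search_slot(), the helper above returns 0 when an extent
 * item with the exact (start, BTRFS_EXTENT_ITEM_KEY, len) key exists, a
 * positive value when it does not, and a negative errno on error.
 */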
/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags will be once all of
 * the delayed refs are processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
		offset = root->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == root->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			atomic_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and
			 * try again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually the full back ref is generic, and
 * can be used in all cases the implicit back ref is used.  The major
 * shortcoming of the full back ref is its overhead.  Every time a tree
 * block gets COWed, we have to update the back ref entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */
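/*
 * For example (illustrative values), a data extent at bytenr X referenced
 * by root 5, inode 257, file offset 0 gets an implicit back ref keyed as
 *
 *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while the same extent referenced through a leaf at bytenr P that is no
 * longer referenced by its owner tree gets a full back ref keyed as
 *
 *     (X, BTRFS_SHARED_DATA_REF_KEY, P)
 */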
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(root, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}
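/*
 * hash_extent_data_ref() above combines two CRC32C values: one over the
 * root objectid, placed in the upper bits by the << 31 shift, and one over
 * the inode objectid and file offset.  Distinct (root, inode, offset)
 * triples can still collide, which is why the lookup and insert helpers
 * below always confirm a hit with match_extent_data_ref().
 */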
static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}
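/*
 * insert_extent_data_ref() below handles hash collisions by bumping
 * key.offset and retrying the insert until it either lands on the item
 * that matches (root, inode, offset), whose count is then increased, or
 * creates a new ref item at an unused offset.
 */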
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(root, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}
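/*
 * Note that setup_inline_extent_backref() grows the extent item in place:
 * btrfs_extend_item() makes room, memmove_extent_buffer() shifts any inline
 * refs that sort after the new one, and the new ref is then written into
 * the gap, keeping the inline refs in the same order as their item
 * counterparts.
 */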
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(root, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(root, path, iref,
					     refs_to_add, extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(root, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}
root_objectid); 1934 } else { 1935 ret = insert_extent_data_ref(trans, root, path, bytenr, 1936 parent, root_objectid, 1937 owner, offset, refs_to_add); 1938 } 1939 return ret; 1940 } 1941 1942 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1943 struct btrfs_root *root, 1944 struct btrfs_path *path, 1945 struct btrfs_extent_inline_ref *iref, 1946 int refs_to_drop, int is_data, int *last_ref) 1947 { 1948 int ret = 0; 1949 1950 BUG_ON(!is_data && refs_to_drop != 1); 1951 if (iref) { 1952 update_inline_extent_backref(root, path, iref, 1953 -refs_to_drop, NULL, last_ref); 1954 } else if (is_data) { 1955 ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1956 last_ref); 1957 } else { 1958 *last_ref = 1; 1959 ret = btrfs_del_item(trans, root, path); 1960 } 1961 return ret; 1962 } 1963 1964 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1965 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1966 u64 *discarded_bytes) 1967 { 1968 int j, ret = 0; 1969 u64 bytes_left, end; 1970 u64 aligned_start = ALIGN(start, 1 << 9); 1971 1972 if (WARN_ON(start != aligned_start)) { 1973 len -= aligned_start - start; 1974 len = round_down(len, 1 << 9); 1975 start = aligned_start; 1976 } 1977 1978 *discarded_bytes = 0; 1979 1980 if (!len) 1981 return 0; 1982 1983 end = start + len; 1984 bytes_left = len; 1985 1986 /* Skip any superblocks on this device. */ 1987 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1988 u64 sb_start = btrfs_sb_offset(j); 1989 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1990 u64 size = sb_start - start; 1991 1992 if (!in_range(sb_start, start, bytes_left) && 1993 !in_range(sb_end, start, bytes_left) && 1994 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1995 continue; 1996 1997 /* 1998 * Superblock spans beginning of range. Adjust start and 1999 * try again. 2000 */ 2001 if (sb_start <= start) { 2002 start += sb_end - start; 2003 if (start > end) { 2004 bytes_left = 0; 2005 break; 2006 } 2007 bytes_left = end - start; 2008 continue; 2009 } 2010 2011 if (size) { 2012 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 2013 GFP_NOFS, 0); 2014 if (!ret) 2015 *discarded_bytes += size; 2016 else if (ret != -EOPNOTSUPP) 2017 return ret; 2018 } 2019 2020 start = sb_end; 2021 if (start > end) { 2022 bytes_left = 0; 2023 break; 2024 } 2025 bytes_left = end - start; 2026 } 2027 2028 if (bytes_left) { 2029 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 2030 GFP_NOFS, 0); 2031 if (!ret) 2032 *discarded_bytes += bytes_left; 2033 } 2034 return ret; 2035 } 2036 2037 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 2038 u64 num_bytes, u64 *actual_bytes) 2039 { 2040 int ret; 2041 u64 discarded_bytes = 0; 2042 struct btrfs_bio *bbio = NULL; 2043 2044 2045 /* 2046 * Avoid races with device replace and make sure our bbio has devices 2047 * associated to its stripes that don't go away while we are discarding. 
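	 * The bio counter taken here by btrfs_bio_counter_inc_blocked() is
	 * dropped again by the btrfs_bio_counter_dec() call below once the
	 * stripes have been walked; device replace is expected to wait on
	 * that counter before retiring the source device, which is what
	 * keeps the stripe devices valid while we discard.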
2048 */ 2049 btrfs_bio_counter_inc_blocked(root->fs_info); 2050 /* Tell the block device(s) that the sectors can be discarded */ 2051 ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 2052 bytenr, &num_bytes, &bbio, 0); 2053 /* Error condition is -ENOMEM */ 2054 if (!ret) { 2055 struct btrfs_bio_stripe *stripe = bbio->stripes; 2056 int i; 2057 2058 2059 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2060 u64 bytes; 2061 if (!stripe->dev->can_discard) 2062 continue; 2063 2064 ret = btrfs_issue_discard(stripe->dev->bdev, 2065 stripe->physical, 2066 stripe->length, 2067 &bytes); 2068 if (!ret) 2069 discarded_bytes += bytes; 2070 else if (ret != -EOPNOTSUPP) 2071 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 2072 2073 /* 2074 * Just in case we get back EOPNOTSUPP for some reason, 2075 * just ignore the return value so we don't screw up 2076 * people calling discard_extent. 2077 */ 2078 ret = 0; 2079 } 2080 btrfs_put_bbio(bbio); 2081 } 2082 btrfs_bio_counter_dec(root->fs_info); 2083 2084 if (actual_bytes) 2085 *actual_bytes = discarded_bytes; 2086 2087 2088 if (ret == -EOPNOTSUPP) 2089 ret = 0; 2090 return ret; 2091 } 2092 2093 /* Can return -ENOMEM */ 2094 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2095 struct btrfs_root *root, 2096 u64 bytenr, u64 num_bytes, u64 parent, 2097 u64 root_objectid, u64 owner, u64 offset) 2098 { 2099 int ret; 2100 struct btrfs_fs_info *fs_info = root->fs_info; 2101 2102 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2103 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2104 2105 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2106 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2107 num_bytes, 2108 parent, root_objectid, (int)owner, 2109 BTRFS_ADD_DELAYED_REF, NULL); 2110 } else { 2111 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 2112 num_bytes, parent, root_objectid, 2113 owner, offset, 0, 2114 BTRFS_ADD_DELAYED_REF, NULL); 2115 } 2116 return ret; 2117 } 2118 2119 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2120 struct btrfs_root *root, 2121 struct btrfs_delayed_ref_node *node, 2122 u64 parent, u64 root_objectid, 2123 u64 owner, u64 offset, int refs_to_add, 2124 struct btrfs_delayed_extent_op *extent_op) 2125 { 2126 struct btrfs_fs_info *fs_info = root->fs_info; 2127 struct btrfs_path *path; 2128 struct extent_buffer *leaf; 2129 struct btrfs_extent_item *item; 2130 struct btrfs_key key; 2131 u64 bytenr = node->bytenr; 2132 u64 num_bytes = node->num_bytes; 2133 u64 refs; 2134 int ret; 2135 2136 path = btrfs_alloc_path(); 2137 if (!path) 2138 return -ENOMEM; 2139 2140 path->reada = READA_FORWARD; 2141 path->leave_spinning = 1; 2142 /* this will setup the path even if it fails to insert the back ref */ 2143 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, 2144 bytenr, num_bytes, parent, 2145 root_objectid, owner, offset, 2146 refs_to_add, extent_op); 2147 if ((ret < 0 && ret != -EAGAIN) || !ret) 2148 goto out; 2149 2150 /* 2151 * Ok we had -EAGAIN which means we didn't have space to insert and 2152 * inline extent ref, so just update the reference count and add a 2153 * normal backref. 
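	 * A "normal" backref is a separate keyed item next to the extent
	 * item rather than an inline ref; for data that is roughly a
	 * BTRFS_EXTENT_DATA_REF_KEY item keyed on the extent bytenr, which
	 * the insert_extent_backref() call further down creates.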
2154 */ 2155 leaf = path->nodes[0]; 2156 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2157 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2158 refs = btrfs_extent_refs(leaf, item); 2159 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2160 if (extent_op) 2161 __run_delayed_extent_op(extent_op, leaf, item); 2162 2163 btrfs_mark_buffer_dirty(leaf); 2164 btrfs_release_path(path); 2165 2166 path->reada = READA_FORWARD; 2167 path->leave_spinning = 1; 2168 /* now insert the actual backref */ 2169 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2170 path, bytenr, parent, root_objectid, 2171 owner, offset, refs_to_add); 2172 if (ret) 2173 btrfs_abort_transaction(trans, root, ret); 2174 out: 2175 btrfs_free_path(path); 2176 return ret; 2177 } 2178 2179 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2180 struct btrfs_root *root, 2181 struct btrfs_delayed_ref_node *node, 2182 struct btrfs_delayed_extent_op *extent_op, 2183 int insert_reserved) 2184 { 2185 int ret = 0; 2186 struct btrfs_delayed_data_ref *ref; 2187 struct btrfs_key ins; 2188 u64 parent = 0; 2189 u64 ref_root = 0; 2190 u64 flags = 0; 2191 2192 ins.objectid = node->bytenr; 2193 ins.offset = node->num_bytes; 2194 ins.type = BTRFS_EXTENT_ITEM_KEY; 2195 2196 ref = btrfs_delayed_node_to_data_ref(node); 2197 trace_run_delayed_data_ref(node, ref, node->action); 2198 2199 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2200 parent = ref->parent; 2201 ref_root = ref->root; 2202 2203 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2204 if (extent_op) 2205 flags |= extent_op->flags_to_set; 2206 ret = alloc_reserved_file_extent(trans, root, 2207 parent, ref_root, flags, 2208 ref->objectid, ref->offset, 2209 &ins, node->ref_mod); 2210 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2211 ret = __btrfs_inc_extent_ref(trans, root, node, parent, 2212 ref_root, ref->objectid, 2213 ref->offset, node->ref_mod, 2214 extent_op); 2215 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2216 ret = __btrfs_free_extent(trans, root, node, parent, 2217 ref_root, ref->objectid, 2218 ref->offset, node->ref_mod, 2219 extent_op); 2220 } else { 2221 BUG(); 2222 } 2223 return ret; 2224 } 2225 2226 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2227 struct extent_buffer *leaf, 2228 struct btrfs_extent_item *ei) 2229 { 2230 u64 flags = btrfs_extent_flags(leaf, ei); 2231 if (extent_op->update_flags) { 2232 flags |= extent_op->flags_to_set; 2233 btrfs_set_extent_flags(leaf, ei, flags); 2234 } 2235 2236 if (extent_op->update_key) { 2237 struct btrfs_tree_block_info *bi; 2238 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2239 bi = (struct btrfs_tree_block_info *)(ei + 1); 2240 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2241 } 2242 } 2243 2244 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2245 struct btrfs_root *root, 2246 struct btrfs_delayed_ref_node *node, 2247 struct btrfs_delayed_extent_op *extent_op) 2248 { 2249 struct btrfs_key key; 2250 struct btrfs_path *path; 2251 struct btrfs_extent_item *ei; 2252 struct extent_buffer *leaf; 2253 u32 item_size; 2254 int ret; 2255 int err = 0; 2256 int metadata = !extent_op->is_data; 2257 2258 if (trans->aborted) 2259 return 0; 2260 2261 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2262 metadata = 0; 2263 2264 path = btrfs_alloc_path(); 2265 if (!path) 2266 return -ENOMEM; 2267 2268 key.objectid = node->bytenr; 2269 2270 if (metadata) { 2271 key.type = 
BTRFS_METADATA_ITEM_KEY; 2272 key.offset = extent_op->level; 2273 } else { 2274 key.type = BTRFS_EXTENT_ITEM_KEY; 2275 key.offset = node->num_bytes; 2276 } 2277 2278 again: 2279 path->reada = READA_FORWARD; 2280 path->leave_spinning = 1; 2281 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2282 path, 0, 1); 2283 if (ret < 0) { 2284 err = ret; 2285 goto out; 2286 } 2287 if (ret > 0) { 2288 if (metadata) { 2289 if (path->slots[0] > 0) { 2290 path->slots[0]--; 2291 btrfs_item_key_to_cpu(path->nodes[0], &key, 2292 path->slots[0]); 2293 if (key.objectid == node->bytenr && 2294 key.type == BTRFS_EXTENT_ITEM_KEY && 2295 key.offset == node->num_bytes) 2296 ret = 0; 2297 } 2298 if (ret > 0) { 2299 btrfs_release_path(path); 2300 metadata = 0; 2301 2302 key.objectid = node->bytenr; 2303 key.offset = node->num_bytes; 2304 key.type = BTRFS_EXTENT_ITEM_KEY; 2305 goto again; 2306 } 2307 } else { 2308 err = -EIO; 2309 goto out; 2310 } 2311 } 2312 2313 leaf = path->nodes[0]; 2314 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2315 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2316 if (item_size < sizeof(*ei)) { 2317 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2318 path, (u64)-1, 0); 2319 if (ret < 0) { 2320 err = ret; 2321 goto out; 2322 } 2323 leaf = path->nodes[0]; 2324 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2325 } 2326 #endif 2327 BUG_ON(item_size < sizeof(*ei)); 2328 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2329 __run_delayed_extent_op(extent_op, leaf, ei); 2330 2331 btrfs_mark_buffer_dirty(leaf); 2332 out: 2333 btrfs_free_path(path); 2334 return err; 2335 } 2336 2337 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2338 struct btrfs_root *root, 2339 struct btrfs_delayed_ref_node *node, 2340 struct btrfs_delayed_extent_op *extent_op, 2341 int insert_reserved) 2342 { 2343 int ret = 0; 2344 struct btrfs_delayed_tree_ref *ref; 2345 struct btrfs_key ins; 2346 u64 parent = 0; 2347 u64 ref_root = 0; 2348 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2349 SKINNY_METADATA); 2350 2351 ref = btrfs_delayed_node_to_tree_ref(node); 2352 trace_run_delayed_tree_ref(node, ref, node->action); 2353 2354 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2355 parent = ref->parent; 2356 ref_root = ref->root; 2357 2358 ins.objectid = node->bytenr; 2359 if (skinny_metadata) { 2360 ins.offset = ref->level; 2361 ins.type = BTRFS_METADATA_ITEM_KEY; 2362 } else { 2363 ins.offset = node->num_bytes; 2364 ins.type = BTRFS_EXTENT_ITEM_KEY; 2365 } 2366 2367 BUG_ON(node->ref_mod != 1); 2368 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2369 BUG_ON(!extent_op || !extent_op->update_flags); 2370 ret = alloc_reserved_tree_block(trans, root, 2371 parent, ref_root, 2372 extent_op->flags_to_set, 2373 &extent_op->key, 2374 ref->level, &ins); 2375 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2376 ret = __btrfs_inc_extent_ref(trans, root, node, 2377 parent, ref_root, 2378 ref->level, 0, 1, 2379 extent_op); 2380 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2381 ret = __btrfs_free_extent(trans, root, node, 2382 parent, ref_root, 2383 ref->level, 0, 1, extent_op); 2384 } else { 2385 BUG(); 2386 } 2387 return ret; 2388 } 2389 2390 /* helper function to actually process a single delayed ref entry */ 2391 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2392 struct btrfs_root *root, 2393 struct btrfs_delayed_ref_node *node, 2394 struct btrfs_delayed_extent_op *extent_op, 2395 int insert_reserved) 2396 { 
	int ret = 0;

	if (trans->aborted) {
		if (insert_reserved)
			btrfs_pin_extent(root, node->bytenr,
					 node->num_bytes, 1);
		return 0;
	}

	if (btrfs_delayed_ref_is_head(node)) {
		struct btrfs_delayed_ref_head *head;
		/*
		 * we've hit the end of the chain and we were supposed
		 * to insert this extent into the tree. But, it got
		 * deleted before we ever needed to insert it, so all
		 * we have to do is clean up the accounting
		 */
		BUG_ON(extent_op);
		head = btrfs_delayed_node_to_head(node);
		trace_run_delayed_ref_head(node, head, node->action);

		if (insert_reserved) {
			btrfs_pin_extent(root, node->bytenr,
					 node->num_bytes, 1);
			if (head->is_data) {
				ret = btrfs_del_csums(trans, root,
						      node->bytenr,
						      node->num_bytes);
			}
		}

		/* Also free its reserved qgroup space */
		btrfs_qgroup_free_delayed_ref(root->fs_info,
					      head->qgroup_ref_root,
					      head->qgroup_reserved);
		return ret;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, root, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, root, node, extent_op,
					   insert_reserved);
	else
		BUG();
	return ret;
}

static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;

	if (list_empty(&head->ref_list))
		return NULL;

	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
	list_for_each_entry(ref, &head->ref_list, list) {
		if (ref->action == BTRFS_ADD_DELAYED_REF)
			return ref;
	}

	return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
			  list);
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
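 *
 * 'nr' is a count of delayed ref entries (heads included) to process;
 * the loop below stops once that many have been handled or there are no
 * more ref heads to pick, whichever comes first.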
2474 */ 2475 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2476 struct btrfs_root *root, 2477 unsigned long nr) 2478 { 2479 struct btrfs_delayed_ref_root *delayed_refs; 2480 struct btrfs_delayed_ref_node *ref; 2481 struct btrfs_delayed_ref_head *locked_ref = NULL; 2482 struct btrfs_delayed_extent_op *extent_op; 2483 struct btrfs_fs_info *fs_info = root->fs_info; 2484 ktime_t start = ktime_get(); 2485 int ret; 2486 unsigned long count = 0; 2487 unsigned long actual_count = 0; 2488 int must_insert_reserved = 0; 2489 2490 delayed_refs = &trans->transaction->delayed_refs; 2491 while (1) { 2492 if (!locked_ref) { 2493 if (count >= nr) 2494 break; 2495 2496 spin_lock(&delayed_refs->lock); 2497 locked_ref = btrfs_select_ref_head(trans); 2498 if (!locked_ref) { 2499 spin_unlock(&delayed_refs->lock); 2500 break; 2501 } 2502 2503 /* grab the lock that says we are going to process 2504 * all the refs for this head */ 2505 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2506 spin_unlock(&delayed_refs->lock); 2507 /* 2508 * we may have dropped the spin lock to get the head 2509 * mutex lock, and that might have given someone else 2510 * time to free the head. If that's true, it has been 2511 * removed from our list and we can move on. 2512 */ 2513 if (ret == -EAGAIN) { 2514 locked_ref = NULL; 2515 count++; 2516 continue; 2517 } 2518 } 2519 2520 /* 2521 * We need to try and merge add/drops of the same ref since we 2522 * can run into issues with relocate dropping the implicit ref 2523 * and then it being added back again before the drop can 2524 * finish. If we merged anything we need to re-loop so we can 2525 * get a good ref. 2526 * Or we can get node references of the same type that weren't 2527 * merged when created due to bumps in the tree mod seq, and 2528 * we need to merge them to prevent adding an inline extent 2529 * backref before dropping it (triggering a BUG_ON at 2530 * insert_inline_extent_backref()). 2531 */ 2532 spin_lock(&locked_ref->lock); 2533 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2534 locked_ref); 2535 2536 /* 2537 * locked_ref is the head node, so we have to go one 2538 * node back for any delayed ref updates 2539 */ 2540 ref = select_delayed_ref(locked_ref); 2541 2542 if (ref && ref->seq && 2543 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2544 spin_unlock(&locked_ref->lock); 2545 btrfs_delayed_ref_unlock(locked_ref); 2546 spin_lock(&delayed_refs->lock); 2547 locked_ref->processing = 0; 2548 delayed_refs->num_heads_ready++; 2549 spin_unlock(&delayed_refs->lock); 2550 locked_ref = NULL; 2551 cond_resched(); 2552 count++; 2553 continue; 2554 } 2555 2556 /* 2557 * record the must insert reserved flag before we 2558 * drop the spin lock. 
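		 * Clearing it on the head while still under locked_ref->lock
		 * means the reserved extent gets handled exactly once: either
		 * by run_one_delayed_ref() below, or by putting the flag back
		 * on error so the transaction abort path can clean up the
		 * reservation.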
2559 */ 2560 must_insert_reserved = locked_ref->must_insert_reserved; 2561 locked_ref->must_insert_reserved = 0; 2562 2563 extent_op = locked_ref->extent_op; 2564 locked_ref->extent_op = NULL; 2565 2566 if (!ref) { 2567 2568 2569 /* All delayed refs have been processed, Go ahead 2570 * and send the head node to run_one_delayed_ref, 2571 * so that any accounting fixes can happen 2572 */ 2573 ref = &locked_ref->node; 2574 2575 if (extent_op && must_insert_reserved) { 2576 btrfs_free_delayed_extent_op(extent_op); 2577 extent_op = NULL; 2578 } 2579 2580 if (extent_op) { 2581 spin_unlock(&locked_ref->lock); 2582 ret = run_delayed_extent_op(trans, root, 2583 ref, extent_op); 2584 btrfs_free_delayed_extent_op(extent_op); 2585 2586 if (ret) { 2587 /* 2588 * Need to reset must_insert_reserved if 2589 * there was an error so the abort stuff 2590 * can cleanup the reserved space 2591 * properly. 2592 */ 2593 if (must_insert_reserved) 2594 locked_ref->must_insert_reserved = 1; 2595 locked_ref->processing = 0; 2596 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2597 btrfs_delayed_ref_unlock(locked_ref); 2598 return ret; 2599 } 2600 continue; 2601 } 2602 2603 /* 2604 * Need to drop our head ref lock and re-acquire the 2605 * delayed ref lock and then re-check to make sure 2606 * nobody got added. 2607 */ 2608 spin_unlock(&locked_ref->lock); 2609 spin_lock(&delayed_refs->lock); 2610 spin_lock(&locked_ref->lock); 2611 if (!list_empty(&locked_ref->ref_list) || 2612 locked_ref->extent_op) { 2613 spin_unlock(&locked_ref->lock); 2614 spin_unlock(&delayed_refs->lock); 2615 continue; 2616 } 2617 ref->in_tree = 0; 2618 delayed_refs->num_heads--; 2619 rb_erase(&locked_ref->href_node, 2620 &delayed_refs->href_root); 2621 spin_unlock(&delayed_refs->lock); 2622 } else { 2623 actual_count++; 2624 ref->in_tree = 0; 2625 list_del(&ref->list); 2626 } 2627 atomic_dec(&delayed_refs->num_entries); 2628 2629 if (!btrfs_delayed_ref_is_head(ref)) { 2630 /* 2631 * when we play the delayed ref, also correct the 2632 * ref_mod on head 2633 */ 2634 switch (ref->action) { 2635 case BTRFS_ADD_DELAYED_REF: 2636 case BTRFS_ADD_DELAYED_EXTENT: 2637 locked_ref->node.ref_mod -= ref->ref_mod; 2638 break; 2639 case BTRFS_DROP_DELAYED_REF: 2640 locked_ref->node.ref_mod += ref->ref_mod; 2641 break; 2642 default: 2643 WARN_ON(1); 2644 } 2645 } 2646 spin_unlock(&locked_ref->lock); 2647 2648 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2649 must_insert_reserved); 2650 2651 btrfs_free_delayed_extent_op(extent_op); 2652 if (ret) { 2653 locked_ref->processing = 0; 2654 btrfs_delayed_ref_unlock(locked_ref); 2655 btrfs_put_delayed_ref(ref); 2656 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2657 return ret; 2658 } 2659 2660 /* 2661 * If this node is a head, that means all the refs in this head 2662 * have been dealt with, and we will pick the next head to deal 2663 * with, so we must unlock the head and drop it from the cluster 2664 * list before we release it. 
2665 */ 2666 if (btrfs_delayed_ref_is_head(ref)) { 2667 if (locked_ref->is_data && 2668 locked_ref->total_ref_mod < 0) { 2669 spin_lock(&delayed_refs->lock); 2670 delayed_refs->pending_csums -= ref->num_bytes; 2671 spin_unlock(&delayed_refs->lock); 2672 } 2673 btrfs_delayed_ref_unlock(locked_ref); 2674 locked_ref = NULL; 2675 } 2676 btrfs_put_delayed_ref(ref); 2677 count++; 2678 cond_resched(); 2679 } 2680 2681 /* 2682 * We don't want to include ref heads since we can have empty ref heads 2683 * and those will drastically skew our runtime down since we just do 2684 * accounting, no actual extent tree updates. 2685 */ 2686 if (actual_count > 0) { 2687 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2688 u64 avg; 2689 2690 /* 2691 * We weigh the current average higher than our current runtime 2692 * to avoid large swings in the average. 2693 */ 2694 spin_lock(&delayed_refs->lock); 2695 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2696 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2697 spin_unlock(&delayed_refs->lock); 2698 } 2699 return 0; 2700 } 2701 2702 #ifdef SCRAMBLE_DELAYED_REFS 2703 /* 2704 * Normally delayed refs get processed in ascending bytenr order. This 2705 * correlates in most cases to the order added. To expose dependencies on this 2706 * order, we start to process the tree in the middle instead of the beginning 2707 */ 2708 static u64 find_middle(struct rb_root *root) 2709 { 2710 struct rb_node *n = root->rb_node; 2711 struct btrfs_delayed_ref_node *entry; 2712 int alt = 1; 2713 u64 middle; 2714 u64 first = 0, last = 0; 2715 2716 n = rb_first(root); 2717 if (n) { 2718 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2719 first = entry->bytenr; 2720 } 2721 n = rb_last(root); 2722 if (n) { 2723 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2724 last = entry->bytenr; 2725 } 2726 n = root->rb_node; 2727 2728 while (n) { 2729 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2730 WARN_ON(!entry->in_tree); 2731 2732 middle = entry->bytenr; 2733 2734 if (alt) 2735 n = n->rb_left; 2736 else 2737 n = n->rb_right; 2738 2739 alt = 1 - alt; 2740 } 2741 return middle; 2742 } 2743 #endif 2744 2745 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2746 { 2747 u64 num_bytes; 2748 2749 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2750 sizeof(struct btrfs_extent_inline_ref)); 2751 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2752 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2753 2754 /* 2755 * We don't ever fill up leaves all the way so multiply by 2 just to be 2756 * closer to what we're really going to want to use. 2757 */ 2758 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2759 } 2760 2761 /* 2762 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2763 * would require to store the csums for that many bytes. 
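 *
 * A rough, illustrative calculation (assuming the common 4K sectorsize,
 * 16K nodesize and 4-byte crc32c checksums): one leaf holds about
 * (16K - headers) / 4 ~= 4000 csums, and 1GiB of data needs
 * 1GiB / 4K = 262144 csums, so on the order of 65 leaves.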
2764 */ 2765 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) 2766 { 2767 u64 csum_size; 2768 u64 num_csums_per_leaf; 2769 u64 num_csums; 2770 2771 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 2772 num_csums_per_leaf = div64_u64(csum_size, 2773 (u64)btrfs_super_csum_size(root->fs_info->super_copy)); 2774 num_csums = div64_u64(csum_bytes, root->sectorsize); 2775 num_csums += num_csums_per_leaf - 1; 2776 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2777 return num_csums; 2778 } 2779 2780 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2781 struct btrfs_root *root) 2782 { 2783 struct btrfs_block_rsv *global_rsv; 2784 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2785 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; 2786 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; 2787 u64 num_bytes, num_dirty_bgs_bytes; 2788 int ret = 0; 2789 2790 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2791 num_heads = heads_to_leaves(root, num_heads); 2792 if (num_heads > 1) 2793 num_bytes += (num_heads - 1) * root->nodesize; 2794 num_bytes <<= 1; 2795 num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; 2796 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, 2797 num_dirty_bgs); 2798 global_rsv = &root->fs_info->global_block_rsv; 2799 2800 /* 2801 * If we can't allocate any more chunks lets make sure we have _lots_ of 2802 * wiggle room since running delayed refs can create more delayed refs. 2803 */ 2804 if (global_rsv->space_info->full) { 2805 num_dirty_bgs_bytes <<= 1; 2806 num_bytes <<= 1; 2807 } 2808 2809 spin_lock(&global_rsv->lock); 2810 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) 2811 ret = 1; 2812 spin_unlock(&global_rsv->lock); 2813 return ret; 2814 } 2815 2816 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2817 struct btrfs_root *root) 2818 { 2819 struct btrfs_fs_info *fs_info = root->fs_info; 2820 u64 num_entries = 2821 atomic_read(&trans->transaction->delayed_refs.num_entries); 2822 u64 avg_runtime; 2823 u64 val; 2824 2825 smp_mb(); 2826 avg_runtime = fs_info->avg_delayed_ref_runtime; 2827 val = num_entries * avg_runtime; 2828 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2829 return 1; 2830 if (val >= NSEC_PER_SEC / 2) 2831 return 2; 2832 2833 return btrfs_check_space_for_delayed_refs(trans, root); 2834 } 2835 2836 struct async_delayed_refs { 2837 struct btrfs_root *root; 2838 int count; 2839 int error; 2840 int sync; 2841 struct completion wait; 2842 struct btrfs_work work; 2843 }; 2844 2845 static void delayed_ref_async_start(struct btrfs_work *work) 2846 { 2847 struct async_delayed_refs *async; 2848 struct btrfs_trans_handle *trans; 2849 int ret; 2850 2851 async = container_of(work, struct async_delayed_refs, work); 2852 2853 trans = btrfs_join_transaction(async->root); 2854 if (IS_ERR(trans)) { 2855 async->error = PTR_ERR(trans); 2856 goto done; 2857 } 2858 2859 /* 2860 * trans->sync means that when we call end_transaction, we won't 2861 * wait on delayed refs 2862 */ 2863 trans->sync = true; 2864 ret = btrfs_run_delayed_refs(trans, async->root, async->count); 2865 if (ret) 2866 async->error = ret; 2867 2868 ret = btrfs_end_transaction(trans, async->root); 2869 if (ret && !async->error) 2870 async->error = ret; 2871 done: 2872 if (async->sync) 2873 complete(&async->wait); 2874 else 2875 kfree(async); 2876 } 2877 2878 int btrfs_async_run_delayed_refs(struct btrfs_root *root, 2879 unsigned long 
count, int wait) 2880 { 2881 struct async_delayed_refs *async; 2882 int ret; 2883 2884 async = kmalloc(sizeof(*async), GFP_NOFS); 2885 if (!async) 2886 return -ENOMEM; 2887 2888 async->root = root->fs_info->tree_root; 2889 async->count = count; 2890 async->error = 0; 2891 if (wait) 2892 async->sync = 1; 2893 else 2894 async->sync = 0; 2895 init_completion(&async->wait); 2896 2897 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2898 delayed_ref_async_start, NULL, NULL); 2899 2900 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2901 2902 if (wait) { 2903 wait_for_completion(&async->wait); 2904 ret = async->error; 2905 kfree(async); 2906 return ret; 2907 } 2908 return 0; 2909 } 2910 2911 /* 2912 * this starts processing the delayed reference count updates and 2913 * extent insertions we have queued up so far. count can be 2914 * 0, which means to process everything in the tree at the start 2915 * of the run (but not newly added entries), or it can be some target 2916 * number you'd like to process. 2917 * 2918 * Returns 0 on success or if called with an aborted transaction 2919 * Returns <0 on error and aborts the transaction 2920 */ 2921 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2922 struct btrfs_root *root, unsigned long count) 2923 { 2924 struct rb_node *node; 2925 struct btrfs_delayed_ref_root *delayed_refs; 2926 struct btrfs_delayed_ref_head *head; 2927 int ret; 2928 int run_all = count == (unsigned long)-1; 2929 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 2930 2931 /* We'll clean this up in btrfs_cleanup_transaction */ 2932 if (trans->aborted) 2933 return 0; 2934 2935 if (root->fs_info->creating_free_space_tree) 2936 return 0; 2937 2938 if (root == root->fs_info->extent_root) 2939 root = root->fs_info->tree_root; 2940 2941 delayed_refs = &trans->transaction->delayed_refs; 2942 if (count == 0) 2943 count = atomic_read(&delayed_refs->num_entries) * 2; 2944 2945 again: 2946 #ifdef SCRAMBLE_DELAYED_REFS 2947 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2948 #endif 2949 trans->can_flush_pending_bgs = false; 2950 ret = __btrfs_run_delayed_refs(trans, root, count); 2951 if (ret < 0) { 2952 btrfs_abort_transaction(trans, root, ret); 2953 return ret; 2954 } 2955 2956 if (run_all) { 2957 if (!list_empty(&trans->new_bgs)) 2958 btrfs_create_pending_block_groups(trans, root); 2959 2960 spin_lock(&delayed_refs->lock); 2961 node = rb_first(&delayed_refs->href_root); 2962 if (!node) { 2963 spin_unlock(&delayed_refs->lock); 2964 goto out; 2965 } 2966 count = (unsigned long)-1; 2967 2968 while (node) { 2969 head = rb_entry(node, struct btrfs_delayed_ref_head, 2970 href_node); 2971 if (btrfs_delayed_ref_is_head(&head->node)) { 2972 struct btrfs_delayed_ref_node *ref; 2973 2974 ref = &head->node; 2975 atomic_inc(&ref->refs); 2976 2977 spin_unlock(&delayed_refs->lock); 2978 /* 2979 * Mutex was contended, block until it's 2980 * released and try again 2981 */ 2982 mutex_lock(&head->mutex); 2983 mutex_unlock(&head->mutex); 2984 2985 btrfs_put_delayed_ref(ref); 2986 cond_resched(); 2987 goto again; 2988 } else { 2989 WARN_ON(1); 2990 } 2991 node = rb_next(node); 2992 } 2993 spin_unlock(&delayed_refs->lock); 2994 cond_resched(); 2995 goto again; 2996 } 2997 out: 2998 assert_qgroups_uptodate(trans); 2999 trans->can_flush_pending_bgs = can_flush_pending_bgs; 3000 return 0; 3001 } 3002 3003 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3004 struct btrfs_root *root, 3005 u64 bytenr, u64 num_bytes, u64 flags, 3006 int 
level, int is_data) 3007 { 3008 struct btrfs_delayed_extent_op *extent_op; 3009 int ret; 3010 3011 extent_op = btrfs_alloc_delayed_extent_op(); 3012 if (!extent_op) 3013 return -ENOMEM; 3014 3015 extent_op->flags_to_set = flags; 3016 extent_op->update_flags = true; 3017 extent_op->update_key = false; 3018 extent_op->is_data = is_data ? true : false; 3019 extent_op->level = level; 3020 3021 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 3022 num_bytes, extent_op); 3023 if (ret) 3024 btrfs_free_delayed_extent_op(extent_op); 3025 return ret; 3026 } 3027 3028 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 3029 struct btrfs_root *root, 3030 struct btrfs_path *path, 3031 u64 objectid, u64 offset, u64 bytenr) 3032 { 3033 struct btrfs_delayed_ref_head *head; 3034 struct btrfs_delayed_ref_node *ref; 3035 struct btrfs_delayed_data_ref *data_ref; 3036 struct btrfs_delayed_ref_root *delayed_refs; 3037 int ret = 0; 3038 3039 delayed_refs = &trans->transaction->delayed_refs; 3040 spin_lock(&delayed_refs->lock); 3041 head = btrfs_find_delayed_ref_head(trans, bytenr); 3042 if (!head) { 3043 spin_unlock(&delayed_refs->lock); 3044 return 0; 3045 } 3046 3047 if (!mutex_trylock(&head->mutex)) { 3048 atomic_inc(&head->node.refs); 3049 spin_unlock(&delayed_refs->lock); 3050 3051 btrfs_release_path(path); 3052 3053 /* 3054 * Mutex was contended, block until it's released and let 3055 * caller try again 3056 */ 3057 mutex_lock(&head->mutex); 3058 mutex_unlock(&head->mutex); 3059 btrfs_put_delayed_ref(&head->node); 3060 return -EAGAIN; 3061 } 3062 spin_unlock(&delayed_refs->lock); 3063 3064 spin_lock(&head->lock); 3065 list_for_each_entry(ref, &head->ref_list, list) { 3066 /* If it's a shared ref we know a cross reference exists */ 3067 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3068 ret = 1; 3069 break; 3070 } 3071 3072 data_ref = btrfs_delayed_node_to_data_ref(ref); 3073 3074 /* 3075 * If our ref doesn't match the one we're currently looking at 3076 * then we have a cross reference. 
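		 * "Match" means the same (root, inode objectid, file offset)
		 * triple the caller passed in; any other data ref on this
		 * head belongs to someone else and counts as a cross
		 * reference.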
3077 */ 3078 if (data_ref->root != root->root_key.objectid || 3079 data_ref->objectid != objectid || 3080 data_ref->offset != offset) { 3081 ret = 1; 3082 break; 3083 } 3084 } 3085 spin_unlock(&head->lock); 3086 mutex_unlock(&head->mutex); 3087 return ret; 3088 } 3089 3090 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 3091 struct btrfs_root *root, 3092 struct btrfs_path *path, 3093 u64 objectid, u64 offset, u64 bytenr) 3094 { 3095 struct btrfs_root *extent_root = root->fs_info->extent_root; 3096 struct extent_buffer *leaf; 3097 struct btrfs_extent_data_ref *ref; 3098 struct btrfs_extent_inline_ref *iref; 3099 struct btrfs_extent_item *ei; 3100 struct btrfs_key key; 3101 u32 item_size; 3102 int ret; 3103 3104 key.objectid = bytenr; 3105 key.offset = (u64)-1; 3106 key.type = BTRFS_EXTENT_ITEM_KEY; 3107 3108 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3109 if (ret < 0) 3110 goto out; 3111 BUG_ON(ret == 0); /* Corruption */ 3112 3113 ret = -ENOENT; 3114 if (path->slots[0] == 0) 3115 goto out; 3116 3117 path->slots[0]--; 3118 leaf = path->nodes[0]; 3119 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3120 3121 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3122 goto out; 3123 3124 ret = 1; 3125 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3126 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3127 if (item_size < sizeof(*ei)) { 3128 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3129 goto out; 3130 } 3131 #endif 3132 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3133 3134 if (item_size != sizeof(*ei) + 3135 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3136 goto out; 3137 3138 if (btrfs_extent_generation(leaf, ei) <= 3139 btrfs_root_last_snapshot(&root->root_item)) 3140 goto out; 3141 3142 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3143 if (btrfs_extent_inline_ref_type(leaf, iref) != 3144 BTRFS_EXTENT_DATA_REF_KEY) 3145 goto out; 3146 3147 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3148 if (btrfs_extent_refs(leaf, ei) != 3149 btrfs_extent_data_ref_count(leaf, ref) || 3150 btrfs_extent_data_ref_root(leaf, ref) != 3151 root->root_key.objectid || 3152 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3153 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3154 goto out; 3155 3156 ret = 0; 3157 out: 3158 return ret; 3159 } 3160 3161 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3162 struct btrfs_root *root, 3163 u64 objectid, u64 offset, u64 bytenr) 3164 { 3165 struct btrfs_path *path; 3166 int ret; 3167 int ret2; 3168 3169 path = btrfs_alloc_path(); 3170 if (!path) 3171 return -ENOENT; 3172 3173 do { 3174 ret = check_committed_ref(trans, root, path, objectid, 3175 offset, bytenr); 3176 if (ret && ret != -ENOENT) 3177 goto out; 3178 3179 ret2 = check_delayed_ref(trans, root, path, objectid, 3180 offset, bytenr); 3181 } while (ret2 == -EAGAIN); 3182 3183 if (ret2 && ret2 != -ENOENT) { 3184 ret = ret2; 3185 goto out; 3186 } 3187 3188 if (ret != -ENOENT || ret2 != -ENOENT) 3189 ret = 0; 3190 out: 3191 btrfs_free_path(path); 3192 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3193 WARN_ON(ret > 0); 3194 return ret; 3195 } 3196 3197 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3198 struct btrfs_root *root, 3199 struct extent_buffer *buf, 3200 int full_backref, int inc) 3201 { 3202 u64 bytenr; 3203 u64 num_bytes; 3204 u64 parent; 3205 u64 ref_root; 3206 u32 nritems; 3207 struct btrfs_key key; 3208 struct btrfs_file_extent_item 
*fi; 3209 int i; 3210 int level; 3211 int ret = 0; 3212 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3213 u64, u64, u64, u64, u64, u64); 3214 3215 3216 if (btrfs_test_is_dummy_root(root)) 3217 return 0; 3218 3219 ref_root = btrfs_header_owner(buf); 3220 nritems = btrfs_header_nritems(buf); 3221 level = btrfs_header_level(buf); 3222 3223 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3224 return 0; 3225 3226 if (inc) 3227 process_func = btrfs_inc_extent_ref; 3228 else 3229 process_func = btrfs_free_extent; 3230 3231 if (full_backref) 3232 parent = buf->start; 3233 else 3234 parent = 0; 3235 3236 for (i = 0; i < nritems; i++) { 3237 if (level == 0) { 3238 btrfs_item_key_to_cpu(buf, &key, i); 3239 if (key.type != BTRFS_EXTENT_DATA_KEY) 3240 continue; 3241 fi = btrfs_item_ptr(buf, i, 3242 struct btrfs_file_extent_item); 3243 if (btrfs_file_extent_type(buf, fi) == 3244 BTRFS_FILE_EXTENT_INLINE) 3245 continue; 3246 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3247 if (bytenr == 0) 3248 continue; 3249 3250 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3251 key.offset -= btrfs_file_extent_offset(buf, fi); 3252 ret = process_func(trans, root, bytenr, num_bytes, 3253 parent, ref_root, key.objectid, 3254 key.offset); 3255 if (ret) 3256 goto fail; 3257 } else { 3258 bytenr = btrfs_node_blockptr(buf, i); 3259 num_bytes = root->nodesize; 3260 ret = process_func(trans, root, bytenr, num_bytes, 3261 parent, ref_root, level - 1, 0); 3262 if (ret) 3263 goto fail; 3264 } 3265 } 3266 return 0; 3267 fail: 3268 return ret; 3269 } 3270 3271 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3272 struct extent_buffer *buf, int full_backref) 3273 { 3274 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3275 } 3276 3277 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3278 struct extent_buffer *buf, int full_backref) 3279 { 3280 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3281 } 3282 3283 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3284 struct btrfs_root *root, 3285 struct btrfs_path *path, 3286 struct btrfs_block_group_cache *cache) 3287 { 3288 int ret; 3289 struct btrfs_root *extent_root = root->fs_info->extent_root; 3290 unsigned long bi; 3291 struct extent_buffer *leaf; 3292 3293 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3294 if (ret) { 3295 if (ret > 0) 3296 ret = -ENOENT; 3297 goto fail; 3298 } 3299 3300 leaf = path->nodes[0]; 3301 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3302 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3303 btrfs_mark_buffer_dirty(leaf); 3304 fail: 3305 btrfs_release_path(path); 3306 return ret; 3307 3308 } 3309 3310 static struct btrfs_block_group_cache * 3311 next_block_group(struct btrfs_root *root, 3312 struct btrfs_block_group_cache *cache) 3313 { 3314 struct rb_node *node; 3315 3316 spin_lock(&root->fs_info->block_group_cache_lock); 3317 3318 /* If our block group was removed, we need a full search. 
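	 * Removal clears the rb node (hence the RB_EMPTY_NODE() check), so
	 * rb_next() can't be used; instead look up the first block group
	 * that starts at or after the end of this one.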
*/ 3319 if (RB_EMPTY_NODE(&cache->cache_node)) { 3320 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3321 3322 spin_unlock(&root->fs_info->block_group_cache_lock); 3323 btrfs_put_block_group(cache); 3324 cache = btrfs_lookup_first_block_group(root->fs_info, 3325 next_bytenr); 3326 return cache; 3327 } 3328 node = rb_next(&cache->cache_node); 3329 btrfs_put_block_group(cache); 3330 if (node) { 3331 cache = rb_entry(node, struct btrfs_block_group_cache, 3332 cache_node); 3333 btrfs_get_block_group(cache); 3334 } else 3335 cache = NULL; 3336 spin_unlock(&root->fs_info->block_group_cache_lock); 3337 return cache; 3338 } 3339 3340 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3341 struct btrfs_trans_handle *trans, 3342 struct btrfs_path *path) 3343 { 3344 struct btrfs_root *root = block_group->fs_info->tree_root; 3345 struct inode *inode = NULL; 3346 u64 alloc_hint = 0; 3347 int dcs = BTRFS_DC_ERROR; 3348 u64 num_pages = 0; 3349 int retries = 0; 3350 int ret = 0; 3351 3352 /* 3353 * If this block group is smaller than 100 megs don't bother caching the 3354 * block group. 3355 */ 3356 if (block_group->key.offset < (100 * SZ_1M)) { 3357 spin_lock(&block_group->lock); 3358 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3359 spin_unlock(&block_group->lock); 3360 return 0; 3361 } 3362 3363 if (trans->aborted) 3364 return 0; 3365 again: 3366 inode = lookup_free_space_inode(root, block_group, path); 3367 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3368 ret = PTR_ERR(inode); 3369 btrfs_release_path(path); 3370 goto out; 3371 } 3372 3373 if (IS_ERR(inode)) { 3374 BUG_ON(retries); 3375 retries++; 3376 3377 if (block_group->ro) 3378 goto out_free; 3379 3380 ret = create_free_space_inode(root, trans, block_group, path); 3381 if (ret) 3382 goto out_free; 3383 goto again; 3384 } 3385 3386 /* We've already setup this transaction, go ahead and exit */ 3387 if (block_group->cache_generation == trans->transid && 3388 i_size_read(inode)) { 3389 dcs = BTRFS_DC_SETUP; 3390 goto out_put; 3391 } 3392 3393 /* 3394 * We want to set the generation to 0, that way if anything goes wrong 3395 * from here on out we know not to trust this cache when we load up next 3396 * time. 3397 */ 3398 BTRFS_I(inode)->generation = 0; 3399 ret = btrfs_update_inode(trans, root, inode); 3400 if (ret) { 3401 /* 3402 * So theoretically we could recover from this, simply set the 3403 * super cache generation to 0 so we know to invalidate the 3404 * cache, but then we'd have to keep track of the block groups 3405 * that fail this way so we know we _have_ to reset this cache 3406 * before the next commit or risk reading stale cache. So to 3407 * limit our exposure to horrible edge cases lets just abort the 3408 * transaction, this only happens in really bad situations 3409 * anyway. 3410 */ 3411 btrfs_abort_transaction(trans, root, ret); 3412 goto out_put; 3413 } 3414 WARN_ON(ret); 3415 3416 if (i_size_read(inode) > 0) { 3417 ret = btrfs_check_trunc_cache_free_space(root, 3418 &root->fs_info->global_block_rsv); 3419 if (ret) 3420 goto out_put; 3421 3422 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); 3423 if (ret) 3424 goto out_put; 3425 } 3426 3427 spin_lock(&block_group->lock); 3428 if (block_group->cached != BTRFS_CACHE_FINISHED || 3429 !btrfs_test_opt(root, SPACE_CACHE)) { 3430 /* 3431 * don't bother trying to write stuff out _if_ 3432 * a) we're not cached, 3433 * b) we're with nospace_cache mount option. 
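		 * In both cases mark the group BTRFS_DC_WRITTEN and skip the
		 * space cache preallocation below.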
3434 */ 3435 dcs = BTRFS_DC_WRITTEN; 3436 spin_unlock(&block_group->lock); 3437 goto out_put; 3438 } 3439 spin_unlock(&block_group->lock); 3440 3441 /* 3442 * We hit an ENOSPC when setting up the cache in this transaction, just 3443 * skip doing the setup, we've already cleared the cache so we're safe. 3444 */ 3445 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3446 ret = -ENOSPC; 3447 goto out_put; 3448 } 3449 3450 /* 3451 * Try to preallocate enough space based on how big the block group is. 3452 * Keep in mind this has to include any pinned space which could end up 3453 * taking up quite a bit since it's not folded into the other space 3454 * cache. 3455 */ 3456 num_pages = div_u64(block_group->key.offset, SZ_256M); 3457 if (!num_pages) 3458 num_pages = 1; 3459 3460 num_pages *= 16; 3461 num_pages *= PAGE_SIZE; 3462 3463 ret = btrfs_check_data_free_space(inode, 0, num_pages); 3464 if (ret) 3465 goto out_put; 3466 3467 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3468 num_pages, num_pages, 3469 &alloc_hint); 3470 /* 3471 * Our cache requires contiguous chunks so that we don't modify a bunch 3472 * of metadata or split extents when writing the cache out, which means 3473 * we can enospc if we are heavily fragmented in addition to just normal 3474 * out of space conditions. So if we hit this just skip setting up any 3475 * other block groups for this transaction, maybe we'll unpin enough 3476 * space the next time around. 3477 */ 3478 if (!ret) 3479 dcs = BTRFS_DC_SETUP; 3480 else if (ret == -ENOSPC) 3481 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3482 btrfs_free_reserved_data_space(inode, 0, num_pages); 3483 3484 out_put: 3485 iput(inode); 3486 out_free: 3487 btrfs_release_path(path); 3488 out: 3489 spin_lock(&block_group->lock); 3490 if (!ret && dcs == BTRFS_DC_SETUP) 3491 block_group->cache_generation = trans->transid; 3492 block_group->disk_cache_state = dcs; 3493 spin_unlock(&block_group->lock); 3494 3495 return ret; 3496 } 3497 3498 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3499 struct btrfs_root *root) 3500 { 3501 struct btrfs_block_group_cache *cache, *tmp; 3502 struct btrfs_transaction *cur_trans = trans->transaction; 3503 struct btrfs_path *path; 3504 3505 if (list_empty(&cur_trans->dirty_bgs) || 3506 !btrfs_test_opt(root, SPACE_CACHE)) 3507 return 0; 3508 3509 path = btrfs_alloc_path(); 3510 if (!path) 3511 return -ENOMEM; 3512 3513 /* Could add new block groups, use _safe just in case */ 3514 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3515 dirty_list) { 3516 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3517 cache_save_setup(cache, trans, path); 3518 } 3519 3520 btrfs_free_path(path); 3521 return 0; 3522 } 3523 3524 /* 3525 * transaction commit does final block group cache writeback during a 3526 * critical section where nothing is allowed to change the FS. This is 3527 * required in order for the cache to actually match the block group, 3528 * but can introduce a lot of latency into the commit. 3529 * 3530 * So, btrfs_start_dirty_block_groups is here to kick off block group 3531 * cache IO. There's a chance we'll have to redo some of it if the 3532 * block group changes again during the commit, but it greatly reduces 3533 * the commit latency by getting rid of the easy block groups while 3534 * we're still allowing others to join the commit. 
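 *
 * Anything that gets re-dirtied here, or whose cache write is still in
 * flight, is picked up again by btrfs_write_dirty_block_groups() during
 * the commit critical section.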
3535 */ 3536 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3537 struct btrfs_root *root) 3538 { 3539 struct btrfs_block_group_cache *cache; 3540 struct btrfs_transaction *cur_trans = trans->transaction; 3541 int ret = 0; 3542 int should_put; 3543 struct btrfs_path *path = NULL; 3544 LIST_HEAD(dirty); 3545 struct list_head *io = &cur_trans->io_bgs; 3546 int num_started = 0; 3547 int loops = 0; 3548 3549 spin_lock(&cur_trans->dirty_bgs_lock); 3550 if (list_empty(&cur_trans->dirty_bgs)) { 3551 spin_unlock(&cur_trans->dirty_bgs_lock); 3552 return 0; 3553 } 3554 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3555 spin_unlock(&cur_trans->dirty_bgs_lock); 3556 3557 again: 3558 /* 3559 * make sure all the block groups on our dirty list actually 3560 * exist 3561 */ 3562 btrfs_create_pending_block_groups(trans, root); 3563 3564 if (!path) { 3565 path = btrfs_alloc_path(); 3566 if (!path) 3567 return -ENOMEM; 3568 } 3569 3570 /* 3571 * cache_write_mutex is here only to save us from balance or automatic 3572 * removal of empty block groups deleting this block group while we are 3573 * writing out the cache 3574 */ 3575 mutex_lock(&trans->transaction->cache_write_mutex); 3576 while (!list_empty(&dirty)) { 3577 cache = list_first_entry(&dirty, 3578 struct btrfs_block_group_cache, 3579 dirty_list); 3580 /* 3581 * this can happen if something re-dirties a block 3582 * group that is already under IO. Just wait for it to 3583 * finish and then do it all again 3584 */ 3585 if (!list_empty(&cache->io_list)) { 3586 list_del_init(&cache->io_list); 3587 btrfs_wait_cache_io(root, trans, cache, 3588 &cache->io_ctl, path, 3589 cache->key.objectid); 3590 btrfs_put_block_group(cache); 3591 } 3592 3593 3594 /* 3595 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3596 * if it should update the cache_state. Don't delete 3597 * until after we wait. 3598 * 3599 * Since we're not running in the commit critical section 3600 * we need the dirty_bgs_lock to protect from update_block_group 3601 */ 3602 spin_lock(&cur_trans->dirty_bgs_lock); 3603 list_del_init(&cache->dirty_list); 3604 spin_unlock(&cur_trans->dirty_bgs_lock); 3605 3606 should_put = 1; 3607 3608 cache_save_setup(cache, trans, path); 3609 3610 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3611 cache->io_ctl.inode = NULL; 3612 ret = btrfs_write_out_cache(root, trans, cache, path); 3613 if (ret == 0 && cache->io_ctl.inode) { 3614 num_started++; 3615 should_put = 0; 3616 3617 /* 3618 * the cache_write_mutex is protecting 3619 * the io_list 3620 */ 3621 list_add_tail(&cache->io_list, io); 3622 } else { 3623 /* 3624 * if we failed to write the cache, the 3625 * generation will be bad and life goes on 3626 */ 3627 ret = 0; 3628 } 3629 } 3630 if (!ret) { 3631 ret = write_one_cache_group(trans, root, path, cache); 3632 /* 3633 * Our block group might still be attached to the list 3634 * of new block groups in the transaction handle of some 3635 * other task (struct btrfs_trans_handle->new_bgs). This 3636 * means its block group item isn't yet in the extent 3637 * tree. If this happens ignore the error, as we will 3638 * try again later in the critical section of the 3639 * transaction commit. 
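			 * (That is what the re-queue onto
			 * cur_trans->dirty_bgs just below is for: the
			 * commit-time writeback will see this group again.)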
3640 */ 3641 if (ret == -ENOENT) { 3642 ret = 0; 3643 spin_lock(&cur_trans->dirty_bgs_lock); 3644 if (list_empty(&cache->dirty_list)) { 3645 list_add_tail(&cache->dirty_list, 3646 &cur_trans->dirty_bgs); 3647 btrfs_get_block_group(cache); 3648 } 3649 spin_unlock(&cur_trans->dirty_bgs_lock); 3650 } else if (ret) { 3651 btrfs_abort_transaction(trans, root, ret); 3652 } 3653 } 3654 3655 /* if its not on the io list, we need to put the block group */ 3656 if (should_put) 3657 btrfs_put_block_group(cache); 3658 3659 if (ret) 3660 break; 3661 3662 /* 3663 * Avoid blocking other tasks for too long. It might even save 3664 * us from writing caches for block groups that are going to be 3665 * removed. 3666 */ 3667 mutex_unlock(&trans->transaction->cache_write_mutex); 3668 mutex_lock(&trans->transaction->cache_write_mutex); 3669 } 3670 mutex_unlock(&trans->transaction->cache_write_mutex); 3671 3672 /* 3673 * go through delayed refs for all the stuff we've just kicked off 3674 * and then loop back (just once) 3675 */ 3676 ret = btrfs_run_delayed_refs(trans, root, 0); 3677 if (!ret && loops == 0) { 3678 loops++; 3679 spin_lock(&cur_trans->dirty_bgs_lock); 3680 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3681 /* 3682 * dirty_bgs_lock protects us from concurrent block group 3683 * deletes too (not just cache_write_mutex). 3684 */ 3685 if (!list_empty(&dirty)) { 3686 spin_unlock(&cur_trans->dirty_bgs_lock); 3687 goto again; 3688 } 3689 spin_unlock(&cur_trans->dirty_bgs_lock); 3690 } 3691 3692 btrfs_free_path(path); 3693 return ret; 3694 } 3695 3696 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3697 struct btrfs_root *root) 3698 { 3699 struct btrfs_block_group_cache *cache; 3700 struct btrfs_transaction *cur_trans = trans->transaction; 3701 int ret = 0; 3702 int should_put; 3703 struct btrfs_path *path; 3704 struct list_head *io = &cur_trans->io_bgs; 3705 int num_started = 0; 3706 3707 path = btrfs_alloc_path(); 3708 if (!path) 3709 return -ENOMEM; 3710 3711 /* 3712 * Even though we are in the critical section of the transaction commit, 3713 * we can still have concurrent tasks adding elements to this 3714 * transaction's list of dirty block groups. These tasks correspond to 3715 * endio free space workers started when writeback finishes for a 3716 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3717 * allocate new block groups as a result of COWing nodes of the root 3718 * tree when updating the free space inode. The writeback for the space 3719 * caches is triggered by an earlier call to 3720 * btrfs_start_dirty_block_groups() and iterations of the following 3721 * loop. 3722 * Also we want to do the cache_save_setup first and then run the 3723 * delayed refs to make sure we have the best chance at doing this all 3724 * in one shot. 3725 */ 3726 spin_lock(&cur_trans->dirty_bgs_lock); 3727 while (!list_empty(&cur_trans->dirty_bgs)) { 3728 cache = list_first_entry(&cur_trans->dirty_bgs, 3729 struct btrfs_block_group_cache, 3730 dirty_list); 3731 3732 /* 3733 * this can happen if cache_save_setup re-dirties a block 3734 * group that is already under IO. 
Just wait for it to
		 * finish and then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(root, trans, cache,
					    &cache->io_ctl, path,
					    cache->key.objectid);
			btrfs_put_block_group(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		/*
		 * don't remove from the dirty list until after we've waited
		 * on any pending IO
		 */
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (!ret)
			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(root, trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				num_started++;
				should_put = 0;
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * if we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = write_one_cache_group(trans, root, path, cache);
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * very rare case so no need for a more efficient and
			 * complex approach.
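			 * Waiting for cur_trans->num_writers to drop to 1
			 * (the wait_event() below) is what guarantees those
			 * workers have finished, at which point the block
			 * group item should exist and the retry of
			 * write_one_cache_group() should succeed.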
3789 */ 3790 if (ret == -ENOENT) { 3791 wait_event(cur_trans->writer_wait, 3792 atomic_read(&cur_trans->num_writers) == 1); 3793 ret = write_one_cache_group(trans, root, path, 3794 cache); 3795 } 3796 if (ret) 3797 btrfs_abort_transaction(trans, root, ret); 3798 } 3799 3800 /* if its not on the io list, we need to put the block group */ 3801 if (should_put) 3802 btrfs_put_block_group(cache); 3803 spin_lock(&cur_trans->dirty_bgs_lock); 3804 } 3805 spin_unlock(&cur_trans->dirty_bgs_lock); 3806 3807 while (!list_empty(io)) { 3808 cache = list_first_entry(io, struct btrfs_block_group_cache, 3809 io_list); 3810 list_del_init(&cache->io_list); 3811 btrfs_wait_cache_io(root, trans, cache, 3812 &cache->io_ctl, path, cache->key.objectid); 3813 btrfs_put_block_group(cache); 3814 } 3815 3816 btrfs_free_path(path); 3817 return ret; 3818 } 3819 3820 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3821 { 3822 struct btrfs_block_group_cache *block_group; 3823 int readonly = 0; 3824 3825 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3826 if (!block_group || block_group->ro) 3827 readonly = 1; 3828 if (block_group) 3829 btrfs_put_block_group(block_group); 3830 return readonly; 3831 } 3832 3833 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3834 { 3835 struct btrfs_block_group_cache *bg; 3836 bool ret = true; 3837 3838 bg = btrfs_lookup_block_group(fs_info, bytenr); 3839 if (!bg) 3840 return false; 3841 3842 spin_lock(&bg->lock); 3843 if (bg->ro) 3844 ret = false; 3845 else 3846 atomic_inc(&bg->nocow_writers); 3847 spin_unlock(&bg->lock); 3848 3849 /* no put on block group, done by btrfs_dec_nocow_writers */ 3850 if (!ret) 3851 btrfs_put_block_group(bg); 3852 3853 return ret; 3854 3855 } 3856 3857 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3858 { 3859 struct btrfs_block_group_cache *bg; 3860 3861 bg = btrfs_lookup_block_group(fs_info, bytenr); 3862 ASSERT(bg); 3863 if (atomic_dec_and_test(&bg->nocow_writers)) 3864 wake_up_atomic_t(&bg->nocow_writers); 3865 /* 3866 * Once for our lookup and once for the lookup done by a previous call 3867 * to btrfs_inc_nocow_writers() 3868 */ 3869 btrfs_put_block_group(bg); 3870 btrfs_put_block_group(bg); 3871 } 3872 3873 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) 3874 { 3875 schedule(); 3876 return 0; 3877 } 3878 3879 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3880 { 3881 wait_on_atomic_t(&bg->nocow_writers, 3882 btrfs_wait_nocow_writers_atomic_t, 3883 TASK_UNINTERRUPTIBLE); 3884 } 3885 3886 static const char *alloc_name(u64 flags) 3887 { 3888 switch (flags) { 3889 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3890 return "mixed"; 3891 case BTRFS_BLOCK_GROUP_METADATA: 3892 return "metadata"; 3893 case BTRFS_BLOCK_GROUP_DATA: 3894 return "data"; 3895 case BTRFS_BLOCK_GROUP_SYSTEM: 3896 return "system"; 3897 default: 3898 WARN_ON(1); 3899 return "invalid-combination"; 3900 }; 3901 } 3902 3903 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3904 u64 total_bytes, u64 bytes_used, 3905 struct btrfs_space_info **space_info) 3906 { 3907 struct btrfs_space_info *found; 3908 int i; 3909 int factor; 3910 int ret; 3911 3912 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3913 BTRFS_BLOCK_GROUP_RAID10)) 3914 factor = 2; 3915 else 3916 factor = 1; 3917 3918 found = __find_space_info(info, flags); 3919 if (found) { 3920 spin_lock(&found->lock); 3921 found->total_bytes += total_bytes; 3922 found->disk_total += total_bytes 
* factor; 3923 found->bytes_used += bytes_used; 3924 found->disk_used += bytes_used * factor; 3925 if (total_bytes > 0) 3926 found->full = 0; 3927 spin_unlock(&found->lock); 3928 *space_info = found; 3929 return 0; 3930 } 3931 found = kzalloc(sizeof(*found), GFP_NOFS); 3932 if (!found) 3933 return -ENOMEM; 3934 3935 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3936 if (ret) { 3937 kfree(found); 3938 return ret; 3939 } 3940 3941 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3942 INIT_LIST_HEAD(&found->block_groups[i]); 3943 init_rwsem(&found->groups_sem); 3944 spin_lock_init(&found->lock); 3945 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3946 found->total_bytes = total_bytes; 3947 found->disk_total = total_bytes * factor; 3948 found->bytes_used = bytes_used; 3949 found->disk_used = bytes_used * factor; 3950 found->bytes_pinned = 0; 3951 found->bytes_reserved = 0; 3952 found->bytes_readonly = 0; 3953 found->bytes_may_use = 0; 3954 found->full = 0; 3955 found->max_extent_size = 0; 3956 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3957 found->chunk_alloc = 0; 3958 found->flush = 0; 3959 init_waitqueue_head(&found->wait); 3960 INIT_LIST_HEAD(&found->ro_bgs); 3961 3962 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3963 info->space_info_kobj, "%s", 3964 alloc_name(found->flags)); 3965 if (ret) { 3966 kfree(found); 3967 return ret; 3968 } 3969 3970 *space_info = found; 3971 list_add_rcu(&found->list, &info->space_info); 3972 if (flags & BTRFS_BLOCK_GROUP_DATA) 3973 info->data_sinfo = found; 3974 3975 return ret; 3976 } 3977 3978 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3979 { 3980 u64 extra_flags = chunk_to_extended(flags) & 3981 BTRFS_EXTENDED_PROFILE_MASK; 3982 3983 write_seqlock(&fs_info->profiles_lock); 3984 if (flags & BTRFS_BLOCK_GROUP_DATA) 3985 fs_info->avail_data_alloc_bits |= extra_flags; 3986 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3987 fs_info->avail_metadata_alloc_bits |= extra_flags; 3988 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3989 fs_info->avail_system_alloc_bits |= extra_flags; 3990 write_sequnlock(&fs_info->profiles_lock); 3991 } 3992 3993 /* 3994 * returns target flags in extended format or 0 if restripe for this 3995 * chunk_type is not in progress 3996 * 3997 * should be called with either volume_mutex or balance_lock held 3998 */ 3999 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4000 { 4001 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4002 u64 target = 0; 4003 4004 if (!bctl) 4005 return 0; 4006 4007 if (flags & BTRFS_BLOCK_GROUP_DATA && 4008 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4009 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4010 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4011 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4012 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4013 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4014 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4015 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4016 } 4017 4018 return target; 4019 } 4020 4021 /* 4022 * @flags: available profiles in extended format (see ctree.h) 4023 * 4024 * Returns reduced profile in chunk format. If profile changing is in 4025 * progress (either running or paused) picks the target profile (if it's 4026 * already available), otherwise falls back to plain reducing. 
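 *
 * Illustrative example (not taken from a real balance run): on a two-device
 * filesystem with @flags = DATA|RAID1|RAID0 and no restripe target, both
 * RAID1 and RAID0 survive the devs_min check below, RAID1 wins the fixed
 * preference order (RAID6 > RAID5 > RAID10 > RAID1 > RAID0) and the result
 * is DATA|RAID1 converted back to chunk format.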
4027 */ 4028 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 4029 { 4030 u64 num_devices = root->fs_info->fs_devices->rw_devices; 4031 u64 target; 4032 u64 raid_type; 4033 u64 allowed = 0; 4034 4035 /* 4036 * see if restripe for this chunk_type is in progress, if so 4037 * try to reduce to the target profile 4038 */ 4039 spin_lock(&root->fs_info->balance_lock); 4040 target = get_restripe_target(root->fs_info, flags); 4041 if (target) { 4042 /* pick target profile only if it's already available */ 4043 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4044 spin_unlock(&root->fs_info->balance_lock); 4045 return extended_to_chunk(target); 4046 } 4047 } 4048 spin_unlock(&root->fs_info->balance_lock); 4049 4050 /* First, mask out the RAID levels which aren't possible */ 4051 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4052 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4053 allowed |= btrfs_raid_group[raid_type]; 4054 } 4055 allowed &= flags; 4056 4057 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4058 allowed = BTRFS_BLOCK_GROUP_RAID6; 4059 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4060 allowed = BTRFS_BLOCK_GROUP_RAID5; 4061 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4062 allowed = BTRFS_BLOCK_GROUP_RAID10; 4063 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4064 allowed = BTRFS_BLOCK_GROUP_RAID1; 4065 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4066 allowed = BTRFS_BLOCK_GROUP_RAID0; 4067 4068 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4069 4070 return extended_to_chunk(flags | allowed); 4071 } 4072 4073 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 4074 { 4075 unsigned seq; 4076 u64 flags; 4077 4078 do { 4079 flags = orig_flags; 4080 seq = read_seqbegin(&root->fs_info->profiles_lock); 4081 4082 if (flags & BTRFS_BLOCK_GROUP_DATA) 4083 flags |= root->fs_info->avail_data_alloc_bits; 4084 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4085 flags |= root->fs_info->avail_system_alloc_bits; 4086 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4087 flags |= root->fs_info->avail_metadata_alloc_bits; 4088 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 4089 4090 return btrfs_reduce_alloc_profile(root, flags); 4091 } 4092 4093 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 4094 { 4095 u64 flags; 4096 u64 ret; 4097 4098 if (data) 4099 flags = BTRFS_BLOCK_GROUP_DATA; 4100 else if (root == root->fs_info->chunk_root) 4101 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4102 else 4103 flags = BTRFS_BLOCK_GROUP_METADATA; 4104 4105 ret = get_alloc_profile(root, flags); 4106 return ret; 4107 } 4108 4109 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) 4110 { 4111 struct btrfs_space_info *data_sinfo; 4112 struct btrfs_root *root = BTRFS_I(inode)->root; 4113 struct btrfs_fs_info *fs_info = root->fs_info; 4114 u64 used; 4115 int ret = 0; 4116 int need_commit = 2; 4117 int have_pinned_space; 4118 4119 /* make sure bytes are sectorsize aligned */ 4120 bytes = ALIGN(bytes, root->sectorsize); 4121 4122 if (btrfs_is_free_space_inode(inode)) { 4123 need_commit = 0; 4124 ASSERT(current->journal_info); 4125 } 4126 4127 data_sinfo = fs_info->data_sinfo; 4128 if (!data_sinfo) 4129 goto alloc; 4130 4131 again: 4132 /* make sure we have enough space to handle the data first */ 4133 spin_lock(&data_sinfo->lock); 4134 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 4135 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 4136 data_sinfo->bytes_may_use; 4137 4138 if (used + bytes > 
data_sinfo->total_bytes) { 4139 struct btrfs_trans_handle *trans; 4140 4141 /* 4142 * if we don't have enough free bytes in this space then we need 4143 * to alloc a new chunk. 4144 */ 4145 if (!data_sinfo->full) { 4146 u64 alloc_target; 4147 4148 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4149 spin_unlock(&data_sinfo->lock); 4150 alloc: 4151 alloc_target = btrfs_get_alloc_profile(root, 1); 4152 /* 4153 * It is ugly that we don't call nolock join 4154 * transaction for the free space inode case here. 4155 * But it is safe because we only do the data space 4156 * reservation for the free space cache in the 4157 * transaction context, the common join transaction 4158 * just increase the counter of the current transaction 4159 * handler, doesn't try to acquire the trans_lock of 4160 * the fs. 4161 */ 4162 trans = btrfs_join_transaction(root); 4163 if (IS_ERR(trans)) 4164 return PTR_ERR(trans); 4165 4166 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4167 alloc_target, 4168 CHUNK_ALLOC_NO_FORCE); 4169 btrfs_end_transaction(trans, root); 4170 if (ret < 0) { 4171 if (ret != -ENOSPC) 4172 return ret; 4173 else { 4174 have_pinned_space = 1; 4175 goto commit_trans; 4176 } 4177 } 4178 4179 if (!data_sinfo) 4180 data_sinfo = fs_info->data_sinfo; 4181 4182 goto again; 4183 } 4184 4185 /* 4186 * If we don't have enough pinned space to deal with this 4187 * allocation, and no removed chunk in current transaction, 4188 * don't bother committing the transaction. 4189 */ 4190 have_pinned_space = percpu_counter_compare( 4191 &data_sinfo->total_bytes_pinned, 4192 used + bytes - data_sinfo->total_bytes); 4193 spin_unlock(&data_sinfo->lock); 4194 4195 /* commit the current transaction and try again */ 4196 commit_trans: 4197 if (need_commit && 4198 !atomic_read(&root->fs_info->open_ioctl_trans)) { 4199 need_commit--; 4200 4201 if (need_commit > 0) { 4202 btrfs_start_delalloc_roots(fs_info, 0, -1); 4203 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); 4204 } 4205 4206 trans = btrfs_join_transaction(root); 4207 if (IS_ERR(trans)) 4208 return PTR_ERR(trans); 4209 if (have_pinned_space >= 0 || 4210 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4211 &trans->transaction->flags) || 4212 need_commit > 0) { 4213 ret = btrfs_commit_transaction(trans, root); 4214 if (ret) 4215 return ret; 4216 /* 4217 * The cleaner kthread might still be doing iput 4218 * operations. Wait for it to finish so that 4219 * more space is released. 4220 */ 4221 mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); 4222 mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); 4223 goto again; 4224 } else { 4225 btrfs_end_transaction(trans, root); 4226 } 4227 } 4228 4229 trace_btrfs_space_reservation(root->fs_info, 4230 "space_info:enospc", 4231 data_sinfo->flags, bytes, 1); 4232 return -ENOSPC; 4233 } 4234 data_sinfo->bytes_may_use += bytes; 4235 trace_btrfs_space_reservation(root->fs_info, "space_info", 4236 data_sinfo->flags, bytes, 1); 4237 spin_unlock(&data_sinfo->lock); 4238 4239 return ret; 4240 } 4241 4242 /* 4243 * New check_data_free_space() with ability for precious data reservation 4244 * Will replace old btrfs_check_data_free_space(), but for patch split, 4245 * add a new function first and then replace it. 
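 *
 * A rough caller sketch (illustrative only, error handling trimmed;
 * do_the_write() is a hypothetical placeholder):
 *
 *	ret = btrfs_check_data_free_space(inode, pos, count);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_the_write(inode, pos, count);
 *	if (ret < 0)
 *		btrfs_free_reserved_data_space(inode, pos, count);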
4246 */ 4247 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) 4248 { 4249 struct btrfs_root *root = BTRFS_I(inode)->root; 4250 int ret; 4251 4252 /* align the range */ 4253 len = round_up(start + len, root->sectorsize) - 4254 round_down(start, root->sectorsize); 4255 start = round_down(start, root->sectorsize); 4256 4257 ret = btrfs_alloc_data_chunk_ondemand(inode, len); 4258 if (ret < 0) 4259 return ret; 4260 4261 /* 4262 * Use new btrfs_qgroup_reserve_data to reserve precious data space 4263 * 4264 * TODO: Find a good method to avoid reserve data space for NOCOW 4265 * range, but don't impact performance on quota disable case. 4266 */ 4267 ret = btrfs_qgroup_reserve_data(inode, start, len); 4268 return ret; 4269 } 4270 4271 /* 4272 * Called if we need to clear a data reservation for this inode 4273 * Normally in a error case. 4274 * 4275 * This one will *NOT* use accurate qgroup reserved space API, just for case 4276 * which we can't sleep and is sure it won't affect qgroup reserved space. 4277 * Like clear_bit_hook(). 4278 */ 4279 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4280 u64 len) 4281 { 4282 struct btrfs_root *root = BTRFS_I(inode)->root; 4283 struct btrfs_space_info *data_sinfo; 4284 4285 /* Make sure the range is aligned to sectorsize */ 4286 len = round_up(start + len, root->sectorsize) - 4287 round_down(start, root->sectorsize); 4288 start = round_down(start, root->sectorsize); 4289 4290 data_sinfo = root->fs_info->data_sinfo; 4291 spin_lock(&data_sinfo->lock); 4292 if (WARN_ON(data_sinfo->bytes_may_use < len)) 4293 data_sinfo->bytes_may_use = 0; 4294 else 4295 data_sinfo->bytes_may_use -= len; 4296 trace_btrfs_space_reservation(root->fs_info, "space_info", 4297 data_sinfo->flags, len, 0); 4298 spin_unlock(&data_sinfo->lock); 4299 } 4300 4301 /* 4302 * Called if we need to clear a data reservation for this inode 4303 * Normally in a error case. 4304 * 4305 * This one will handle the per-inode data rsv map for accurate reserved 4306 * space framework. 4307 */ 4308 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) 4309 { 4310 btrfs_free_reserved_data_space_noquota(inode, start, len); 4311 btrfs_qgroup_free_data(inode, start, len); 4312 } 4313 4314 static void force_metadata_allocation(struct btrfs_fs_info *info) 4315 { 4316 struct list_head *head = &info->space_info; 4317 struct btrfs_space_info *found; 4318 4319 rcu_read_lock(); 4320 list_for_each_entry_rcu(found, head, list) { 4321 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4322 found->force_alloc = CHUNK_ALLOC_FORCE; 4323 } 4324 rcu_read_unlock(); 4325 } 4326 4327 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4328 { 4329 return (global->size << 1); 4330 } 4331 4332 static int should_alloc_chunk(struct btrfs_root *root, 4333 struct btrfs_space_info *sinfo, int force) 4334 { 4335 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4336 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 4337 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 4338 u64 thresh; 4339 4340 if (force == CHUNK_ALLOC_FORCE) 4341 return 1; 4342 4343 /* 4344 * We need to take into account the global rsv because for all intents 4345 * and purposes it's used space. Don't worry about locking the 4346 * global_rsv, it doesn't change except when the transaction commits. 
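 *
 * For example (illustrative numbers): with a 512M global reserve,
 * calc_global_rsv_need_space() below counts an extra 1G (size << 1) as
 * already allocated metadata when deciding whether a new chunk is needed.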
4347 */ 4348 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 4349 num_allocated += calc_global_rsv_need_space(global_rsv); 4350 4351 /* 4352 * in limited mode, we want to have some free space up to 4353 * about 1% of the FS size. 4354 */ 4355 if (force == CHUNK_ALLOC_LIMITED) { 4356 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 4357 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4358 4359 if (num_bytes - num_allocated < thresh) 4360 return 1; 4361 } 4362 4363 if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) 4364 return 0; 4365 return 1; 4366 } 4367 4368 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) 4369 { 4370 u64 num_dev; 4371 4372 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4373 BTRFS_BLOCK_GROUP_RAID0 | 4374 BTRFS_BLOCK_GROUP_RAID5 | 4375 BTRFS_BLOCK_GROUP_RAID6)) 4376 num_dev = root->fs_info->fs_devices->rw_devices; 4377 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4378 num_dev = 2; 4379 else 4380 num_dev = 1; /* DUP or single */ 4381 4382 return num_dev; 4383 } 4384 4385 /* 4386 * Reserve space in the system space info necessary for allocating a chunk 4387 * or for removing a chunk, since either operation may need to update device 4388 * items and add or remove a chunk item. 4389 */ 4390 void check_system_chunk(struct btrfs_trans_handle *trans, 4391 struct btrfs_root *root, 4392 u64 type) 4393 { 4394 struct btrfs_space_info *info; 4395 u64 left; 4396 u64 thresh; 4397 int ret = 0; 4398 u64 num_devs; 4399 4400 /* 4401 * Needed because we can end up allocating a system chunk and need an 4402 * atomic and race-free space reservation in the chunk block reserve. 4403 */ 4404 ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); 4405 4406 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4407 spin_lock(&info->lock); 4408 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 4409 info->bytes_reserved - info->bytes_readonly - 4410 info->bytes_may_use; 4411 spin_unlock(&info->lock); 4412 4413 num_devs = get_profile_num_devs(root, type); 4414 4415 /* num_devs device items to update and 1 chunk item to add or remove */ 4416 thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + 4417 btrfs_calc_trans_metadata_size(root, 1); 4418 4419 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 4420 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 4421 left, thresh, type); 4422 dump_space_info(info, 0, 0); 4423 } 4424 4425 if (left < thresh) { 4426 u64 flags; 4427 4428 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 4429 /* 4430 * Ignore failure to create system chunk. We might end up not 4431 * needing it, as we might not need to COW all nodes/leafs from 4432 * the paths we visit in the chunk tree (they were already COWed 4433 * or created in the current transaction for example).
4434 */ 4435 ret = btrfs_alloc_chunk(trans, root, flags); 4436 } 4437 4438 if (!ret) { 4439 ret = btrfs_block_rsv_add(root->fs_info->chunk_root, 4440 &root->fs_info->chunk_block_rsv, 4441 thresh, BTRFS_RESERVE_NO_FLUSH); 4442 if (!ret) 4443 trans->chunk_bytes_reserved += thresh; 4444 } 4445 } 4446 4447 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 4448 struct btrfs_root *extent_root, u64 flags, int force) 4449 { 4450 struct btrfs_space_info *space_info; 4451 struct btrfs_fs_info *fs_info = extent_root->fs_info; 4452 int wait_for_alloc = 0; 4453 int ret = 0; 4454 4455 /* Don't re-enter if we're already allocating a chunk */ 4456 if (trans->allocating_chunk) 4457 return -ENOSPC; 4458 4459 space_info = __find_space_info(extent_root->fs_info, flags); 4460 if (!space_info) { 4461 ret = update_space_info(extent_root->fs_info, flags, 4462 0, 0, &space_info); 4463 BUG_ON(ret); /* -ENOMEM */ 4464 } 4465 BUG_ON(!space_info); /* Logic error */ 4466 4467 again: 4468 spin_lock(&space_info->lock); 4469 if (force < space_info->force_alloc) 4470 force = space_info->force_alloc; 4471 if (space_info->full) { 4472 if (should_alloc_chunk(extent_root, space_info, force)) 4473 ret = -ENOSPC; 4474 else 4475 ret = 0; 4476 spin_unlock(&space_info->lock); 4477 return ret; 4478 } 4479 4480 if (!should_alloc_chunk(extent_root, space_info, force)) { 4481 spin_unlock(&space_info->lock); 4482 return 0; 4483 } else if (space_info->chunk_alloc) { 4484 wait_for_alloc = 1; 4485 } else { 4486 space_info->chunk_alloc = 1; 4487 } 4488 4489 spin_unlock(&space_info->lock); 4490 4491 mutex_lock(&fs_info->chunk_mutex); 4492 4493 /* 4494 * The chunk_mutex is held throughout the entirety of a chunk 4495 * allocation, so once we've acquired the chunk_mutex we know that the 4496 * other guy is done and we need to recheck and see if we should 4497 * allocate. 4498 */ 4499 if (wait_for_alloc) { 4500 mutex_unlock(&fs_info->chunk_mutex); 4501 wait_for_alloc = 0; 4502 goto again; 4503 } 4504 4505 trans->allocating_chunk = true; 4506 4507 /* 4508 * If we have mixed data/metadata chunks we want to make sure we keep 4509 * allocating mixed chunks instead of individual chunks. 4510 */ 4511 if (btrfs_mixed_space_info(space_info)) 4512 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4513 4514 /* 4515 * if we're doing a data chunk, go ahead and make sure that 4516 * we keep a reasonable number of metadata chunks allocated in the 4517 * FS as well. 4518 */ 4519 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4520 fs_info->data_chunk_allocations++; 4521 if (!(fs_info->data_chunk_allocations % 4522 fs_info->metadata_ratio)) 4523 force_metadata_allocation(fs_info); 4524 } 4525 4526 /* 4527 * Check if we have enough space in SYSTEM chunk because we may need 4528 * to update devices. 
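 *
 * Illustration: allocating a RAID10 data chunk on a 4-device filesystem may
 * mean updating 4 device items plus inserting 1 chunk item, which is what
 * check_system_chunk() (above) sizes its reservation threshold for.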
4529 */ 4530 check_system_chunk(trans, extent_root, flags); 4531 4532 ret = btrfs_alloc_chunk(trans, extent_root, flags); 4533 trans->allocating_chunk = false; 4534 4535 spin_lock(&space_info->lock); 4536 if (ret < 0 && ret != -ENOSPC) 4537 goto out; 4538 if (ret) 4539 space_info->full = 1; 4540 else 4541 ret = 1; 4542 4543 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4544 out: 4545 space_info->chunk_alloc = 0; 4546 spin_unlock(&space_info->lock); 4547 mutex_unlock(&fs_info->chunk_mutex); 4548 /* 4549 * When we allocate a new chunk we reserve space in the chunk block 4550 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4551 * add new nodes/leafs to it if we end up needing to do it when 4552 * inserting the chunk item and updating device items as part of the 4553 * second phase of chunk allocation, performed by 4554 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4555 * large number of new block groups to create in our transaction 4556 * handle's new_bgs list to avoid exhausting the chunk block reserve 4557 * in extreme cases - like having a single transaction create many new 4558 * block groups when starting to write out the free space caches of all 4559 * the block groups that were made dirty during the lifetime of the 4560 * transaction. 4561 */ 4562 if (trans->can_flush_pending_bgs && 4563 trans->chunk_bytes_reserved >= (u64)SZ_2M) { 4564 btrfs_create_pending_block_groups(trans, trans->root); 4565 btrfs_trans_release_chunk_metadata(trans); 4566 } 4567 return ret; 4568 } 4569 4570 static int can_overcommit(struct btrfs_root *root, 4571 struct btrfs_space_info *space_info, u64 bytes, 4572 enum btrfs_reserve_flush_enum flush) 4573 { 4574 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4575 u64 profile = btrfs_get_alloc_profile(root, 0); 4576 u64 space_size; 4577 u64 avail; 4578 u64 used; 4579 4580 used = space_info->bytes_used + space_info->bytes_reserved + 4581 space_info->bytes_pinned + space_info->bytes_readonly; 4582 4583 /* 4584 * We only want to allow over committing if we have lots of actual space 4585 * free, but if we don't have enough space to handle the global reserve 4586 * space then we could end up having a real enospc problem when trying 4587 * to allocate a chunk or some other such important allocation. 4588 */ 4589 spin_lock(&global_rsv->lock); 4590 space_size = calc_global_rsv_need_space(global_rsv); 4591 spin_unlock(&global_rsv->lock); 4592 if (used + space_size >= space_info->total_bytes) 4593 return 0; 4594 4595 used += space_info->bytes_may_use; 4596 4597 spin_lock(&root->fs_info->free_chunk_lock); 4598 avail = root->fs_info->free_chunk_space; 4599 spin_unlock(&root->fs_info->free_chunk_lock); 4600 4601 /* 4602 * If we have dup, raid1 or raid10 then only half of the free 4603 * space is actually useable. For raid56, the space info used 4604 * doesn't include the parity drive, so we don't have to 4605 * change the math 4606 */ 4607 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4608 BTRFS_BLOCK_GROUP_RAID1 | 4609 BTRFS_BLOCK_GROUP_RAID10)) 4610 avail >>= 1; 4611 4612 /* 4613 * If we aren't flushing all things, let us overcommit up to 4614 * 1/2th of the space. If we can flush, don't let us overcommit 4615 * too much, let it overcommit up to 1/8 of the space. 
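 *
 * Worked example (illustrative numbers): 8G of unallocated device space
 * under a RAID1 profile is halved to 4G above, then capped here to 512M of
 * overcommit for BTRFS_RESERVE_FLUSH_ALL (>> 3) or 2G otherwise (>> 1).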
4616 */ 4617 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4618 avail >>= 3; 4619 else 4620 avail >>= 1; 4621 4622 if (used + bytes < space_info->total_bytes + avail) 4623 return 1; 4624 return 0; 4625 } 4626 4627 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4628 unsigned long nr_pages, int nr_items) 4629 { 4630 struct super_block *sb = root->fs_info->sb; 4631 4632 if (down_read_trylock(&sb->s_umount)) { 4633 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4634 up_read(&sb->s_umount); 4635 } else { 4636 /* 4637 * We needn't worry the filesystem going from r/w to r/o though 4638 * we don't acquire ->s_umount mutex, because the filesystem 4639 * should guarantee the delalloc inodes list be empty after 4640 * the filesystem is readonly(all dirty pages are written to 4641 * the disk). 4642 */ 4643 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4644 if (!current->journal_info) 4645 btrfs_wait_ordered_roots(root->fs_info, nr_items, 4646 0, (u64)-1); 4647 } 4648 } 4649 4650 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4651 { 4652 u64 bytes; 4653 int nr; 4654 4655 bytes = btrfs_calc_trans_metadata_size(root, 1); 4656 nr = (int)div64_u64(to_reclaim, bytes); 4657 if (!nr) 4658 nr = 1; 4659 return nr; 4660 } 4661 4662 #define EXTENT_SIZE_PER_ITEM SZ_256K 4663 4664 /* 4665 * shrink metadata reservation for delalloc 4666 */ 4667 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4668 bool wait_ordered) 4669 { 4670 struct btrfs_block_rsv *block_rsv; 4671 struct btrfs_space_info *space_info; 4672 struct btrfs_trans_handle *trans; 4673 u64 delalloc_bytes; 4674 u64 max_reclaim; 4675 long time_left; 4676 unsigned long nr_pages; 4677 int loops; 4678 int items; 4679 enum btrfs_reserve_flush_enum flush; 4680 4681 /* Calc the number of the pages we need flush for space reservation */ 4682 items = calc_reclaim_items_nr(root, to_reclaim); 4683 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; 4684 4685 trans = (struct btrfs_trans_handle *)current->journal_info; 4686 block_rsv = &root->fs_info->delalloc_block_rsv; 4687 space_info = block_rsv->space_info; 4688 4689 delalloc_bytes = percpu_counter_sum_positive( 4690 &root->fs_info->delalloc_bytes); 4691 if (delalloc_bytes == 0) { 4692 if (trans) 4693 return; 4694 if (wait_ordered) 4695 btrfs_wait_ordered_roots(root->fs_info, items, 4696 0, (u64)-1); 4697 return; 4698 } 4699 4700 loops = 0; 4701 while (delalloc_bytes && loops < 3) { 4702 max_reclaim = min(delalloc_bytes, to_reclaim); 4703 nr_pages = max_reclaim >> PAGE_SHIFT; 4704 btrfs_writeback_inodes_sb_nr(root, nr_pages, items); 4705 /* 4706 * We need to wait for the async pages to actually start before 4707 * we do anything. 
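 *
 * E.g. (illustrative): if 1000 async delalloc pages are in flight and we
 * just asked writeback for nr_pages = 200 of them, we wait below until the
 * counter drops to 800 before rechecking whether the reservation now fits.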
4708 */ 4709 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); 4710 if (!max_reclaim) 4711 goto skip_async; 4712 4713 if (max_reclaim <= nr_pages) 4714 max_reclaim = 0; 4715 else 4716 max_reclaim -= nr_pages; 4717 4718 wait_event(root->fs_info->async_submit_wait, 4719 atomic_read(&root->fs_info->async_delalloc_pages) <= 4720 (int)max_reclaim); 4721 skip_async: 4722 if (!trans) 4723 flush = BTRFS_RESERVE_FLUSH_ALL; 4724 else 4725 flush = BTRFS_RESERVE_NO_FLUSH; 4726 spin_lock(&space_info->lock); 4727 if (can_overcommit(root, space_info, orig, flush)) { 4728 spin_unlock(&space_info->lock); 4729 break; 4730 } 4731 spin_unlock(&space_info->lock); 4732 4733 loops++; 4734 if (wait_ordered && !trans) { 4735 btrfs_wait_ordered_roots(root->fs_info, items, 4736 0, (u64)-1); 4737 } else { 4738 time_left = schedule_timeout_killable(1); 4739 if (time_left) 4740 break; 4741 } 4742 delalloc_bytes = percpu_counter_sum_positive( 4743 &root->fs_info->delalloc_bytes); 4744 } 4745 } 4746 4747 /** 4748 * may_commit_transaction - possibly commit the transaction if it's ok to 4749 * @root - the root we're allocating for 4750 * @bytes - the number of bytes we want to reserve from @space_info 4751 * @force - force the commit 4752 * 4753 * This will check to make sure that committing the transaction will actually 4754 * get us somewhere and then commit the transaction if it does. Otherwise it 4755 * will return -ENOSPC. 4756 */ 4757 static int may_commit_transaction(struct btrfs_root *root, 4758 struct btrfs_space_info *space_info, 4759 u64 bytes, int force) 4760 { 4761 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 4762 struct btrfs_trans_handle *trans; 4763 4764 trans = (struct btrfs_trans_handle *)current->journal_info; 4765 if (trans) 4766 return -EAGAIN; 4767 4768 if (force) 4769 goto commit; 4770 4771 /* See if there is enough pinned space to make this reservation */ 4772 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4773 bytes) >= 0) 4774 goto commit; 4775 4776 /* 4777 * See if there is some space in the delayed insertion reservation for 4778 * this reservation.
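 *
 * That is, the delayed_block_rsv holds metadata space for delayed inode
 * items that a commit would flush out, so its size is factored into the
 * pinned-bytes comparison below before we decide whether a commit is
 * worthwhile.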
4779 */ 4780 if (space_info != delayed_rsv->space_info) 4781 return -ENOSPC; 4782 4783 spin_lock(&delayed_rsv->lock); 4784 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4785 bytes - delayed_rsv->size) >= 0) { 4786 spin_unlock(&delayed_rsv->lock); 4787 return -ENOSPC; 4788 } 4789 spin_unlock(&delayed_rsv->lock); 4790 4791 commit: 4792 trans = btrfs_join_transaction(root); 4793 if (IS_ERR(trans)) 4794 return -ENOSPC; 4795 4796 return btrfs_commit_transaction(trans, root); 4797 } 4798 4799 enum flush_state { 4800 FLUSH_DELAYED_ITEMS_NR = 1, 4801 FLUSH_DELAYED_ITEMS = 2, 4802 FLUSH_DELALLOC = 3, 4803 FLUSH_DELALLOC_WAIT = 4, 4804 ALLOC_CHUNK = 5, 4805 COMMIT_TRANS = 6, 4806 }; 4807 4808 static int flush_space(struct btrfs_root *root, 4809 struct btrfs_space_info *space_info, u64 num_bytes, 4810 u64 orig_bytes, int state) 4811 { 4812 struct btrfs_trans_handle *trans; 4813 int nr; 4814 int ret = 0; 4815 4816 switch (state) { 4817 case FLUSH_DELAYED_ITEMS_NR: 4818 case FLUSH_DELAYED_ITEMS: 4819 if (state == FLUSH_DELAYED_ITEMS_NR) 4820 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4821 else 4822 nr = -1; 4823 4824 trans = btrfs_join_transaction(root); 4825 if (IS_ERR(trans)) { 4826 ret = PTR_ERR(trans); 4827 break; 4828 } 4829 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4830 btrfs_end_transaction(trans, root); 4831 break; 4832 case FLUSH_DELALLOC: 4833 case FLUSH_DELALLOC_WAIT: 4834 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4835 state == FLUSH_DELALLOC_WAIT); 4836 break; 4837 case ALLOC_CHUNK: 4838 trans = btrfs_join_transaction(root); 4839 if (IS_ERR(trans)) { 4840 ret = PTR_ERR(trans); 4841 break; 4842 } 4843 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4844 btrfs_get_alloc_profile(root, 0), 4845 CHUNK_ALLOC_NO_FORCE); 4846 btrfs_end_transaction(trans, root); 4847 if (ret == -ENOSPC) 4848 ret = 0; 4849 break; 4850 case COMMIT_TRANS: 4851 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4852 break; 4853 default: 4854 ret = -ENOSPC; 4855 break; 4856 } 4857 4858 return ret; 4859 } 4860 4861 static inline u64 4862 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4863 struct btrfs_space_info *space_info) 4864 { 4865 u64 used; 4866 u64 expected; 4867 u64 to_reclaim; 4868 4869 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4870 spin_lock(&space_info->lock); 4871 if (can_overcommit(root, space_info, to_reclaim, 4872 BTRFS_RESERVE_FLUSH_ALL)) { 4873 to_reclaim = 0; 4874 goto out; 4875 } 4876 4877 used = space_info->bytes_used + space_info->bytes_reserved + 4878 space_info->bytes_pinned + space_info->bytes_readonly + 4879 space_info->bytes_may_use; 4880 if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 4881 expected = div_factor_fine(space_info->total_bytes, 95); 4882 else 4883 expected = div_factor_fine(space_info->total_bytes, 90); 4884 4885 if (used > expected) 4886 to_reclaim = used - expected; 4887 else 4888 to_reclaim = 0; 4889 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4890 space_info->bytes_reserved); 4891 out: 4892 spin_unlock(&space_info->lock); 4893 4894 return to_reclaim; 4895 } 4896 4897 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4898 struct btrfs_fs_info *fs_info, u64 used) 4899 { 4900 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4901 4902 /* If we're just plain full then async reclaim just slows us down. 
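 * E.g. (illustrative): in a 10G metadata space_info the 98% threshold
 * above is roughly 9.8G; once bytes_used + bytes_reserved crosses it,
 * waking the async reclaim worker cannot win back anything useful.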
*/ 4903 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4904 return 0; 4905 4906 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4907 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4908 } 4909 4910 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, 4911 struct btrfs_fs_info *fs_info, 4912 int flush_state) 4913 { 4914 u64 used; 4915 4916 spin_lock(&space_info->lock); 4917 /* 4918 * We run out of space and have not got any free space via flush_space, 4919 * so don't bother doing async reclaim. 4920 */ 4921 if (flush_state > COMMIT_TRANS && space_info->full) { 4922 spin_unlock(&space_info->lock); 4923 return 0; 4924 } 4925 4926 used = space_info->bytes_used + space_info->bytes_reserved + 4927 space_info->bytes_pinned + space_info->bytes_readonly + 4928 space_info->bytes_may_use; 4929 if (need_do_async_reclaim(space_info, fs_info, used)) { 4930 spin_unlock(&space_info->lock); 4931 return 1; 4932 } 4933 spin_unlock(&space_info->lock); 4934 4935 return 0; 4936 } 4937 4938 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4939 { 4940 struct btrfs_fs_info *fs_info; 4941 struct btrfs_space_info *space_info; 4942 u64 to_reclaim; 4943 int flush_state; 4944 4945 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4946 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4947 4948 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4949 space_info); 4950 if (!to_reclaim) 4951 return; 4952 4953 flush_state = FLUSH_DELAYED_ITEMS_NR; 4954 do { 4955 flush_space(fs_info->fs_root, space_info, to_reclaim, 4956 to_reclaim, flush_state); 4957 flush_state++; 4958 if (!btrfs_need_do_async_reclaim(space_info, fs_info, 4959 flush_state)) 4960 return; 4961 } while (flush_state < COMMIT_TRANS); 4962 } 4963 4964 void btrfs_init_async_reclaim_work(struct work_struct *work) 4965 { 4966 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 4967 } 4968 4969 /** 4970 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4971 * @root - the root we're allocating for 4972 * @block_rsv - the block_rsv we're allocating for 4973 * @orig_bytes - the number of bytes we want 4974 * @flush - whether or not we can flush to make our reservation 4975 * 4976 * This will reserve orig_bytes number of bytes from the space info associated 4977 * with the block_rsv. If there is not enough space it will make an attempt to 4978 * flush out space to make room. It will do this by flushing delalloc if 4979 * possible or committing the transaction. If flush is 0 then no attempts to 4980 * regain reservations will be made and this will fail if there is not enough 4981 * space already. 4982 */ 4983 static int reserve_metadata_bytes(struct btrfs_root *root, 4984 struct btrfs_block_rsv *block_rsv, 4985 u64 orig_bytes, 4986 enum btrfs_reserve_flush_enum flush) 4987 { 4988 struct btrfs_space_info *space_info = block_rsv->space_info; 4989 u64 used; 4990 u64 num_bytes = orig_bytes; 4991 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4992 int ret = 0; 4993 bool flushing = false; 4994 4995 again: 4996 ret = 0; 4997 spin_lock(&space_info->lock); 4998 /* 4999 * We only want to wait if somebody other than us is flushing and we 5000 * are actually allowed to flush all things. 
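 *
 * Concretely (illustrative): a BTRFS_RESERVE_FLUSH_ALL caller that finds
 * space_info->flush already set parks on space_info->wait below (or returns
 * -EAGAIN if it holds a transaction handle), while NO_FLUSH and FLUSH_LIMIT
 * callers skip the wait and attempt the reservation immediately.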
5001 */ 5002 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 5003 space_info->flush) { 5004 spin_unlock(&space_info->lock); 5005 /* 5006 * If we have a trans handle we can't wait because the flusher 5007 * may have to commit the transaction, which would mean we would 5008 * deadlock since we are waiting for the flusher to finish, but 5009 * hold the current transaction open. 5010 */ 5011 if (current->journal_info) 5012 return -EAGAIN; 5013 ret = wait_event_killable(space_info->wait, !space_info->flush); 5014 /* Must have been killed, return */ 5015 if (ret) 5016 return -EINTR; 5017 5018 spin_lock(&space_info->lock); 5019 } 5020 5021 ret = -ENOSPC; 5022 used = space_info->bytes_used + space_info->bytes_reserved + 5023 space_info->bytes_pinned + space_info->bytes_readonly + 5024 space_info->bytes_may_use; 5025 5026 /* 5027 * The idea here is that we've not already over-reserved the block group 5028 * then we can go ahead and save our reservation first and then start 5029 * flushing if we need to. Otherwise if we've already overcommitted 5030 * lets start flushing stuff first and then come back and try to make 5031 * our reservation. 5032 */ 5033 if (used <= space_info->total_bytes) { 5034 if (used + orig_bytes <= space_info->total_bytes) { 5035 space_info->bytes_may_use += orig_bytes; 5036 trace_btrfs_space_reservation(root->fs_info, 5037 "space_info", space_info->flags, orig_bytes, 1); 5038 ret = 0; 5039 } else { 5040 /* 5041 * Ok set num_bytes to orig_bytes since we aren't 5042 * overocmmitted, this way we only try and reclaim what 5043 * we need. 5044 */ 5045 num_bytes = orig_bytes; 5046 } 5047 } else { 5048 /* 5049 * Ok we're over committed, set num_bytes to the overcommitted 5050 * amount plus the amount of bytes that we need for this 5051 * reservation. 5052 */ 5053 num_bytes = used - space_info->total_bytes + 5054 (orig_bytes * 2); 5055 } 5056 5057 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 5058 space_info->bytes_may_use += orig_bytes; 5059 trace_btrfs_space_reservation(root->fs_info, "space_info", 5060 space_info->flags, orig_bytes, 5061 1); 5062 ret = 0; 5063 } 5064 5065 /* 5066 * Couldn't make our reservation, save our place so while we're trying 5067 * to reclaim space we can actually use it instead of somebody else 5068 * stealing it from us. 5069 * 5070 * We make the other tasks wait for the flush only when we can flush 5071 * all things. 5072 */ 5073 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5074 flushing = true; 5075 space_info->flush = 1; 5076 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5077 used += orig_bytes; 5078 /* 5079 * We will do the space reservation dance during log replay, 5080 * which means we won't have fs_info->fs_root set, so don't do 5081 * the async reclaim as we will panic. 5082 */ 5083 if (!root->fs_info->log_root_recovering && 5084 need_do_async_reclaim(space_info, root->fs_info, used) && 5085 !work_busy(&root->fs_info->async_reclaim_work)) 5086 queue_work(system_unbound_wq, 5087 &root->fs_info->async_reclaim_work); 5088 } 5089 spin_unlock(&space_info->lock); 5090 5091 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5092 goto out; 5093 5094 ret = flush_space(root, space_info, num_bytes, orig_bytes, 5095 flush_state); 5096 flush_state++; 5097 5098 /* 5099 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 5100 * would happen. So skip delalloc flush. 
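 *
 * In other words (illustrative): a BTRFS_RESERVE_FLUSH_LIMIT reservation
 * skips the FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT states entirely and
 * moves straight on to ALLOC_CHUNK below.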
5101 */ 5102 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 5103 (flush_state == FLUSH_DELALLOC || 5104 flush_state == FLUSH_DELALLOC_WAIT)) 5105 flush_state = ALLOC_CHUNK; 5106 5107 if (!ret) 5108 goto again; 5109 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 5110 flush_state < COMMIT_TRANS) 5111 goto again; 5112 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 5113 flush_state <= COMMIT_TRANS) 5114 goto again; 5115 5116 out: 5117 if (ret == -ENOSPC && 5118 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5119 struct btrfs_block_rsv *global_rsv = 5120 &root->fs_info->global_block_rsv; 5121 5122 if (block_rsv != global_rsv && 5123 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5124 ret = 0; 5125 } 5126 if (ret == -ENOSPC) 5127 trace_btrfs_space_reservation(root->fs_info, 5128 "space_info:enospc", 5129 space_info->flags, orig_bytes, 1); 5130 if (flushing) { 5131 spin_lock(&space_info->lock); 5132 space_info->flush = 0; 5133 wake_up_all(&space_info->wait); 5134 spin_unlock(&space_info->lock); 5135 } 5136 return ret; 5137 } 5138 5139 static struct btrfs_block_rsv *get_block_rsv( 5140 const struct btrfs_trans_handle *trans, 5141 const struct btrfs_root *root) 5142 { 5143 struct btrfs_block_rsv *block_rsv = NULL; 5144 5145 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5146 (root == root->fs_info->csum_root && trans->adding_csums) || 5147 (root == root->fs_info->uuid_root)) 5148 block_rsv = trans->block_rsv; 5149 5150 if (!block_rsv) 5151 block_rsv = root->block_rsv; 5152 5153 if (!block_rsv) 5154 block_rsv = &root->fs_info->empty_block_rsv; 5155 5156 return block_rsv; 5157 } 5158 5159 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5160 u64 num_bytes) 5161 { 5162 int ret = -ENOSPC; 5163 spin_lock(&block_rsv->lock); 5164 if (block_rsv->reserved >= num_bytes) { 5165 block_rsv->reserved -= num_bytes; 5166 if (block_rsv->reserved < block_rsv->size) 5167 block_rsv->full = 0; 5168 ret = 0; 5169 } 5170 spin_unlock(&block_rsv->lock); 5171 return ret; 5172 } 5173 5174 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5175 u64 num_bytes, int update_size) 5176 { 5177 spin_lock(&block_rsv->lock); 5178 block_rsv->reserved += num_bytes; 5179 if (update_size) 5180 block_rsv->size += num_bytes; 5181 else if (block_rsv->reserved >= block_rsv->size) 5182 block_rsv->full = 1; 5183 spin_unlock(&block_rsv->lock); 5184 } 5185 5186 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5187 struct btrfs_block_rsv *dest, u64 num_bytes, 5188 int min_factor) 5189 { 5190 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5191 u64 min_bytes; 5192 5193 if (global_rsv->space_info != dest->space_info) 5194 return -ENOSPC; 5195 5196 spin_lock(&global_rsv->lock); 5197 min_bytes = div_factor(global_rsv->size, min_factor); 5198 if (global_rsv->reserved < min_bytes + num_bytes) { 5199 spin_unlock(&global_rsv->lock); 5200 return -ENOSPC; 5201 } 5202 global_rsv->reserved -= num_bytes; 5203 if (global_rsv->reserved < global_rsv->size) 5204 global_rsv->full = 0; 5205 spin_unlock(&global_rsv->lock); 5206 5207 block_rsv_add_bytes(dest, num_bytes, 1); 5208 return 0; 5209 } 5210 5211 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5212 struct btrfs_block_rsv *block_rsv, 5213 struct btrfs_block_rsv *dest, u64 num_bytes) 5214 { 5215 struct btrfs_space_info *space_info = block_rsv->space_info; 5216 5217 spin_lock(&block_rsv->lock); 5218 if (num_bytes == (u64)-1) 5219 num_bytes = block_rsv->size; 5220 block_rsv->size -= num_bytes; 5221 if 
(block_rsv->reserved >= block_rsv->size) { 5222 num_bytes = block_rsv->reserved - block_rsv->size; 5223 block_rsv->reserved = block_rsv->size; 5224 block_rsv->full = 1; 5225 } else { 5226 num_bytes = 0; 5227 } 5228 spin_unlock(&block_rsv->lock); 5229 5230 if (num_bytes > 0) { 5231 if (dest) { 5232 spin_lock(&dest->lock); 5233 if (!dest->full) { 5234 u64 bytes_to_add; 5235 5236 bytes_to_add = dest->size - dest->reserved; 5237 bytes_to_add = min(num_bytes, bytes_to_add); 5238 dest->reserved += bytes_to_add; 5239 if (dest->reserved >= dest->size) 5240 dest->full = 1; 5241 num_bytes -= bytes_to_add; 5242 } 5243 spin_unlock(&dest->lock); 5244 } 5245 if (num_bytes) { 5246 spin_lock(&space_info->lock); 5247 space_info->bytes_may_use -= num_bytes; 5248 trace_btrfs_space_reservation(fs_info, "space_info", 5249 space_info->flags, num_bytes, 0); 5250 spin_unlock(&space_info->lock); 5251 } 5252 } 5253 } 5254 5255 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 5256 struct btrfs_block_rsv *dst, u64 num_bytes) 5257 { 5258 int ret; 5259 5260 ret = block_rsv_use_bytes(src, num_bytes); 5261 if (ret) 5262 return ret; 5263 5264 block_rsv_add_bytes(dst, num_bytes, 1); 5265 return 0; 5266 } 5267 5268 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5269 { 5270 memset(rsv, 0, sizeof(*rsv)); 5271 spin_lock_init(&rsv->lock); 5272 rsv->type = type; 5273 } 5274 5275 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 5276 unsigned short type) 5277 { 5278 struct btrfs_block_rsv *block_rsv; 5279 struct btrfs_fs_info *fs_info = root->fs_info; 5280 5281 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5282 if (!block_rsv) 5283 return NULL; 5284 5285 btrfs_init_block_rsv(block_rsv, type); 5286 block_rsv->space_info = __find_space_info(fs_info, 5287 BTRFS_BLOCK_GROUP_METADATA); 5288 return block_rsv; 5289 } 5290 5291 void btrfs_free_block_rsv(struct btrfs_root *root, 5292 struct btrfs_block_rsv *rsv) 5293 { 5294 if (!rsv) 5295 return; 5296 btrfs_block_rsv_release(root, rsv, (u64)-1); 5297 kfree(rsv); 5298 } 5299 5300 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) 5301 { 5302 kfree(rsv); 5303 } 5304 5305 int btrfs_block_rsv_add(struct btrfs_root *root, 5306 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 5307 enum btrfs_reserve_flush_enum flush) 5308 { 5309 int ret; 5310 5311 if (num_bytes == 0) 5312 return 0; 5313 5314 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5315 if (!ret) { 5316 block_rsv_add_bytes(block_rsv, num_bytes, 1); 5317 return 0; 5318 } 5319 5320 return ret; 5321 } 5322 5323 int btrfs_block_rsv_check(struct btrfs_root *root, 5324 struct btrfs_block_rsv *block_rsv, int min_factor) 5325 { 5326 u64 num_bytes = 0; 5327 int ret = -ENOSPC; 5328 5329 if (!block_rsv) 5330 return 0; 5331 5332 spin_lock(&block_rsv->lock); 5333 num_bytes = div_factor(block_rsv->size, min_factor); 5334 if (block_rsv->reserved >= num_bytes) 5335 ret = 0; 5336 spin_unlock(&block_rsv->lock); 5337 5338 return ret; 5339 } 5340 5341 int btrfs_block_rsv_refill(struct btrfs_root *root, 5342 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5343 enum btrfs_reserve_flush_enum flush) 5344 { 5345 u64 num_bytes = 0; 5346 int ret = -ENOSPC; 5347 5348 if (!block_rsv) 5349 return 0; 5350 5351 spin_lock(&block_rsv->lock); 5352 num_bytes = min_reserved; 5353 if (block_rsv->reserved >= num_bytes) 5354 ret = 0; 5355 else 5356 num_bytes -= block_rsv->reserved; 5357 spin_unlock(&block_rsv->lock); 5358 5359 if (!ret) 5360 return 0; 5361 5362 ret = 
reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5363 if (!ret) { 5364 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5365 return 0; 5366 } 5367 5368 return ret; 5369 } 5370 5371 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 5372 struct btrfs_block_rsv *dst_rsv, 5373 u64 num_bytes) 5374 { 5375 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 5376 } 5377 5378 void btrfs_block_rsv_release(struct btrfs_root *root, 5379 struct btrfs_block_rsv *block_rsv, 5380 u64 num_bytes) 5381 { 5382 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 5383 if (global_rsv == block_rsv || 5384 block_rsv->space_info != global_rsv->space_info) 5385 global_rsv = NULL; 5386 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 5387 num_bytes); 5388 } 5389 5390 /* 5391 * helper to calculate size of global block reservation. 5392 * the desired value is sum of space used by extent tree, 5393 * checksum tree and root tree 5394 */ 5395 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 5396 { 5397 struct btrfs_space_info *sinfo; 5398 u64 num_bytes; 5399 u64 meta_used; 5400 u64 data_used; 5401 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 5402 5403 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 5404 spin_lock(&sinfo->lock); 5405 data_used = sinfo->bytes_used; 5406 spin_unlock(&sinfo->lock); 5407 5408 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5409 spin_lock(&sinfo->lock); 5410 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 5411 data_used = 0; 5412 meta_used = sinfo->bytes_used; 5413 spin_unlock(&sinfo->lock); 5414 5415 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 5416 csum_size * 2; 5417 num_bytes += div_u64(data_used + meta_used, 50); 5418 5419 if (num_bytes * 3 > meta_used) 5420 num_bytes = div_u64(meta_used, 3); 5421 5422 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); 5423 } 5424 5425 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5426 { 5427 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5428 struct btrfs_space_info *sinfo = block_rsv->space_info; 5429 u64 num_bytes; 5430 5431 num_bytes = calc_global_metadata_size(fs_info); 5432 5433 spin_lock(&sinfo->lock); 5434 spin_lock(&block_rsv->lock); 5435 5436 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5437 5438 if (block_rsv->reserved < block_rsv->size) { 5439 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 5440 sinfo->bytes_reserved + sinfo->bytes_readonly + 5441 sinfo->bytes_may_use; 5442 if (sinfo->total_bytes > num_bytes) { 5443 num_bytes = sinfo->total_bytes - num_bytes; 5444 num_bytes = min(num_bytes, 5445 block_rsv->size - block_rsv->reserved); 5446 block_rsv->reserved += num_bytes; 5447 sinfo->bytes_may_use += num_bytes; 5448 trace_btrfs_space_reservation(fs_info, "space_info", 5449 sinfo->flags, num_bytes, 5450 1); 5451 } 5452 } else if (block_rsv->reserved > block_rsv->size) { 5453 num_bytes = block_rsv->reserved - block_rsv->size; 5454 sinfo->bytes_may_use -= num_bytes; 5455 trace_btrfs_space_reservation(fs_info, "space_info", 5456 sinfo->flags, num_bytes, 0); 5457 block_rsv->reserved = block_rsv->size; 5458 } 5459 5460 if (block_rsv->reserved == block_rsv->size) 5461 block_rsv->full = 1; 5462 else 5463 block_rsv->full = 0; 5464 5465 spin_unlock(&block_rsv->lock); 5466 spin_unlock(&sinfo->lock); 5467 } 5468 5469 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5470 { 5471 struct btrfs_space_info *space_info; 5472 5473 space_info = 
__find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5474 fs_info->chunk_block_rsv.space_info = space_info; 5475 5476 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5477 fs_info->global_block_rsv.space_info = space_info; 5478 fs_info->delalloc_block_rsv.space_info = space_info; 5479 fs_info->trans_block_rsv.space_info = space_info; 5480 fs_info->empty_block_rsv.space_info = space_info; 5481 fs_info->delayed_block_rsv.space_info = space_info; 5482 5483 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 5484 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 5485 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5486 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5487 if (fs_info->quota_root) 5488 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5489 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5490 5491 update_global_block_rsv(fs_info); 5492 } 5493 5494 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5495 { 5496 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5497 (u64)-1); 5498 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 5499 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 5500 WARN_ON(fs_info->trans_block_rsv.size > 0); 5501 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5502 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5503 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5504 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5505 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5506 } 5507 5508 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5509 struct btrfs_root *root) 5510 { 5511 if (!trans->block_rsv) 5512 return; 5513 5514 if (!trans->bytes_reserved) 5515 return; 5516 5517 trace_btrfs_space_reservation(root->fs_info, "transaction", 5518 trans->transid, trans->bytes_reserved, 0); 5519 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 5520 trans->bytes_reserved = 0; 5521 } 5522 5523 /* 5524 * To be called after all the new block groups attached to the transaction 5525 * handle have been created (btrfs_create_pending_block_groups()). 5526 */ 5527 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5528 { 5529 struct btrfs_fs_info *fs_info = trans->root->fs_info; 5530 5531 if (!trans->chunk_bytes_reserved) 5532 return; 5533 5534 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5535 5536 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5537 trans->chunk_bytes_reserved); 5538 trans->chunk_bytes_reserved = 0; 5539 } 5540 5541 /* Can only return 0 or -ENOSPC */ 5542 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5543 struct inode *inode) 5544 { 5545 struct btrfs_root *root = BTRFS_I(inode)->root; 5546 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 5547 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 5548 5549 /* 5550 * We need to hold space in order to delete our orphan item once we've 5551 * added it, so this takes the reservation so we can release it later 5552 * when we are truly done with the orphan item. 
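 *
 * Illustratively, the cost is a single metadata item, i.e.
 * btrfs_calc_trans_metadata_size(root, 1) bytes, migrated from the
 * transaction's block_rsv into root->orphan_block_rsv and handed back by
 * btrfs_orphan_release_metadata() once the orphan item is gone.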
5553 */ 5554 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 5555 trace_btrfs_space_reservation(root->fs_info, "orphan", 5556 btrfs_ino(inode), num_bytes, 1); 5557 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 5558 } 5559 5560 void btrfs_orphan_release_metadata(struct inode *inode) 5561 { 5562 struct btrfs_root *root = BTRFS_I(inode)->root; 5563 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 5564 trace_btrfs_space_reservation(root->fs_info, "orphan", 5565 btrfs_ino(inode), num_bytes, 0); 5566 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 5567 } 5568 5569 /* 5570 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5571 * root: the root of the parent directory 5572 * rsv: block reservation 5573 * items: the number of items that we need do reservation 5574 * qgroup_reserved: used to return the reserved size in qgroup 5575 * 5576 * This function is used to reserve the space for snapshot/subvolume 5577 * creation and deletion. Those operations are different with the 5578 * common file/directory operations, they change two fs/file trees 5579 * and root tree, the number of items that the qgroup reserves is 5580 * different with the free space reservation. So we can not use 5581 * the space reservation mechanism in start_transaction(). 5582 */ 5583 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 5584 struct btrfs_block_rsv *rsv, 5585 int items, 5586 u64 *qgroup_reserved, 5587 bool use_global_rsv) 5588 { 5589 u64 num_bytes; 5590 int ret; 5591 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 5592 5593 if (root->fs_info->quota_enabled) { 5594 /* One for parent inode, two for dir entries */ 5595 num_bytes = 3 * root->nodesize; 5596 ret = btrfs_qgroup_reserve_meta(root, num_bytes); 5597 if (ret) 5598 return ret; 5599 } else { 5600 num_bytes = 0; 5601 } 5602 5603 *qgroup_reserved = num_bytes; 5604 5605 num_bytes = btrfs_calc_trans_metadata_size(root, items); 5606 rsv->space_info = __find_space_info(root->fs_info, 5607 BTRFS_BLOCK_GROUP_METADATA); 5608 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5609 BTRFS_RESERVE_FLUSH_ALL); 5610 5611 if (ret == -ENOSPC && use_global_rsv) 5612 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); 5613 5614 if (ret && *qgroup_reserved) 5615 btrfs_qgroup_free_meta(root, *qgroup_reserved); 5616 5617 return ret; 5618 } 5619 5620 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 5621 struct btrfs_block_rsv *rsv, 5622 u64 qgroup_reserved) 5623 { 5624 btrfs_block_rsv_release(root, rsv, (u64)-1); 5625 } 5626 5627 /** 5628 * drop_outstanding_extent - drop an outstanding extent 5629 * @inode: the inode we're dropping the extent for 5630 * @num_bytes: the number of bytes we're releasing. 5631 * 5632 * This is called when we are freeing up an outstanding extent, either called 5633 * after an error or after an extent is written. This will return the number of 5634 * reserved extents that need to be freed. This must be called with 5635 * BTRFS_I(inode)->lock held. 
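 *
 * Worked example (illustrative, assuming the usual 128M BTRFS_MAX_EXTENT_SIZE):
 * releasing num_bytes = 256M accounts for 2 outstanding extents; if that
 * leaves outstanding_extents below reserved_extents, the difference, plus
 * possibly the inode-update slot, is returned for the caller to free.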
5636 */ 5637 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) 5638 { 5639 unsigned drop_inode_space = 0; 5640 unsigned dropped_extents = 0; 5641 unsigned num_extents = 0; 5642 5643 num_extents = (unsigned)div64_u64(num_bytes + 5644 BTRFS_MAX_EXTENT_SIZE - 1, 5645 BTRFS_MAX_EXTENT_SIZE); 5646 ASSERT(num_extents); 5647 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); 5648 BTRFS_I(inode)->outstanding_extents -= num_extents; 5649 5650 if (BTRFS_I(inode)->outstanding_extents == 0 && 5651 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5652 &BTRFS_I(inode)->runtime_flags)) 5653 drop_inode_space = 1; 5654 5655 /* 5656 * If we still have at least as many outstanding extents as reserved 5657 * extents, we need to leave the reserved extents count alone. 5658 */ 5659 if (BTRFS_I(inode)->outstanding_extents >= 5660 BTRFS_I(inode)->reserved_extents) 5661 return drop_inode_space; 5662 5663 dropped_extents = BTRFS_I(inode)->reserved_extents - 5664 BTRFS_I(inode)->outstanding_extents; 5665 BTRFS_I(inode)->reserved_extents -= dropped_extents; 5666 return dropped_extents + drop_inode_space; 5667 } 5668 5669 /** 5670 * calc_csum_metadata_size - return the amount of metadata space that must be 5671 * reserved/freed for the given bytes. 5672 * @inode: the inode we're manipulating 5673 * @num_bytes: the number of bytes in question 5674 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5675 * 5676 * This adjusts the number of csum_bytes in the inode and then returns the 5677 * correct amount of metadata that must either be reserved or freed. We 5678 * calculate how many checksums we can fit into one leaf and then divide the 5679 * number of bytes that will need to be checksummed by this value to figure out 5680 * how many checksums will be required. If we are adding bytes then the number 5681 * may go up and we will return the number of additional bytes that must be 5682 * reserved. If it is going down we will return the number of bytes that must 5683 * be freed. 5684 * 5685 * This must be called with BTRFS_I(inode)->lock held.
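 *
 * Worked example (illustrative, assuming one leaf holds csums for about 4M
 * of data): growing csum_bytes from 1M to 9M takes the leaf estimate from 1
 * to 3, so with @reserve set we return btrfs_calc_trans_metadata_size() for
 * the 2 additional leaves; shrinking csum_bytes returns bytes to free instead.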
5686 */ 5687 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5688 int reserve) 5689 { 5690 struct btrfs_root *root = BTRFS_I(inode)->root; 5691 u64 old_csums, num_csums; 5692 5693 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5694 BTRFS_I(inode)->csum_bytes == 0) 5695 return 0; 5696 5697 old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); 5698 if (reserve) 5699 BTRFS_I(inode)->csum_bytes += num_bytes; 5700 else 5701 BTRFS_I(inode)->csum_bytes -= num_bytes; 5702 num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); 5703 5704 /* No change, no need to reserve more */ 5705 if (old_csums == num_csums) 5706 return 0; 5707 5708 if (reserve) 5709 return btrfs_calc_trans_metadata_size(root, 5710 num_csums - old_csums); 5711 5712 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 5713 } 5714 5715 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5716 { 5717 struct btrfs_root *root = BTRFS_I(inode)->root; 5718 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 5719 u64 to_reserve = 0; 5720 u64 csum_bytes; 5721 unsigned nr_extents = 0; 5722 int extra_reserve = 0; 5723 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5724 int ret = 0; 5725 bool delalloc_lock = true; 5726 u64 to_free = 0; 5727 unsigned dropped; 5728 5729 /* If we are a free space inode we need to not flush since we will be in 5730 * the middle of a transaction commit. We also don't need the delalloc 5731 * mutex since we won't race with anybody. We need this mostly to make 5732 * lockdep shut its filthy mouth. 5733 */ 5734 if (btrfs_is_free_space_inode(inode)) { 5735 flush = BTRFS_RESERVE_NO_FLUSH; 5736 delalloc_lock = false; 5737 } 5738 5739 if (flush != BTRFS_RESERVE_NO_FLUSH && 5740 btrfs_transaction_in_commit(root->fs_info)) 5741 schedule_timeout(1); 5742 5743 if (delalloc_lock) 5744 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5745 5746 num_bytes = ALIGN(num_bytes, root->sectorsize); 5747 5748 spin_lock(&BTRFS_I(inode)->lock); 5749 nr_extents = (unsigned)div64_u64(num_bytes + 5750 BTRFS_MAX_EXTENT_SIZE - 1, 5751 BTRFS_MAX_EXTENT_SIZE); 5752 BTRFS_I(inode)->outstanding_extents += nr_extents; 5753 nr_extents = 0; 5754 5755 if (BTRFS_I(inode)->outstanding_extents > 5756 BTRFS_I(inode)->reserved_extents) 5757 nr_extents = BTRFS_I(inode)->outstanding_extents - 5758 BTRFS_I(inode)->reserved_extents; 5759 5760 /* 5761 * Add an item to reserve for updating the inode when we complete the 5762 * delalloc io. 
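 *
 * E.g. (illustrative): the first delalloc reservation against an inode adds
 * one extra slot here for the eventual inode item update; once
 * BTRFS_INODE_DELALLOC_META_RESERVED is set, later reservations skip it and
 * drop_outstanding_extent() hands the slot back when the last extent is gone.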
5763 */ 5764 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5765 &BTRFS_I(inode)->runtime_flags)) { 5766 nr_extents++; 5767 extra_reserve = 1; 5768 } 5769 5770 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 5771 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5772 csum_bytes = BTRFS_I(inode)->csum_bytes; 5773 spin_unlock(&BTRFS_I(inode)->lock); 5774 5775 if (root->fs_info->quota_enabled) { 5776 ret = btrfs_qgroup_reserve_meta(root, 5777 nr_extents * root->nodesize); 5778 if (ret) 5779 goto out_fail; 5780 } 5781 5782 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5783 if (unlikely(ret)) { 5784 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); 5785 goto out_fail; 5786 } 5787 5788 spin_lock(&BTRFS_I(inode)->lock); 5789 if (extra_reserve) { 5790 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5791 &BTRFS_I(inode)->runtime_flags); 5792 nr_extents--; 5793 } 5794 BTRFS_I(inode)->reserved_extents += nr_extents; 5795 spin_unlock(&BTRFS_I(inode)->lock); 5796 5797 if (delalloc_lock) 5798 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5799 5800 if (to_reserve) 5801 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5802 btrfs_ino(inode), to_reserve, 1); 5803 block_rsv_add_bytes(block_rsv, to_reserve, 1); 5804 5805 return 0; 5806 5807 out_fail: 5808 spin_lock(&BTRFS_I(inode)->lock); 5809 dropped = drop_outstanding_extent(inode, num_bytes); 5810 /* 5811 * If the inodes csum_bytes is the same as the original 5812 * csum_bytes then we know we haven't raced with any free()ers 5813 * so we can just reduce our inodes csum bytes and carry on. 5814 */ 5815 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 5816 calc_csum_metadata_size(inode, num_bytes, 0); 5817 } else { 5818 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 5819 u64 bytes; 5820 5821 /* 5822 * This is tricky, but first we need to figure out how much we 5823 * freed from any free-ers that occurred during this 5824 * reservation, so we reset ->csum_bytes to the csum_bytes 5825 * before we dropped our lock, and then call the free for the 5826 * number of bytes that were freed while we were trying our 5827 * reservation. 5828 */ 5829 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 5830 BTRFS_I(inode)->csum_bytes = csum_bytes; 5831 to_free = calc_csum_metadata_size(inode, bytes, 0); 5832 5833 5834 /* 5835 * Now we need to see how much we would have freed had we not 5836 * been making this reservation and our ->csum_bytes were not 5837 * artificially inflated. 5838 */ 5839 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 5840 bytes = csum_bytes - orig_csum_bytes; 5841 bytes = calc_csum_metadata_size(inode, bytes, 0); 5842 5843 /* 5844 * Now reset ->csum_bytes to what it should be. If bytes is 5845 * more than to_free then we would have freed more space had we 5846 * not had an artificially high ->csum_bytes, so we need to free 5847 * the remainder. If bytes is the same or less then we don't 5848 * need to do anything, the other free-ers did the correct 5849 * thing. 
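 *
 * (Put differently: the racing free()ers computed their refunds against a
 * csum_bytes that still contained our num_bytes. Here we redo that refund
 * as if our bytes had never been added; if the honest number is larger
 * than what was actually given back, we release the shortfall ourselves,
 * otherwise there is nothing left to do.)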
5850 */ 5851 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 5852 if (bytes > to_free) 5853 to_free = bytes - to_free; 5854 else 5855 to_free = 0; 5856 } 5857 spin_unlock(&BTRFS_I(inode)->lock); 5858 if (dropped) 5859 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5860 5861 if (to_free) { 5862 btrfs_block_rsv_release(root, block_rsv, to_free); 5863 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5864 btrfs_ino(inode), to_free, 0); 5865 } 5866 if (delalloc_lock) 5867 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5868 return ret; 5869 } 5870 5871 /** 5872 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5873 * @inode: the inode to release the reservation for 5874 * @num_bytes: the number of bytes we're releasing 5875 * 5876 * This will release the metadata reservation for an inode. This can be called 5877 * once we complete IO for a given set of bytes to release their metadata 5878 * reservations. 5879 */ 5880 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 5881 { 5882 struct btrfs_root *root = BTRFS_I(inode)->root; 5883 u64 to_free = 0; 5884 unsigned dropped; 5885 5886 num_bytes = ALIGN(num_bytes, root->sectorsize); 5887 spin_lock(&BTRFS_I(inode)->lock); 5888 dropped = drop_outstanding_extent(inode, num_bytes); 5889 5890 if (num_bytes) 5891 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5892 spin_unlock(&BTRFS_I(inode)->lock); 5893 if (dropped > 0) 5894 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5895 5896 if (btrfs_test_is_dummy_root(root)) 5897 return; 5898 5899 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5900 btrfs_ino(inode), to_free, 0); 5901 5902 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5903 to_free); 5904 } 5905 5906 /** 5907 * btrfs_delalloc_reserve_space - reserve data and metadata space for 5908 * delalloc 5909 * @inode: inode we're writing to 5910 * @start: start range we are writing to 5911 * @len: how long the range we are writing to 5912 * 5913 * TODO: This function will finally replace old btrfs_delalloc_reserve_space() 5914 * 5915 * This will do the following things 5916 * 5917 * o reserve space in data space info for num bytes 5918 * and reserve precious corresponding qgroup space 5919 * (Done in check_data_free_space) 5920 * 5921 * o reserve space for metadata space, based on the number of outstanding 5922 * extents and how much csums will be needed 5923 * also reserve metadata space in a per root over-reserve method. 5924 * o add to the inodes->delalloc_bytes 5925 * o add it to the fs_info's delalloc inodes list. 5926 * (Above 3 all done in delalloc_reserve_metadata) 5927 * 5928 * Return 0 for success 5929 * Return <0 for error(-ENOSPC or -EQUOT) 5930 */ 5931 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) 5932 { 5933 int ret; 5934 5935 ret = btrfs_check_data_free_space(inode, start, len); 5936 if (ret < 0) 5937 return ret; 5938 ret = btrfs_delalloc_reserve_metadata(inode, len); 5939 if (ret < 0) 5940 btrfs_free_reserved_data_space(inode, start, len); 5941 return ret; 5942 } 5943 5944 /** 5945 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5946 * @inode: inode we're releasing space for 5947 * @start: start position of the space already reserved 5948 * @len: the len of the space already reserved 5949 * 5950 * This must be matched with a call to btrfs_delalloc_reserve_space. 
This is 5951 * called in the case that we don't need the metadata AND data reservations 5952 * anymore. So if there is an error or we insert an inline extent. 5953 * 5954 * This function will release the metadata space that was not used and will 5955 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5956 * list if there are no delalloc bytes left. 5957 * Also it will handle the qgroup reserved space. 5958 */ 5959 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) 5960 { 5961 btrfs_delalloc_release_metadata(inode, len); 5962 btrfs_free_reserved_data_space(inode, start, len); 5963 } 5964 5965 static int update_block_group(struct btrfs_trans_handle *trans, 5966 struct btrfs_root *root, u64 bytenr, 5967 u64 num_bytes, int alloc) 5968 { 5969 struct btrfs_block_group_cache *cache = NULL; 5970 struct btrfs_fs_info *info = root->fs_info; 5971 u64 total = num_bytes; 5972 u64 old_val; 5973 u64 byte_in_group; 5974 int factor; 5975 5976 /* block accounting for super block */ 5977 spin_lock(&info->delalloc_root_lock); 5978 old_val = btrfs_super_bytes_used(info->super_copy); 5979 if (alloc) 5980 old_val += num_bytes; 5981 else 5982 old_val -= num_bytes; 5983 btrfs_set_super_bytes_used(info->super_copy, old_val); 5984 spin_unlock(&info->delalloc_root_lock); 5985 5986 while (total) { 5987 cache = btrfs_lookup_block_group(info, bytenr); 5988 if (!cache) 5989 return -ENOENT; 5990 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5991 BTRFS_BLOCK_GROUP_RAID1 | 5992 BTRFS_BLOCK_GROUP_RAID10)) 5993 factor = 2; 5994 else 5995 factor = 1; 5996 /* 5997 * If this block group has free space cache written out, we 5998 * need to make sure to load it if we are removing space. This 5999 * is because we need the unpinning stage to actually add the 6000 * space back to the block group, otherwise we will leak space. 
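 *
 * Note also the 'factor' computed above: for DUP/RAID1/RAID10 every
 * logical byte occupies two raw bytes, which is why disk_used below is
 * adjusted by num_bytes * factor while bytes_used tracks logical bytes.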
6001 */ 6002 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6003 cache_block_group(cache, 1); 6004 6005 byte_in_group = bytenr - cache->key.objectid; 6006 WARN_ON(byte_in_group > cache->key.offset); 6007 6008 spin_lock(&cache->space_info->lock); 6009 spin_lock(&cache->lock); 6010 6011 if (btrfs_test_opt(root, SPACE_CACHE) && 6012 cache->disk_cache_state < BTRFS_DC_CLEAR) 6013 cache->disk_cache_state = BTRFS_DC_CLEAR; 6014 6015 old_val = btrfs_block_group_used(&cache->item); 6016 num_bytes = min(total, cache->key.offset - byte_in_group); 6017 if (alloc) { 6018 old_val += num_bytes; 6019 btrfs_set_block_group_used(&cache->item, old_val); 6020 cache->reserved -= num_bytes; 6021 cache->space_info->bytes_reserved -= num_bytes; 6022 cache->space_info->bytes_used += num_bytes; 6023 cache->space_info->disk_used += num_bytes * factor; 6024 spin_unlock(&cache->lock); 6025 spin_unlock(&cache->space_info->lock); 6026 } else { 6027 old_val -= num_bytes; 6028 btrfs_set_block_group_used(&cache->item, old_val); 6029 cache->pinned += num_bytes; 6030 cache->space_info->bytes_pinned += num_bytes; 6031 cache->space_info->bytes_used -= num_bytes; 6032 cache->space_info->disk_used -= num_bytes * factor; 6033 spin_unlock(&cache->lock); 6034 spin_unlock(&cache->space_info->lock); 6035 6036 set_extent_dirty(info->pinned_extents, 6037 bytenr, bytenr + num_bytes - 1, 6038 GFP_NOFS | __GFP_NOFAIL); 6039 } 6040 6041 spin_lock(&trans->transaction->dirty_bgs_lock); 6042 if (list_empty(&cache->dirty_list)) { 6043 list_add_tail(&cache->dirty_list, 6044 &trans->transaction->dirty_bgs); 6045 trans->transaction->num_dirty_bgs++; 6046 btrfs_get_block_group(cache); 6047 } 6048 spin_unlock(&trans->transaction->dirty_bgs_lock); 6049 6050 /* 6051 * No longer have used bytes in this block group, queue it for 6052 * deletion. We do this after adding the block group to the 6053 * dirty list to avoid races between cleaner kthread and space 6054 * cache writeout. 
6055 */ 6056 if (!alloc && old_val == 0) { 6057 spin_lock(&info->unused_bgs_lock); 6058 if (list_empty(&cache->bg_list)) { 6059 btrfs_get_block_group(cache); 6060 list_add_tail(&cache->bg_list, 6061 &info->unused_bgs); 6062 } 6063 spin_unlock(&info->unused_bgs_lock); 6064 } 6065 6066 btrfs_put_block_group(cache); 6067 total -= num_bytes; 6068 bytenr += num_bytes; 6069 } 6070 return 0; 6071 } 6072 6073 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 6074 { 6075 struct btrfs_block_group_cache *cache; 6076 u64 bytenr; 6077 6078 spin_lock(&root->fs_info->block_group_cache_lock); 6079 bytenr = root->fs_info->first_logical_byte; 6080 spin_unlock(&root->fs_info->block_group_cache_lock); 6081 6082 if (bytenr < (u64)-1) 6083 return bytenr; 6084 6085 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 6086 if (!cache) 6087 return 0; 6088 6089 bytenr = cache->key.objectid; 6090 btrfs_put_block_group(cache); 6091 6092 return bytenr; 6093 } 6094 6095 static int pin_down_extent(struct btrfs_root *root, 6096 struct btrfs_block_group_cache *cache, 6097 u64 bytenr, u64 num_bytes, int reserved) 6098 { 6099 spin_lock(&cache->space_info->lock); 6100 spin_lock(&cache->lock); 6101 cache->pinned += num_bytes; 6102 cache->space_info->bytes_pinned += num_bytes; 6103 if (reserved) { 6104 cache->reserved -= num_bytes; 6105 cache->space_info->bytes_reserved -= num_bytes; 6106 } 6107 spin_unlock(&cache->lock); 6108 spin_unlock(&cache->space_info->lock); 6109 6110 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 6111 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6112 if (reserved) 6113 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); 6114 return 0; 6115 } 6116 6117 /* 6118 * this function must be called within transaction 6119 */ 6120 int btrfs_pin_extent(struct btrfs_root *root, 6121 u64 bytenr, u64 num_bytes, int reserved) 6122 { 6123 struct btrfs_block_group_cache *cache; 6124 6125 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 6126 BUG_ON(!cache); /* Logic error */ 6127 6128 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 6129 6130 btrfs_put_block_group(cache); 6131 return 0; 6132 } 6133 6134 /* 6135 * this function must be called within transaction 6136 */ 6137 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 6138 u64 bytenr, u64 num_bytes) 6139 { 6140 struct btrfs_block_group_cache *cache; 6141 int ret; 6142 6143 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 6144 if (!cache) 6145 return -EINVAL; 6146 6147 /* 6148 * pull in the free space cache (if any) so that our pin 6149 * removes the free space from the cache. We have load_only set 6150 * to one because the slow code to read in the free extents does check 6151 * the pinned extents. 
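 * (The 1 here is cache_block_group()'s load_cache_only argument, i.e.
 * prefer the fast free space cache load over waiting for a full caching
 * pass.)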
6152 */ 6153 cache_block_group(cache, 1); 6154 6155 pin_down_extent(root, cache, bytenr, num_bytes, 0); 6156 6157 /* remove us from the free space cache (if we're there at all) */ 6158 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6159 btrfs_put_block_group(cache); 6160 return ret; 6161 } 6162 6163 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 6164 { 6165 int ret; 6166 struct btrfs_block_group_cache *block_group; 6167 struct btrfs_caching_control *caching_ctl; 6168 6169 block_group = btrfs_lookup_block_group(root->fs_info, start); 6170 if (!block_group) 6171 return -EINVAL; 6172 6173 cache_block_group(block_group, 0); 6174 caching_ctl = get_caching_control(block_group); 6175 6176 if (!caching_ctl) { 6177 /* Logic error */ 6178 BUG_ON(!block_group_cache_done(block_group)); 6179 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6180 } else { 6181 mutex_lock(&caching_ctl->mutex); 6182 6183 if (start >= caching_ctl->progress) { 6184 ret = add_excluded_extent(root, start, num_bytes); 6185 } else if (start + num_bytes <= caching_ctl->progress) { 6186 ret = btrfs_remove_free_space(block_group, 6187 start, num_bytes); 6188 } else { 6189 num_bytes = caching_ctl->progress - start; 6190 ret = btrfs_remove_free_space(block_group, 6191 start, num_bytes); 6192 if (ret) 6193 goto out_lock; 6194 6195 num_bytes = (start + num_bytes) - 6196 caching_ctl->progress; 6197 start = caching_ctl->progress; 6198 ret = add_excluded_extent(root, start, num_bytes); 6199 } 6200 out_lock: 6201 mutex_unlock(&caching_ctl->mutex); 6202 put_caching_control(caching_ctl); 6203 } 6204 btrfs_put_block_group(block_group); 6205 return ret; 6206 } 6207 6208 int btrfs_exclude_logged_extents(struct btrfs_root *log, 6209 struct extent_buffer *eb) 6210 { 6211 struct btrfs_file_extent_item *item; 6212 struct btrfs_key key; 6213 int found_type; 6214 int i; 6215 6216 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 6217 return 0; 6218 6219 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6220 btrfs_item_key_to_cpu(eb, &key, i); 6221 if (key.type != BTRFS_EXTENT_DATA_KEY) 6222 continue; 6223 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6224 found_type = btrfs_file_extent_type(eb, item); 6225 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6226 continue; 6227 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6228 continue; 6229 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6230 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6231 __exclude_logged_extent(log, key.objectid, key.offset); 6232 } 6233 6234 return 0; 6235 } 6236 6237 static void 6238 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6239 { 6240 atomic_inc(&bg->reservations); 6241 } 6242 6243 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6244 const u64 start) 6245 { 6246 struct btrfs_block_group_cache *bg; 6247 6248 bg = btrfs_lookup_block_group(fs_info, start); 6249 ASSERT(bg); 6250 if (atomic_dec_and_test(&bg->reservations)) 6251 wake_up_atomic_t(&bg->reservations); 6252 btrfs_put_block_group(bg); 6253 } 6254 6255 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) 6256 { 6257 schedule(); 6258 return 0; 6259 } 6260 6261 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6262 { 6263 struct btrfs_space_info *space_info = bg->space_info; 6264 6265 ASSERT(bg->ro); 6266 6267 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6268 return; 6269 6270 /* 6271 * Our block group is read only but before we set it to read 
only,
6272 * some task might have allocated an extent from it already, but it
6273 * has not yet created a respective ordered extent (and added it to a
6274 * root's list of ordered extents).
6275 * Therefore wait for any task currently allocating extents, since the
6276 * block group's reservations counter is incremented while a read lock
6277 * on the groups' semaphore is held and decremented after releasing
6278 * the read access on that semaphore and creating the ordered extent.
6279 */
6280 down_write(&space_info->groups_sem);
6281 up_write(&space_info->groups_sem);
6282
6283 wait_on_atomic_t(&bg->reservations,
6284 btrfs_wait_bg_reservations_atomic_t,
6285 TASK_UNINTERRUPTIBLE);
6286 }
6287
6288 /**
6289 * btrfs_update_reserved_bytes - update the block_group and space info counters
6290 * @cache: The cache we are manipulating
6291 * @num_bytes: The number of bytes in question
6292 * @reserve: One of the reservation enums
6293 * @delalloc: The blocks are allocated for the delalloc write
6294 *
6295 * This is called by the allocator when it reserves space, or by somebody who is
6296 * freeing space that was never actually used on disk. For example if you
6297 * reserve some space for a new leaf in transaction A and before transaction A
6298 * commits you free that leaf, you call this with reserve set to 0 in order to
6299 * clear the reservation.
6300 *
6301 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
6302 * ENOSPC accounting. For data we handle the reservation through clearing the
6303 * delalloc bits in the io_tree. We have to do this since we could end up
6304 * allocating less disk space for the amount of data we have reserved in the
6305 * case of compression.
6306 *
6307 * If this is a reservation and the block group has become read only we cannot
6308 * make the reservation and return -EAGAIN, otherwise this function always
6309 * succeeds.
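 *
 * (In this file the split is: metadata allocations in find_free_extent()
 * pass RESERVE_ALLOC, so bytes_may_use shrinks as bytes_reserved grows,
 * while data allocations pass RESERVE_ALLOC_NO_ACCOUNT and leave
 * bytes_may_use to be dropped when the delalloc bits are cleared.)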
6310 */ 6311 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 6312 u64 num_bytes, int reserve, int delalloc) 6313 { 6314 struct btrfs_space_info *space_info = cache->space_info; 6315 int ret = 0; 6316 6317 spin_lock(&space_info->lock); 6318 spin_lock(&cache->lock); 6319 if (reserve != RESERVE_FREE) { 6320 if (cache->ro) { 6321 ret = -EAGAIN; 6322 } else { 6323 cache->reserved += num_bytes; 6324 space_info->bytes_reserved += num_bytes; 6325 if (reserve == RESERVE_ALLOC) { 6326 trace_btrfs_space_reservation(cache->fs_info, 6327 "space_info", space_info->flags, 6328 num_bytes, 0); 6329 space_info->bytes_may_use -= num_bytes; 6330 } 6331 6332 if (delalloc) 6333 cache->delalloc_bytes += num_bytes; 6334 } 6335 } else { 6336 if (cache->ro) 6337 space_info->bytes_readonly += num_bytes; 6338 cache->reserved -= num_bytes; 6339 space_info->bytes_reserved -= num_bytes; 6340 6341 if (delalloc) 6342 cache->delalloc_bytes -= num_bytes; 6343 } 6344 spin_unlock(&cache->lock); 6345 spin_unlock(&space_info->lock); 6346 return ret; 6347 } 6348 6349 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 6350 struct btrfs_root *root) 6351 { 6352 struct btrfs_fs_info *fs_info = root->fs_info; 6353 struct btrfs_caching_control *next; 6354 struct btrfs_caching_control *caching_ctl; 6355 struct btrfs_block_group_cache *cache; 6356 6357 down_write(&fs_info->commit_root_sem); 6358 6359 list_for_each_entry_safe(caching_ctl, next, 6360 &fs_info->caching_block_groups, list) { 6361 cache = caching_ctl->block_group; 6362 if (block_group_cache_done(cache)) { 6363 cache->last_byte_to_unpin = (u64)-1; 6364 list_del_init(&caching_ctl->list); 6365 put_caching_control(caching_ctl); 6366 } else { 6367 cache->last_byte_to_unpin = caching_ctl->progress; 6368 } 6369 } 6370 6371 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6372 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6373 else 6374 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6375 6376 up_write(&fs_info->commit_root_sem); 6377 6378 update_global_block_rsv(fs_info); 6379 } 6380 6381 /* 6382 * Returns the free cluster for the given space info and sets empty_cluster to 6383 * what it should be based on the mount options. 
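 *
 * Concretely, from the code below: mixed block groups get no cluster at
 * all, ssd gives a 2M empty_cluster, metadata uses the global
 * meta_alloc_cluster (with a 64K empty_cluster when not on ssd), and data
 * only gets the data_alloc_cluster when the ssd option is set.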
6384 */ 6385 static struct btrfs_free_cluster * 6386 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, 6387 u64 *empty_cluster) 6388 { 6389 struct btrfs_free_cluster *ret = NULL; 6390 bool ssd = btrfs_test_opt(root, SSD); 6391 6392 *empty_cluster = 0; 6393 if (btrfs_mixed_space_info(space_info)) 6394 return ret; 6395 6396 if (ssd) 6397 *empty_cluster = SZ_2M; 6398 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6399 ret = &root->fs_info->meta_alloc_cluster; 6400 if (!ssd) 6401 *empty_cluster = SZ_64K; 6402 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { 6403 ret = &root->fs_info->data_alloc_cluster; 6404 } 6405 6406 return ret; 6407 } 6408 6409 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, 6410 const bool return_free_space) 6411 { 6412 struct btrfs_fs_info *fs_info = root->fs_info; 6413 struct btrfs_block_group_cache *cache = NULL; 6414 struct btrfs_space_info *space_info; 6415 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6416 struct btrfs_free_cluster *cluster = NULL; 6417 u64 len; 6418 u64 total_unpinned = 0; 6419 u64 empty_cluster = 0; 6420 bool readonly; 6421 6422 while (start <= end) { 6423 readonly = false; 6424 if (!cache || 6425 start >= cache->key.objectid + cache->key.offset) { 6426 if (cache) 6427 btrfs_put_block_group(cache); 6428 total_unpinned = 0; 6429 cache = btrfs_lookup_block_group(fs_info, start); 6430 BUG_ON(!cache); /* Logic error */ 6431 6432 cluster = fetch_cluster_info(root, 6433 cache->space_info, 6434 &empty_cluster); 6435 empty_cluster <<= 1; 6436 } 6437 6438 len = cache->key.objectid + cache->key.offset - start; 6439 len = min(len, end + 1 - start); 6440 6441 if (start < cache->last_byte_to_unpin) { 6442 len = min(len, cache->last_byte_to_unpin - start); 6443 if (return_free_space) 6444 btrfs_add_free_space(cache, start, len); 6445 } 6446 6447 start += len; 6448 total_unpinned += len; 6449 space_info = cache->space_info; 6450 6451 /* 6452 * If this space cluster has been marked as fragmented and we've 6453 * unpinned enough in this block group to potentially allow a 6454 * cluster to be created inside of it go ahead and clear the 6455 * fragmented check. 
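 * (empty_cluster was doubled when we switched to this block group above,
 * so the flag is only cleared once more than twice the normal cluster
 * size has been unpinned here.)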
6456 */ 6457 if (cluster && cluster->fragmented && 6458 total_unpinned > empty_cluster) { 6459 spin_lock(&cluster->lock); 6460 cluster->fragmented = 0; 6461 spin_unlock(&cluster->lock); 6462 } 6463 6464 spin_lock(&space_info->lock); 6465 spin_lock(&cache->lock); 6466 cache->pinned -= len; 6467 space_info->bytes_pinned -= len; 6468 space_info->max_extent_size = 0; 6469 percpu_counter_add(&space_info->total_bytes_pinned, -len); 6470 if (cache->ro) { 6471 space_info->bytes_readonly += len; 6472 readonly = true; 6473 } 6474 spin_unlock(&cache->lock); 6475 if (!readonly && global_rsv->space_info == space_info) { 6476 spin_lock(&global_rsv->lock); 6477 if (!global_rsv->full) { 6478 len = min(len, global_rsv->size - 6479 global_rsv->reserved); 6480 global_rsv->reserved += len; 6481 space_info->bytes_may_use += len; 6482 if (global_rsv->reserved >= global_rsv->size) 6483 global_rsv->full = 1; 6484 } 6485 spin_unlock(&global_rsv->lock); 6486 } 6487 spin_unlock(&space_info->lock); 6488 } 6489 6490 if (cache) 6491 btrfs_put_block_group(cache); 6492 return 0; 6493 } 6494 6495 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 6496 struct btrfs_root *root) 6497 { 6498 struct btrfs_fs_info *fs_info = root->fs_info; 6499 struct btrfs_block_group_cache *block_group, *tmp; 6500 struct list_head *deleted_bgs; 6501 struct extent_io_tree *unpin; 6502 u64 start; 6503 u64 end; 6504 int ret; 6505 6506 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6507 unpin = &fs_info->freed_extents[1]; 6508 else 6509 unpin = &fs_info->freed_extents[0]; 6510 6511 while (!trans->aborted) { 6512 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6513 ret = find_first_extent_bit(unpin, 0, &start, &end, 6514 EXTENT_DIRTY, NULL); 6515 if (ret) { 6516 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6517 break; 6518 } 6519 6520 if (btrfs_test_opt(root, DISCARD)) 6521 ret = btrfs_discard_extent(root, start, 6522 end + 1 - start, NULL); 6523 6524 clear_extent_dirty(unpin, start, end); 6525 unpin_extent_range(root, start, end, true); 6526 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6527 cond_resched(); 6528 } 6529 6530 /* 6531 * Transaction is finished. We don't need the lock anymore. We 6532 * do need to clean up the block groups in case of a transaction 6533 * abort. 
6534 */ 6535 deleted_bgs = &trans->transaction->deleted_bgs; 6536 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6537 u64 trimmed = 0; 6538 6539 ret = -EROFS; 6540 if (!trans->aborted) 6541 ret = btrfs_discard_extent(root, 6542 block_group->key.objectid, 6543 block_group->key.offset, 6544 &trimmed); 6545 6546 list_del_init(&block_group->bg_list); 6547 btrfs_put_block_group_trimming(block_group); 6548 btrfs_put_block_group(block_group); 6549 6550 if (ret) { 6551 const char *errstr = btrfs_decode_error(ret); 6552 btrfs_warn(fs_info, 6553 "Discard failed while removing blockgroup: errno=%d %s\n", 6554 ret, errstr); 6555 } 6556 } 6557 6558 return 0; 6559 } 6560 6561 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 6562 u64 owner, u64 root_objectid) 6563 { 6564 struct btrfs_space_info *space_info; 6565 u64 flags; 6566 6567 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6568 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 6569 flags = BTRFS_BLOCK_GROUP_SYSTEM; 6570 else 6571 flags = BTRFS_BLOCK_GROUP_METADATA; 6572 } else { 6573 flags = BTRFS_BLOCK_GROUP_DATA; 6574 } 6575 6576 space_info = __find_space_info(fs_info, flags); 6577 BUG_ON(!space_info); /* Logic bug */ 6578 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 6579 } 6580 6581 6582 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6583 struct btrfs_root *root, 6584 struct btrfs_delayed_ref_node *node, u64 parent, 6585 u64 root_objectid, u64 owner_objectid, 6586 u64 owner_offset, int refs_to_drop, 6587 struct btrfs_delayed_extent_op *extent_op) 6588 { 6589 struct btrfs_key key; 6590 struct btrfs_path *path; 6591 struct btrfs_fs_info *info = root->fs_info; 6592 struct btrfs_root *extent_root = info->extent_root; 6593 struct extent_buffer *leaf; 6594 struct btrfs_extent_item *ei; 6595 struct btrfs_extent_inline_ref *iref; 6596 int ret; 6597 int is_data; 6598 int extent_slot = 0; 6599 int found_extent = 0; 6600 int num_to_del = 1; 6601 u32 item_size; 6602 u64 refs; 6603 u64 bytenr = node->bytenr; 6604 u64 num_bytes = node->num_bytes; 6605 int last_ref = 0; 6606 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6607 SKINNY_METADATA); 6608 6609 path = btrfs_alloc_path(); 6610 if (!path) 6611 return -ENOMEM; 6612 6613 path->reada = READA_FORWARD; 6614 path->leave_spinning = 1; 6615 6616 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 6617 BUG_ON(!is_data && refs_to_drop != 1); 6618 6619 if (is_data) 6620 skinny_metadata = 0; 6621 6622 ret = lookup_extent_backref(trans, extent_root, path, &iref, 6623 bytenr, num_bytes, parent, 6624 root_objectid, owner_objectid, 6625 owner_offset); 6626 if (ret == 0) { 6627 extent_slot = path->slots[0]; 6628 while (extent_slot >= 0) { 6629 btrfs_item_key_to_cpu(path->nodes[0], &key, 6630 extent_slot); 6631 if (key.objectid != bytenr) 6632 break; 6633 if (key.type == BTRFS_EXTENT_ITEM_KEY && 6634 key.offset == num_bytes) { 6635 found_extent = 1; 6636 break; 6637 } 6638 if (key.type == BTRFS_METADATA_ITEM_KEY && 6639 key.offset == owner_objectid) { 6640 found_extent = 1; 6641 break; 6642 } 6643 if (path->slots[0] - extent_slot > 5) 6644 break; 6645 extent_slot--; 6646 } 6647 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6648 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 6649 if (found_extent && item_size < sizeof(*ei)) 6650 found_extent = 0; 6651 #endif 6652 if (!found_extent) { 6653 BUG_ON(iref); 6654 ret = remove_extent_backref(trans, extent_root, path, 6655 NULL, refs_to_drop, 6656 is_data, &last_ref); 6657 if (ret) { 6658 
btrfs_abort_transaction(trans, extent_root, ret); 6659 goto out; 6660 } 6661 btrfs_release_path(path); 6662 path->leave_spinning = 1; 6663 6664 key.objectid = bytenr; 6665 key.type = BTRFS_EXTENT_ITEM_KEY; 6666 key.offset = num_bytes; 6667 6668 if (!is_data && skinny_metadata) { 6669 key.type = BTRFS_METADATA_ITEM_KEY; 6670 key.offset = owner_objectid; 6671 } 6672 6673 ret = btrfs_search_slot(trans, extent_root, 6674 &key, path, -1, 1); 6675 if (ret > 0 && skinny_metadata && path->slots[0]) { 6676 /* 6677 * Couldn't find our skinny metadata item, 6678 * see if we have ye olde extent item. 6679 */ 6680 path->slots[0]--; 6681 btrfs_item_key_to_cpu(path->nodes[0], &key, 6682 path->slots[0]); 6683 if (key.objectid == bytenr && 6684 key.type == BTRFS_EXTENT_ITEM_KEY && 6685 key.offset == num_bytes) 6686 ret = 0; 6687 } 6688 6689 if (ret > 0 && skinny_metadata) { 6690 skinny_metadata = false; 6691 key.objectid = bytenr; 6692 key.type = BTRFS_EXTENT_ITEM_KEY; 6693 key.offset = num_bytes; 6694 btrfs_release_path(path); 6695 ret = btrfs_search_slot(trans, extent_root, 6696 &key, path, -1, 1); 6697 } 6698 6699 if (ret) { 6700 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 6701 ret, bytenr); 6702 if (ret > 0) 6703 btrfs_print_leaf(extent_root, 6704 path->nodes[0]); 6705 } 6706 if (ret < 0) { 6707 btrfs_abort_transaction(trans, extent_root, ret); 6708 goto out; 6709 } 6710 extent_slot = path->slots[0]; 6711 } 6712 } else if (WARN_ON(ret == -ENOENT)) { 6713 btrfs_print_leaf(extent_root, path->nodes[0]); 6714 btrfs_err(info, 6715 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 6716 bytenr, parent, root_objectid, owner_objectid, 6717 owner_offset); 6718 btrfs_abort_transaction(trans, extent_root, ret); 6719 goto out; 6720 } else { 6721 btrfs_abort_transaction(trans, extent_root, ret); 6722 goto out; 6723 } 6724 6725 leaf = path->nodes[0]; 6726 item_size = btrfs_item_size_nr(leaf, extent_slot); 6727 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6728 if (item_size < sizeof(*ei)) { 6729 BUG_ON(found_extent || extent_slot != path->slots[0]); 6730 ret = convert_extent_item_v0(trans, extent_root, path, 6731 owner_objectid, 0); 6732 if (ret < 0) { 6733 btrfs_abort_transaction(trans, extent_root, ret); 6734 goto out; 6735 } 6736 6737 btrfs_release_path(path); 6738 path->leave_spinning = 1; 6739 6740 key.objectid = bytenr; 6741 key.type = BTRFS_EXTENT_ITEM_KEY; 6742 key.offset = num_bytes; 6743 6744 ret = btrfs_search_slot(trans, extent_root, &key, path, 6745 -1, 1); 6746 if (ret) { 6747 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 6748 ret, bytenr); 6749 btrfs_print_leaf(extent_root, path->nodes[0]); 6750 } 6751 if (ret < 0) { 6752 btrfs_abort_transaction(trans, extent_root, ret); 6753 goto out; 6754 } 6755 6756 extent_slot = path->slots[0]; 6757 leaf = path->nodes[0]; 6758 item_size = btrfs_item_size_nr(leaf, extent_slot); 6759 } 6760 #endif 6761 BUG_ON(item_size < sizeof(*ei)); 6762 ei = btrfs_item_ptr(leaf, extent_slot, 6763 struct btrfs_extent_item); 6764 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6765 key.type == BTRFS_EXTENT_ITEM_KEY) { 6766 struct btrfs_tree_block_info *bi; 6767 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6768 bi = (struct btrfs_tree_block_info *)(ei + 1); 6769 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6770 } 6771 6772 refs = btrfs_extent_refs(leaf, ei); 6773 if (refs < refs_to_drop) { 6774 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 6775 "for bytenr %Lu", refs_to_drop, 
refs, bytenr); 6776 ret = -EINVAL; 6777 btrfs_abort_transaction(trans, extent_root, ret); 6778 goto out; 6779 } 6780 refs -= refs_to_drop; 6781 6782 if (refs > 0) { 6783 if (extent_op) 6784 __run_delayed_extent_op(extent_op, leaf, ei); 6785 /* 6786 * In the case of inline back ref, reference count will 6787 * be updated by remove_extent_backref 6788 */ 6789 if (iref) { 6790 BUG_ON(!found_extent); 6791 } else { 6792 btrfs_set_extent_refs(leaf, ei, refs); 6793 btrfs_mark_buffer_dirty(leaf); 6794 } 6795 if (found_extent) { 6796 ret = remove_extent_backref(trans, extent_root, path, 6797 iref, refs_to_drop, 6798 is_data, &last_ref); 6799 if (ret) { 6800 btrfs_abort_transaction(trans, extent_root, ret); 6801 goto out; 6802 } 6803 } 6804 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 6805 root_objectid); 6806 } else { 6807 if (found_extent) { 6808 BUG_ON(is_data && refs_to_drop != 6809 extent_data_ref_count(path, iref)); 6810 if (iref) { 6811 BUG_ON(path->slots[0] != extent_slot); 6812 } else { 6813 BUG_ON(path->slots[0] != extent_slot + 1); 6814 path->slots[0] = extent_slot; 6815 num_to_del = 2; 6816 } 6817 } 6818 6819 last_ref = 1; 6820 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6821 num_to_del); 6822 if (ret) { 6823 btrfs_abort_transaction(trans, extent_root, ret); 6824 goto out; 6825 } 6826 btrfs_release_path(path); 6827 6828 if (is_data) { 6829 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 6830 if (ret) { 6831 btrfs_abort_transaction(trans, extent_root, ret); 6832 goto out; 6833 } 6834 } 6835 6836 ret = add_to_free_space_tree(trans, root->fs_info, bytenr, 6837 num_bytes); 6838 if (ret) { 6839 btrfs_abort_transaction(trans, extent_root, ret); 6840 goto out; 6841 } 6842 6843 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 6844 if (ret) { 6845 btrfs_abort_transaction(trans, extent_root, ret); 6846 goto out; 6847 } 6848 } 6849 btrfs_release_path(path); 6850 6851 out: 6852 btrfs_free_path(path); 6853 return ret; 6854 } 6855 6856 /* 6857 * when we free an block, it is possible (and likely) that we free the last 6858 * delayed ref for that extent as well. This searches the delayed ref tree for 6859 * a given extent, and if there are no other delayed refs to be processed, it 6860 * removes it from the tree. 6861 */ 6862 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 6863 struct btrfs_root *root, u64 bytenr) 6864 { 6865 struct btrfs_delayed_ref_head *head; 6866 struct btrfs_delayed_ref_root *delayed_refs; 6867 int ret = 0; 6868 6869 delayed_refs = &trans->transaction->delayed_refs; 6870 spin_lock(&delayed_refs->lock); 6871 head = btrfs_find_delayed_ref_head(trans, bytenr); 6872 if (!head) 6873 goto out_delayed_unlock; 6874 6875 spin_lock(&head->lock); 6876 if (!list_empty(&head->ref_list)) 6877 goto out; 6878 6879 if (head->extent_op) { 6880 if (!head->must_insert_reserved) 6881 goto out; 6882 btrfs_free_delayed_extent_op(head->extent_op); 6883 head->extent_op = NULL; 6884 } 6885 6886 /* 6887 * waiting for the lock here would deadlock. If someone else has it 6888 * locked they are already in the process of dropping it anyway 6889 */ 6890 if (!mutex_trylock(&head->mutex)) 6891 goto out; 6892 6893 /* 6894 * at this point we have a head with no other entries. Go 6895 * ahead and process it. 
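 *
 * (A return of 1 below means the head carried must_insert_reserved, i.e.
 * the extent item was never written into the extent tree, so
 * btrfs_free_tree_block() still has to hand the reserved space back
 * itself.)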
6896 */ 6897 head->node.in_tree = 0; 6898 rb_erase(&head->href_node, &delayed_refs->href_root); 6899 6900 atomic_dec(&delayed_refs->num_entries); 6901 6902 /* 6903 * we don't take a ref on the node because we're removing it from the 6904 * tree, so we just steal the ref the tree was holding. 6905 */ 6906 delayed_refs->num_heads--; 6907 if (head->processing == 0) 6908 delayed_refs->num_heads_ready--; 6909 head->processing = 0; 6910 spin_unlock(&head->lock); 6911 spin_unlock(&delayed_refs->lock); 6912 6913 BUG_ON(head->extent_op); 6914 if (head->must_insert_reserved) 6915 ret = 1; 6916 6917 mutex_unlock(&head->mutex); 6918 btrfs_put_delayed_ref(&head->node); 6919 return ret; 6920 out: 6921 spin_unlock(&head->lock); 6922 6923 out_delayed_unlock: 6924 spin_unlock(&delayed_refs->lock); 6925 return 0; 6926 } 6927 6928 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 6929 struct btrfs_root *root, 6930 struct extent_buffer *buf, 6931 u64 parent, int last_ref) 6932 { 6933 int pin = 1; 6934 int ret; 6935 6936 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6937 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6938 buf->start, buf->len, 6939 parent, root->root_key.objectid, 6940 btrfs_header_level(buf), 6941 BTRFS_DROP_DELAYED_REF, NULL); 6942 BUG_ON(ret); /* -ENOMEM */ 6943 } 6944 6945 if (!last_ref) 6946 return; 6947 6948 if (btrfs_header_generation(buf) == trans->transid) { 6949 struct btrfs_block_group_cache *cache; 6950 6951 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6952 ret = check_ref_cleanup(trans, root, buf->start); 6953 if (!ret) 6954 goto out; 6955 } 6956 6957 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 6958 6959 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6960 pin_down_extent(root, cache, buf->start, buf->len, 1); 6961 btrfs_put_block_group(cache); 6962 goto out; 6963 } 6964 6965 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6966 6967 btrfs_add_free_space(cache, buf->start, buf->len); 6968 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 6969 btrfs_put_block_group(cache); 6970 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6971 pin = 0; 6972 } 6973 out: 6974 if (pin) 6975 add_pinned_bytes(root->fs_info, buf->len, 6976 btrfs_header_level(buf), 6977 root->root_key.objectid); 6978 6979 /* 6980 * Deleting the buffer, clear the corrupt flag since it doesn't matter 6981 * anymore. 6982 */ 6983 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6984 } 6985 6986 /* Can return -ENOMEM */ 6987 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6988 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6989 u64 owner, u64 offset) 6990 { 6991 int ret; 6992 struct btrfs_fs_info *fs_info = root->fs_info; 6993 6994 if (btrfs_test_is_dummy_root(root)) 6995 return 0; 6996 6997 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6998 6999 /* 7000 * tree log blocks never actually go into the extent allocation 7001 * tree, just update pinning info and exit early. 
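 * (Log tree blocks have no backrefs in the extent tree to drop; pinning
 * them simply keeps the space from being reused before the transaction
 * that owns the log commits.)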
7002 */ 7003 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 7004 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7005 /* unlocks the pinned mutex */ 7006 btrfs_pin_extent(root, bytenr, num_bytes, 1); 7007 ret = 0; 7008 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7009 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 7010 num_bytes, 7011 parent, root_objectid, (int)owner, 7012 BTRFS_DROP_DELAYED_REF, NULL); 7013 } else { 7014 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 7015 num_bytes, 7016 parent, root_objectid, owner, 7017 offset, 0, 7018 BTRFS_DROP_DELAYED_REF, NULL); 7019 } 7020 return ret; 7021 } 7022 7023 /* 7024 * when we wait for progress in the block group caching, its because 7025 * our allocation attempt failed at least once. So, we must sleep 7026 * and let some progress happen before we try again. 7027 * 7028 * This function will sleep at least once waiting for new free space to 7029 * show up, and then it will check the block group free space numbers 7030 * for our min num_bytes. Another option is to have it go ahead 7031 * and look in the rbtree for a free extent of a given size, but this 7032 * is a good start. 7033 * 7034 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7035 * any of the information in this block group. 7036 */ 7037 static noinline void 7038 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7039 u64 num_bytes) 7040 { 7041 struct btrfs_caching_control *caching_ctl; 7042 7043 caching_ctl = get_caching_control(cache); 7044 if (!caching_ctl) 7045 return; 7046 7047 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7048 (cache->free_space_ctl->free_space >= num_bytes)); 7049 7050 put_caching_control(caching_ctl); 7051 } 7052 7053 static noinline int 7054 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7055 { 7056 struct btrfs_caching_control *caching_ctl; 7057 int ret = 0; 7058 7059 caching_ctl = get_caching_control(cache); 7060 if (!caching_ctl) 7061 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 7062 7063 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7064 if (cache->cached == BTRFS_CACHE_ERROR) 7065 ret = -EIO; 7066 put_caching_control(caching_ctl); 7067 return ret; 7068 } 7069 7070 int __get_raid_index(u64 flags) 7071 { 7072 if (flags & BTRFS_BLOCK_GROUP_RAID10) 7073 return BTRFS_RAID_RAID10; 7074 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 7075 return BTRFS_RAID_RAID1; 7076 else if (flags & BTRFS_BLOCK_GROUP_DUP) 7077 return BTRFS_RAID_DUP; 7078 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 7079 return BTRFS_RAID_RAID0; 7080 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 7081 return BTRFS_RAID_RAID5; 7082 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 7083 return BTRFS_RAID_RAID6; 7084 7085 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 7086 } 7087 7088 int get_block_group_index(struct btrfs_block_group_cache *cache) 7089 { 7090 return __get_raid_index(cache->flags); 7091 } 7092 7093 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 7094 [BTRFS_RAID_RAID10] = "raid10", 7095 [BTRFS_RAID_RAID1] = "raid1", 7096 [BTRFS_RAID_DUP] = "dup", 7097 [BTRFS_RAID_RAID0] = "raid0", 7098 [BTRFS_RAID_SINGLE] = "single", 7099 [BTRFS_RAID_RAID5] = "raid5", 7100 [BTRFS_RAID_RAID6] = "raid6", 7101 }; 7102 7103 static const char *get_raid_name(enum btrfs_raid_types type) 7104 { 7105 if (type >= BTRFS_NR_RAID_TYPES) 7106 return NULL; 7107 7108 return btrfs_raid_type_names[type]; 7109 } 7110 7111 enum btrfs_loop_type { 7112 LOOP_CACHING_NOWAIT = 0, 7113 LOOP_CACHING_WAIT = 1, 7114 LOOP_ALLOC_CHUNK = 2, 7115 LOOP_NO_EMPTY_SIZE = 3, 7116 }; 7117 7118 static inline void 7119 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7120 int delalloc) 7121 { 7122 if (delalloc) 7123 down_read(&cache->data_rwsem); 7124 } 7125 7126 static inline void 7127 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7128 int delalloc) 7129 { 7130 btrfs_get_block_group(cache); 7131 if (delalloc) 7132 down_read(&cache->data_rwsem); 7133 } 7134 7135 static struct btrfs_block_group_cache * 7136 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7137 struct btrfs_free_cluster *cluster, 7138 int delalloc) 7139 { 7140 struct btrfs_block_group_cache *used_bg = NULL; 7141 7142 spin_lock(&cluster->refill_lock); 7143 while (1) { 7144 used_bg = cluster->block_group; 7145 if (!used_bg) 7146 return NULL; 7147 7148 if (used_bg == block_group) 7149 return used_bg; 7150 7151 btrfs_get_block_group(used_bg); 7152 7153 if (!delalloc) 7154 return used_bg; 7155 7156 if (down_read_trylock(&used_bg->data_rwsem)) 7157 return used_bg; 7158 7159 spin_unlock(&cluster->refill_lock); 7160 7161 down_read(&used_bg->data_rwsem); 7162 7163 spin_lock(&cluster->refill_lock); 7164 if (used_bg == cluster->block_group) 7165 return used_bg; 7166 7167 up_read(&used_bg->data_rwsem); 7168 btrfs_put_block_group(used_bg); 7169 } 7170 } 7171 7172 static inline void 7173 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7174 int delalloc) 7175 { 7176 if (delalloc) 7177 up_read(&cache->data_rwsem); 7178 btrfs_put_block_group(cache); 7179 } 7180 7181 /* 7182 * walks the btree of allocated extents and find a hole of a given size. 7183 * The key ins is changed to record the hole: 7184 * ins->objectid == start position 7185 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7186 * ins->offset == the size of the hole. 7187 * Any available blocks before search_start are skipped. 7188 * 7189 * If there is no suitable free space, we will record the max size of 7190 * the free space extent currently. 
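 *
 * The search below escalates through the btrfs_loop_type stages: kick off
 * caching but do not wait for it (LOOP_CACHING_NOWAIT), then wait on
 * block groups that are still caching (LOOP_CACHING_WAIT), then force a
 * chunk allocation (LOOP_ALLOC_CHUNK), and finally retry with empty_size
 * and empty_cluster forced to zero (LOOP_NO_EMPTY_SIZE).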
7191 */ 7192 static noinline int find_free_extent(struct btrfs_root *orig_root, 7193 u64 num_bytes, u64 empty_size, 7194 u64 hint_byte, struct btrfs_key *ins, 7195 u64 flags, int delalloc) 7196 { 7197 int ret = 0; 7198 struct btrfs_root *root = orig_root->fs_info->extent_root; 7199 struct btrfs_free_cluster *last_ptr = NULL; 7200 struct btrfs_block_group_cache *block_group = NULL; 7201 u64 search_start = 0; 7202 u64 max_extent_size = 0; 7203 u64 empty_cluster = 0; 7204 struct btrfs_space_info *space_info; 7205 int loop = 0; 7206 int index = __get_raid_index(flags); 7207 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 7208 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 7209 bool failed_cluster_refill = false; 7210 bool failed_alloc = false; 7211 bool use_cluster = true; 7212 bool have_caching_bg = false; 7213 bool orig_have_caching_bg = false; 7214 bool full_search = false; 7215 7216 WARN_ON(num_bytes < root->sectorsize); 7217 ins->type = BTRFS_EXTENT_ITEM_KEY; 7218 ins->objectid = 0; 7219 ins->offset = 0; 7220 7221 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 7222 7223 space_info = __find_space_info(root->fs_info, flags); 7224 if (!space_info) { 7225 btrfs_err(root->fs_info, "No space info for %llu", flags); 7226 return -ENOSPC; 7227 } 7228 7229 /* 7230 * If our free space is heavily fragmented we may not be able to make 7231 * big contiguous allocations, so instead of doing the expensive search 7232 * for free space, simply return ENOSPC with our max_extent_size so we 7233 * can go ahead and search for a more manageable chunk. 7234 * 7235 * If our max_extent_size is large enough for our allocation simply 7236 * disable clustering since we will likely not be able to find enough 7237 * space to create a cluster and induce latency trying. 7238 */ 7239 if (unlikely(space_info->max_extent_size)) { 7240 spin_lock(&space_info->lock); 7241 if (space_info->max_extent_size && 7242 num_bytes > space_info->max_extent_size) { 7243 ins->offset = space_info->max_extent_size; 7244 spin_unlock(&space_info->lock); 7245 return -ENOSPC; 7246 } else if (space_info->max_extent_size) { 7247 use_cluster = false; 7248 } 7249 spin_unlock(&space_info->lock); 7250 } 7251 7252 last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); 7253 if (last_ptr) { 7254 spin_lock(&last_ptr->lock); 7255 if (last_ptr->block_group) 7256 hint_byte = last_ptr->window_start; 7257 if (last_ptr->fragmented) { 7258 /* 7259 * We still set window_start so we can keep track of the 7260 * last place we found an allocation to try and save 7261 * some time. 7262 */ 7263 hint_byte = last_ptr->window_start; 7264 use_cluster = false; 7265 } 7266 spin_unlock(&last_ptr->lock); 7267 } 7268 7269 search_start = max(search_start, first_logical_byte(root, 0)); 7270 search_start = max(search_start, hint_byte); 7271 if (search_start == hint_byte) { 7272 block_group = btrfs_lookup_block_group(root->fs_info, 7273 search_start); 7274 /* 7275 * we don't want to use the block group if it doesn't match our 7276 * allocation bits, or if its not cached. 7277 * 7278 * However if we are re-searching with an ideal block group 7279 * picked out then we don't care that the block group is cached. 
7280 */ 7281 if (block_group && block_group_bits(block_group, flags) && 7282 block_group->cached != BTRFS_CACHE_NO) { 7283 down_read(&space_info->groups_sem); 7284 if (list_empty(&block_group->list) || 7285 block_group->ro) { 7286 /* 7287 * someone is removing this block group, 7288 * we can't jump into the have_block_group 7289 * target because our list pointers are not 7290 * valid 7291 */ 7292 btrfs_put_block_group(block_group); 7293 up_read(&space_info->groups_sem); 7294 } else { 7295 index = get_block_group_index(block_group); 7296 btrfs_lock_block_group(block_group, delalloc); 7297 goto have_block_group; 7298 } 7299 } else if (block_group) { 7300 btrfs_put_block_group(block_group); 7301 } 7302 } 7303 search: 7304 have_caching_bg = false; 7305 if (index == 0 || index == __get_raid_index(flags)) 7306 full_search = true; 7307 down_read(&space_info->groups_sem); 7308 list_for_each_entry(block_group, &space_info->block_groups[index], 7309 list) { 7310 u64 offset; 7311 int cached; 7312 7313 btrfs_grab_block_group(block_group, delalloc); 7314 search_start = block_group->key.objectid; 7315 7316 /* 7317 * this can happen if we end up cycling through all the 7318 * raid types, but we want to make sure we only allocate 7319 * for the proper type. 7320 */ 7321 if (!block_group_bits(block_group, flags)) { 7322 u64 extra = BTRFS_BLOCK_GROUP_DUP | 7323 BTRFS_BLOCK_GROUP_RAID1 | 7324 BTRFS_BLOCK_GROUP_RAID5 | 7325 BTRFS_BLOCK_GROUP_RAID6 | 7326 BTRFS_BLOCK_GROUP_RAID10; 7327 7328 /* 7329 * if they asked for extra copies and this block group 7330 * doesn't provide them, bail. This does allow us to 7331 * fill raid0 from raid1. 7332 */ 7333 if ((flags & extra) && !(block_group->flags & extra)) 7334 goto loop; 7335 } 7336 7337 have_block_group: 7338 cached = block_group_cache_done(block_group); 7339 if (unlikely(!cached)) { 7340 have_caching_bg = true; 7341 ret = cache_block_group(block_group, 0); 7342 BUG_ON(ret < 0); 7343 ret = 0; 7344 } 7345 7346 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 7347 goto loop; 7348 if (unlikely(block_group->ro)) 7349 goto loop; 7350 7351 /* 7352 * Ok we want to try and use the cluster allocator, so 7353 * lets look there 7354 */ 7355 if (last_ptr && use_cluster) { 7356 struct btrfs_block_group_cache *used_block_group; 7357 unsigned long aligned_cluster; 7358 /* 7359 * the refill lock keeps out other 7360 * people trying to start a new cluster 7361 */ 7362 used_block_group = btrfs_lock_cluster(block_group, 7363 last_ptr, 7364 delalloc); 7365 if (!used_block_group) 7366 goto refill_cluster; 7367 7368 if (used_block_group != block_group && 7369 (used_block_group->ro || 7370 !block_group_bits(used_block_group, flags))) 7371 goto release_cluster; 7372 7373 offset = btrfs_alloc_from_cluster(used_block_group, 7374 last_ptr, 7375 num_bytes, 7376 used_block_group->key.objectid, 7377 &max_extent_size); 7378 if (offset) { 7379 /* we have a block, we're done */ 7380 spin_unlock(&last_ptr->refill_lock); 7381 trace_btrfs_reserve_extent_cluster(root, 7382 used_block_group, 7383 search_start, num_bytes); 7384 if (used_block_group != block_group) { 7385 btrfs_release_block_group(block_group, 7386 delalloc); 7387 block_group = used_block_group; 7388 } 7389 goto checks; 7390 } 7391 7392 WARN_ON(last_ptr->block_group != used_block_group); 7393 release_cluster: 7394 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 7395 * set up a new clusters, so lets just skip it 7396 * and let the allocator find whatever block 7397 * it can find. 
If we reach this point, we 7398 * will have tried the cluster allocator 7399 * plenty of times and not have found 7400 * anything, so we are likely way too 7401 * fragmented for the clustering stuff to find 7402 * anything. 7403 * 7404 * However, if the cluster is taken from the 7405 * current block group, release the cluster 7406 * first, so that we stand a better chance of 7407 * succeeding in the unclustered 7408 * allocation. */ 7409 if (loop >= LOOP_NO_EMPTY_SIZE && 7410 used_block_group != block_group) { 7411 spin_unlock(&last_ptr->refill_lock); 7412 btrfs_release_block_group(used_block_group, 7413 delalloc); 7414 goto unclustered_alloc; 7415 } 7416 7417 /* 7418 * this cluster didn't work out, free it and 7419 * start over 7420 */ 7421 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7422 7423 if (used_block_group != block_group) 7424 btrfs_release_block_group(used_block_group, 7425 delalloc); 7426 refill_cluster: 7427 if (loop >= LOOP_NO_EMPTY_SIZE) { 7428 spin_unlock(&last_ptr->refill_lock); 7429 goto unclustered_alloc; 7430 } 7431 7432 aligned_cluster = max_t(unsigned long, 7433 empty_cluster + empty_size, 7434 block_group->full_stripe_len); 7435 7436 /* allocate a cluster in this block group */ 7437 ret = btrfs_find_space_cluster(root, block_group, 7438 last_ptr, search_start, 7439 num_bytes, 7440 aligned_cluster); 7441 if (ret == 0) { 7442 /* 7443 * now pull our allocation out of this 7444 * cluster 7445 */ 7446 offset = btrfs_alloc_from_cluster(block_group, 7447 last_ptr, 7448 num_bytes, 7449 search_start, 7450 &max_extent_size); 7451 if (offset) { 7452 /* we found one, proceed */ 7453 spin_unlock(&last_ptr->refill_lock); 7454 trace_btrfs_reserve_extent_cluster(root, 7455 block_group, search_start, 7456 num_bytes); 7457 goto checks; 7458 } 7459 } else if (!cached && loop > LOOP_CACHING_NOWAIT 7460 && !failed_cluster_refill) { 7461 spin_unlock(&last_ptr->refill_lock); 7462 7463 failed_cluster_refill = true; 7464 wait_block_group_cache_progress(block_group, 7465 num_bytes + empty_cluster + empty_size); 7466 goto have_block_group; 7467 } 7468 7469 /* 7470 * at this point we either didn't find a cluster 7471 * or we weren't able to allocate a block from our 7472 * cluster. Free the cluster we've been trying 7473 * to use, and go to the next block group 7474 */ 7475 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7476 spin_unlock(&last_ptr->refill_lock); 7477 goto loop; 7478 } 7479 7480 unclustered_alloc: 7481 /* 7482 * We are doing an unclustered alloc, set the fragmented flag so 7483 * we don't bother trying to setup a cluster again until we get 7484 * more space. 
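 * (Once last_ptr->fragmented is set, later calls to find_free_extent()
 * notice it up front, fall back to the unclustered path immediately and
 * only use window_start as a search hint.)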
7485 */ 7486 if (unlikely(last_ptr)) { 7487 spin_lock(&last_ptr->lock); 7488 last_ptr->fragmented = 1; 7489 spin_unlock(&last_ptr->lock); 7490 } 7491 spin_lock(&block_group->free_space_ctl->tree_lock); 7492 if (cached && 7493 block_group->free_space_ctl->free_space < 7494 num_bytes + empty_cluster + empty_size) { 7495 if (block_group->free_space_ctl->free_space > 7496 max_extent_size) 7497 max_extent_size = 7498 block_group->free_space_ctl->free_space; 7499 spin_unlock(&block_group->free_space_ctl->tree_lock); 7500 goto loop; 7501 } 7502 spin_unlock(&block_group->free_space_ctl->tree_lock); 7503 7504 offset = btrfs_find_space_for_alloc(block_group, search_start, 7505 num_bytes, empty_size, 7506 &max_extent_size); 7507 /* 7508 * If we didn't find a chunk, and we haven't failed on this 7509 * block group before, and this block group is in the middle of 7510 * caching and we are ok with waiting, then go ahead and wait 7511 * for progress to be made, and set failed_alloc to true. 7512 * 7513 * If failed_alloc is true then we've already waited on this 7514 * block group once and should move on to the next block group. 7515 */ 7516 if (!offset && !failed_alloc && !cached && 7517 loop > LOOP_CACHING_NOWAIT) { 7518 wait_block_group_cache_progress(block_group, 7519 num_bytes + empty_size); 7520 failed_alloc = true; 7521 goto have_block_group; 7522 } else if (!offset) { 7523 goto loop; 7524 } 7525 checks: 7526 search_start = ALIGN(offset, root->stripesize); 7527 7528 /* move on to the next group */ 7529 if (search_start + num_bytes > 7530 block_group->key.objectid + block_group->key.offset) { 7531 btrfs_add_free_space(block_group, offset, num_bytes); 7532 goto loop; 7533 } 7534 7535 if (offset < search_start) 7536 btrfs_add_free_space(block_group, offset, 7537 search_start - offset); 7538 BUG_ON(offset > search_start); 7539 7540 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 7541 alloc_type, delalloc); 7542 if (ret == -EAGAIN) { 7543 btrfs_add_free_space(block_group, offset, num_bytes); 7544 goto loop; 7545 } 7546 btrfs_inc_block_group_reservations(block_group); 7547 7548 /* we are all good, lets return */ 7549 ins->objectid = search_start; 7550 ins->offset = num_bytes; 7551 7552 trace_btrfs_reserve_extent(orig_root, block_group, 7553 search_start, num_bytes); 7554 btrfs_release_block_group(block_group, delalloc); 7555 break; 7556 loop: 7557 failed_cluster_refill = false; 7558 failed_alloc = false; 7559 BUG_ON(index != get_block_group_index(block_group)); 7560 btrfs_release_block_group(block_group, delalloc); 7561 } 7562 up_read(&space_info->groups_sem); 7563 7564 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg 7565 && !orig_have_caching_bg) 7566 orig_have_caching_bg = true; 7567 7568 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 7569 goto search; 7570 7571 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 7572 goto search; 7573 7574 /* 7575 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7576 * caching kthreads as we move along 7577 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7578 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7579 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7580 * again 7581 */ 7582 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 7583 index = 0; 7584 if (loop == LOOP_CACHING_NOWAIT) { 7585 /* 7586 * We want to skip the LOOP_CACHING_WAIT step if we 7587 * don't have any uncached bgs and we've already done a 7588 * full search through. 
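 * (orig_have_caching_bg records whether any block group was still caching
 * during the first complete pass over the list; if none were and we have
 * already searched everything, waiting on caching cannot produce new
 * space, so we go straight to LOOP_ALLOC_CHUNK.)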
7589 */ 7590 if (orig_have_caching_bg || !full_search) 7591 loop = LOOP_CACHING_WAIT; 7592 else 7593 loop = LOOP_ALLOC_CHUNK; 7594 } else { 7595 loop++; 7596 } 7597 7598 if (loop == LOOP_ALLOC_CHUNK) { 7599 struct btrfs_trans_handle *trans; 7600 int exist = 0; 7601 7602 trans = current->journal_info; 7603 if (trans) 7604 exist = 1; 7605 else 7606 trans = btrfs_join_transaction(root); 7607 7608 if (IS_ERR(trans)) { 7609 ret = PTR_ERR(trans); 7610 goto out; 7611 } 7612 7613 ret = do_chunk_alloc(trans, root, flags, 7614 CHUNK_ALLOC_FORCE); 7615 7616 /* 7617 * If we can't allocate a new chunk we've already looped 7618 * through at least once, move on to the NO_EMPTY_SIZE 7619 * case. 7620 */ 7621 if (ret == -ENOSPC) 7622 loop = LOOP_NO_EMPTY_SIZE; 7623 7624 /* 7625 * Do not bail out on ENOSPC since we 7626 * can do more things. 7627 */ 7628 if (ret < 0 && ret != -ENOSPC) 7629 btrfs_abort_transaction(trans, 7630 root, ret); 7631 else 7632 ret = 0; 7633 if (!exist) 7634 btrfs_end_transaction(trans, root); 7635 if (ret) 7636 goto out; 7637 } 7638 7639 if (loop == LOOP_NO_EMPTY_SIZE) { 7640 /* 7641 * Don't loop again if we already have no empty_size and 7642 * no empty_cluster. 7643 */ 7644 if (empty_size == 0 && 7645 empty_cluster == 0) { 7646 ret = -ENOSPC; 7647 goto out; 7648 } 7649 empty_size = 0; 7650 empty_cluster = 0; 7651 } 7652 7653 goto search; 7654 } else if (!ins->objectid) { 7655 ret = -ENOSPC; 7656 } else if (ins->objectid) { 7657 if (!use_cluster && last_ptr) { 7658 spin_lock(&last_ptr->lock); 7659 last_ptr->window_start = ins->objectid; 7660 spin_unlock(&last_ptr->lock); 7661 } 7662 ret = 0; 7663 } 7664 out: 7665 if (ret == -ENOSPC) { 7666 spin_lock(&space_info->lock); 7667 space_info->max_extent_size = max_extent_size; 7668 spin_unlock(&space_info->lock); 7669 ins->offset = max_extent_size; 7670 } 7671 return ret; 7672 } 7673 7674 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 7675 int dump_block_groups) 7676 { 7677 struct btrfs_block_group_cache *cache; 7678 int index = 0; 7679 7680 spin_lock(&info->lock); 7681 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", 7682 info->flags, 7683 info->total_bytes - info->bytes_used - info->bytes_pinned - 7684 info->bytes_reserved - info->bytes_readonly, 7685 (info->full) ? "" : "not "); 7686 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " 7687 "reserved=%llu, may_use=%llu, readonly=%llu\n", 7688 info->total_bytes, info->bytes_used, info->bytes_pinned, 7689 info->bytes_reserved, info->bytes_may_use, 7690 info->bytes_readonly); 7691 spin_unlock(&info->lock); 7692 7693 if (!dump_block_groups) 7694 return; 7695 7696 down_read(&info->groups_sem); 7697 again: 7698 list_for_each_entry(cache, &info->block_groups[index], list) { 7699 spin_lock(&cache->lock); 7700 printk(KERN_INFO "BTRFS: " 7701 "block group %llu has %llu bytes, " 7702 "%llu used %llu pinned %llu reserved %s\n", 7703 cache->key.objectid, cache->key.offset, 7704 btrfs_block_group_used(&cache->item), cache->pinned, 7705 cache->reserved, cache->ro ? 
"[readonly]" : ""); 7706 btrfs_dump_free_space(cache, bytes); 7707 spin_unlock(&cache->lock); 7708 } 7709 if (++index < BTRFS_NR_RAID_TYPES) 7710 goto again; 7711 up_read(&info->groups_sem); 7712 } 7713 7714 int btrfs_reserve_extent(struct btrfs_root *root, 7715 u64 num_bytes, u64 min_alloc_size, 7716 u64 empty_size, u64 hint_byte, 7717 struct btrfs_key *ins, int is_data, int delalloc) 7718 { 7719 bool final_tried = num_bytes == min_alloc_size; 7720 u64 flags; 7721 int ret; 7722 7723 flags = btrfs_get_alloc_profile(root, is_data); 7724 again: 7725 WARN_ON(num_bytes < root->sectorsize); 7726 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 7727 flags, delalloc); 7728 if (!ret && !is_data) { 7729 btrfs_dec_block_group_reservations(root->fs_info, 7730 ins->objectid); 7731 } else if (ret == -ENOSPC) { 7732 if (!final_tried && ins->offset) { 7733 num_bytes = min(num_bytes >> 1, ins->offset); 7734 num_bytes = round_down(num_bytes, root->sectorsize); 7735 num_bytes = max(num_bytes, min_alloc_size); 7736 if (num_bytes == min_alloc_size) 7737 final_tried = true; 7738 goto again; 7739 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 7740 struct btrfs_space_info *sinfo; 7741 7742 sinfo = __find_space_info(root->fs_info, flags); 7743 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 7744 flags, num_bytes); 7745 if (sinfo) 7746 dump_space_info(sinfo, num_bytes, 1); 7747 } 7748 } 7749 7750 return ret; 7751 } 7752 7753 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 7754 u64 start, u64 len, 7755 int pin, int delalloc) 7756 { 7757 struct btrfs_block_group_cache *cache; 7758 int ret = 0; 7759 7760 cache = btrfs_lookup_block_group(root->fs_info, start); 7761 if (!cache) { 7762 btrfs_err(root->fs_info, "Unable to find block group for %llu", 7763 start); 7764 return -ENOSPC; 7765 } 7766 7767 if (pin) 7768 pin_down_extent(root, cache, start, len, 1); 7769 else { 7770 if (btrfs_test_opt(root, DISCARD)) 7771 ret = btrfs_discard_extent(root, start, len, NULL); 7772 btrfs_add_free_space(cache, start, len); 7773 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 7774 } 7775 7776 btrfs_put_block_group(cache); 7777 7778 trace_btrfs_reserved_extent_free(root, start, len); 7779 7780 return ret; 7781 } 7782 7783 int btrfs_free_reserved_extent(struct btrfs_root *root, 7784 u64 start, u64 len, int delalloc) 7785 { 7786 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); 7787 } 7788 7789 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 7790 u64 start, u64 len) 7791 { 7792 return __btrfs_free_reserved_extent(root, start, len, 1, 0); 7793 } 7794 7795 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7796 struct btrfs_root *root, 7797 u64 parent, u64 root_objectid, 7798 u64 flags, u64 owner, u64 offset, 7799 struct btrfs_key *ins, int ref_mod) 7800 { 7801 int ret; 7802 struct btrfs_fs_info *fs_info = root->fs_info; 7803 struct btrfs_extent_item *extent_item; 7804 struct btrfs_extent_inline_ref *iref; 7805 struct btrfs_path *path; 7806 struct extent_buffer *leaf; 7807 int type; 7808 u32 size; 7809 7810 if (parent > 0) 7811 type = BTRFS_SHARED_DATA_REF_KEY; 7812 else 7813 type = BTRFS_EXTENT_DATA_REF_KEY; 7814 7815 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 7816 7817 path = btrfs_alloc_path(); 7818 if (!path) 7819 return -ENOMEM; 7820 7821 path->leave_spinning = 1; 7822 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7823 ins, size); 7824 if (ret) { 7825 
btrfs_free_path(path); 7826 return ret; 7827 } 7828 7829 leaf = path->nodes[0]; 7830 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7831 struct btrfs_extent_item); 7832 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 7833 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7834 btrfs_set_extent_flags(leaf, extent_item, 7835 flags | BTRFS_EXTENT_FLAG_DATA); 7836 7837 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7838 btrfs_set_extent_inline_ref_type(leaf, iref, type); 7839 if (parent > 0) { 7840 struct btrfs_shared_data_ref *ref; 7841 ref = (struct btrfs_shared_data_ref *)(iref + 1); 7842 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7843 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 7844 } else { 7845 struct btrfs_extent_data_ref *ref; 7846 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 7847 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 7848 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 7849 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 7850 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 7851 } 7852 7853 btrfs_mark_buffer_dirty(path->nodes[0]); 7854 btrfs_free_path(path); 7855 7856 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 7857 ins->offset); 7858 if (ret) 7859 return ret; 7860 7861 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 7862 if (ret) { /* -ENOENT, logic error */ 7863 btrfs_err(fs_info, "update block group failed for %llu %llu", 7864 ins->objectid, ins->offset); 7865 BUG(); 7866 } 7867 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 7868 return ret; 7869 } 7870 7871 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 7872 struct btrfs_root *root, 7873 u64 parent, u64 root_objectid, 7874 u64 flags, struct btrfs_disk_key *key, 7875 int level, struct btrfs_key *ins) 7876 { 7877 int ret; 7878 struct btrfs_fs_info *fs_info = root->fs_info; 7879 struct btrfs_extent_item *extent_item; 7880 struct btrfs_tree_block_info *block_info; 7881 struct btrfs_extent_inline_ref *iref; 7882 struct btrfs_path *path; 7883 struct extent_buffer *leaf; 7884 u32 size = sizeof(*extent_item) + sizeof(*iref); 7885 u64 num_bytes = ins->offset; 7886 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7887 SKINNY_METADATA); 7888 7889 if (!skinny_metadata) 7890 size += sizeof(*block_info); 7891 7892 path = btrfs_alloc_path(); 7893 if (!path) { 7894 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7895 root->nodesize); 7896 return -ENOMEM; 7897 } 7898 7899 path->leave_spinning = 1; 7900 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7901 ins, size); 7902 if (ret) { 7903 btrfs_free_path(path); 7904 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7905 root->nodesize); 7906 return ret; 7907 } 7908 7909 leaf = path->nodes[0]; 7910 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7911 struct btrfs_extent_item); 7912 btrfs_set_extent_refs(leaf, extent_item, 1); 7913 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7914 btrfs_set_extent_flags(leaf, extent_item, 7915 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 7916 7917 if (skinny_metadata) { 7918 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7919 num_bytes = root->nodesize; 7920 } else { 7921 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7922 btrfs_set_tree_block_key(leaf, block_info, key); 7923 btrfs_set_tree_block_level(leaf, block_info, level); 7924 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 7925 } 
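/* fill in the single inline backref: shared if we have a parent, keyed by root objectid otherwise */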
7926 7927 if (parent > 0) { 7928 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 7929 btrfs_set_extent_inline_ref_type(leaf, iref, 7930 BTRFS_SHARED_BLOCK_REF_KEY); 7931 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7932 } else { 7933 btrfs_set_extent_inline_ref_type(leaf, iref, 7934 BTRFS_TREE_BLOCK_REF_KEY); 7935 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 7936 } 7937 7938 btrfs_mark_buffer_dirty(leaf); 7939 btrfs_free_path(path); 7940 7941 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 7942 num_bytes); 7943 if (ret) 7944 return ret; 7945 7946 ret = update_block_group(trans, root, ins->objectid, root->nodesize, 7947 1); 7948 if (ret) { /* -ENOENT, logic error */ 7949 btrfs_err(fs_info, "update block group failed for %llu %llu", 7950 ins->objectid, ins->offset); 7951 BUG(); 7952 } 7953 7954 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); 7955 return ret; 7956 } 7957 7958 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7959 struct btrfs_root *root, 7960 u64 root_objectid, u64 owner, 7961 u64 offset, u64 ram_bytes, 7962 struct btrfs_key *ins) 7963 { 7964 int ret; 7965 7966 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 7967 7968 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 7969 ins->offset, 0, 7970 root_objectid, owner, offset, 7971 ram_bytes, BTRFS_ADD_DELAYED_EXTENT, 7972 NULL); 7973 return ret; 7974 } 7975 7976 /* 7977 * this is used by the tree logging recovery code. It records that 7978 * an extent has been allocated and makes sure to clear the free 7979 * space cache bits as well 7980 */ 7981 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 7982 struct btrfs_root *root, 7983 u64 root_objectid, u64 owner, u64 offset, 7984 struct btrfs_key *ins) 7985 { 7986 int ret; 7987 struct btrfs_block_group_cache *block_group; 7988 7989 /* 7990 * Mixed block groups will exclude before processing the log so we only 7991 * need to do the exclude dance if this fs isn't mixed. 7992 */ 7993 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { 7994 ret = __exclude_logged_extent(root, ins->objectid, ins->offset); 7995 if (ret) 7996 return ret; 7997 } 7998 7999 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 8000 if (!block_group) 8001 return -EINVAL; 8002 8003 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 8004 RESERVE_ALLOC_NO_ACCOUNT, 0); 8005 BUG_ON(ret); /* logic error */ 8006 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 8007 0, owner, offset, ins, 1); 8008 btrfs_put_block_group(block_group); 8009 return ret; 8010 } 8011 8012 static struct extent_buffer * 8013 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8014 u64 bytenr, int level) 8015 { 8016 struct extent_buffer *buf; 8017 8018 buf = btrfs_find_create_tree_block(root, bytenr); 8019 if (!buf) 8020 return ERR_PTR(-ENOMEM); 8021 btrfs_set_header_generation(buf, trans->transid); 8022 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8023 btrfs_tree_lock(buf); 8024 clean_tree_block(trans, root->fs_info, buf); 8025 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8026 8027 btrfs_set_lock_blocking(buf); 8028 set_extent_buffer_uptodate(buf); 8029 8030 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8031 buf->log_index = root->log_transid % 2; 8032 /* 8033 * we allow two log transactions at a time, use different 8034 * EXENT bit to differentiate dirty pages. 
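* (log_index 0 marks the range with the dirty bit, log_index 1 with the new bit, as set just below)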
8035 */ 8036 if (buf->log_index == 0) 8037 set_extent_dirty(&root->dirty_log_pages, buf->start, 8038 buf->start + buf->len - 1, GFP_NOFS); 8039 else 8040 set_extent_new(&root->dirty_log_pages, buf->start, 8041 buf->start + buf->len - 1); 8042 } else { 8043 buf->log_index = -1; 8044 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8045 buf->start + buf->len - 1, GFP_NOFS); 8046 } 8047 trans->blocks_used++; 8048 /* this returns a buffer locked for blocking */ 8049 return buf; 8050 } 8051 8052 static struct btrfs_block_rsv * 8053 use_block_rsv(struct btrfs_trans_handle *trans, 8054 struct btrfs_root *root, u32 blocksize) 8055 { 8056 struct btrfs_block_rsv *block_rsv; 8057 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 8058 int ret; 8059 bool global_updated = false; 8060 8061 block_rsv = get_block_rsv(trans, root); 8062 8063 if (unlikely(block_rsv->size == 0)) 8064 goto try_reserve; 8065 again: 8066 ret = block_rsv_use_bytes(block_rsv, blocksize); 8067 if (!ret) 8068 return block_rsv; 8069 8070 if (block_rsv->failfast) 8071 return ERR_PTR(ret); 8072 8073 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8074 global_updated = true; 8075 update_global_block_rsv(root->fs_info); 8076 goto again; 8077 } 8078 8079 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 8080 static DEFINE_RATELIMIT_STATE(_rs, 8081 DEFAULT_RATELIMIT_INTERVAL * 10, 8082 /*DEFAULT_RATELIMIT_BURST*/ 1); 8083 if (__ratelimit(&_rs)) 8084 WARN(1, KERN_DEBUG 8085 "BTRFS: block rsv returned %d\n", ret); 8086 } 8087 try_reserve: 8088 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8089 BTRFS_RESERVE_NO_FLUSH); 8090 if (!ret) 8091 return block_rsv; 8092 /* 8093 * If we couldn't reserve metadata bytes try and use some from 8094 * the global reserve if its space type is the same as the global 8095 * reservation. 8096 */ 8097 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8098 block_rsv->space_info == global_rsv->space_info) { 8099 ret = block_rsv_use_bytes(global_rsv, blocksize); 8100 if (!ret) 8101 return global_rsv; 8102 } 8103 return ERR_PTR(ret); 8104 } 8105 8106 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8107 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8108 { 8109 block_rsv_add_bytes(block_rsv, blocksize, 0); 8110 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 8111 } 8112 8113 /* 8114 * finds a free extent and does all the dirty work required for allocation 8115 * returns the tree buffer or an ERR_PTR on error. 
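* On success the buffer comes back locked for blocking, see btrfs_init_new_buffer().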
8116 */ 8117 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8118 struct btrfs_root *root, 8119 u64 parent, u64 root_objectid, 8120 struct btrfs_disk_key *key, int level, 8121 u64 hint, u64 empty_size) 8122 { 8123 struct btrfs_key ins; 8124 struct btrfs_block_rsv *block_rsv; 8125 struct extent_buffer *buf; 8126 struct btrfs_delayed_extent_op *extent_op; 8127 u64 flags = 0; 8128 int ret; 8129 u32 blocksize = root->nodesize; 8130 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 8131 SKINNY_METADATA); 8132 8133 if (btrfs_test_is_dummy_root(root)) { 8134 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8135 level); 8136 if (!IS_ERR(buf)) 8137 root->alloc_bytenr += blocksize; 8138 return buf; 8139 } 8140 8141 block_rsv = use_block_rsv(trans, root, blocksize); 8142 if (IS_ERR(block_rsv)) 8143 return ERR_CAST(block_rsv); 8144 8145 ret = btrfs_reserve_extent(root, blocksize, blocksize, 8146 empty_size, hint, &ins, 0, 0); 8147 if (ret) 8148 goto out_unuse; 8149 8150 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); 8151 if (IS_ERR(buf)) { 8152 ret = PTR_ERR(buf); 8153 goto out_free_reserved; 8154 } 8155 8156 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8157 if (parent == 0) 8158 parent = ins.objectid; 8159 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8160 } else 8161 BUG_ON(parent > 0); 8162 8163 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8164 extent_op = btrfs_alloc_delayed_extent_op(); 8165 if (!extent_op) { 8166 ret = -ENOMEM; 8167 goto out_free_buf; 8168 } 8169 if (key) 8170 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8171 else 8172 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8173 extent_op->flags_to_set = flags; 8174 extent_op->update_key = skinny_metadata ? false : true; 8175 extent_op->update_flags = true; 8176 extent_op->is_data = false; 8177 extent_op->level = level; 8178 8179 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 8180 ins.objectid, ins.offset, 8181 parent, root_objectid, level, 8182 BTRFS_ADD_DELAYED_EXTENT, 8183 extent_op); 8184 if (ret) 8185 goto out_free_delayed; 8186 } 8187 return buf; 8188 8189 out_free_delayed: 8190 btrfs_free_delayed_extent_op(extent_op); 8191 out_free_buf: 8192 free_extent_buffer(buf); 8193 out_free_reserved: 8194 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); 8195 out_unuse: 8196 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 8197 return ERR_PTR(ret); 8198 } 8199 8200 struct walk_control { 8201 u64 refs[BTRFS_MAX_LEVEL]; 8202 u64 flags[BTRFS_MAX_LEVEL]; 8203 struct btrfs_key update_progress; 8204 int stage; 8205 int level; 8206 int shared_level; 8207 int update_ref; 8208 int keep_locks; 8209 int reada_slot; 8210 int reada_count; 8211 int for_reloc; 8212 }; 8213 8214 #define DROP_REFERENCE 1 8215 #define UPDATE_BACKREF 2 8216 8217 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8218 struct btrfs_root *root, 8219 struct walk_control *wc, 8220 struct btrfs_path *path) 8221 { 8222 u64 bytenr; 8223 u64 generation; 8224 u64 refs; 8225 u64 flags; 8226 u32 nritems; 8227 u32 blocksize; 8228 struct btrfs_key key; 8229 struct extent_buffer *eb; 8230 int ret; 8231 int slot; 8232 int nread = 0; 8233 8234 if (path->slots[wc->level] < wc->reada_slot) { 8235 wc->reada_count = wc->reada_count * 2 / 3; 8236 wc->reada_count = max(wc->reada_count, 2); 8237 } else { 8238 wc->reada_count = wc->reada_count * 3 / 2; 8239 wc->reada_count = min_t(int, wc->reada_count, 8240 BTRFS_NODEPTRS_PER_BLOCK(root)); 8241 } 8242 8243 eb = 
path->nodes[wc->level]; 8244 nritems = btrfs_header_nritems(eb); 8245 blocksize = root->nodesize; 8246 8247 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8248 if (nread >= wc->reada_count) 8249 break; 8250 8251 cond_resched(); 8252 bytenr = btrfs_node_blockptr(eb, slot); 8253 generation = btrfs_node_ptr_generation(eb, slot); 8254 8255 if (slot == path->slots[wc->level]) 8256 goto reada; 8257 8258 if (wc->stage == UPDATE_BACKREF && 8259 generation <= root->root_key.offset) 8260 continue; 8261 8262 /* We don't lock the tree block, it's OK to be racy here */ 8263 ret = btrfs_lookup_extent_info(trans, root, bytenr, 8264 wc->level - 1, 1, &refs, 8265 &flags); 8266 /* We don't care about errors in readahead. */ 8267 if (ret < 0) 8268 continue; 8269 BUG_ON(refs == 0); 8270 8271 if (wc->stage == DROP_REFERENCE) { 8272 if (refs == 1) 8273 goto reada; 8274 8275 if (wc->level == 1 && 8276 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8277 continue; 8278 if (!wc->update_ref || 8279 generation <= root->root_key.offset) 8280 continue; 8281 btrfs_node_key_to_cpu(eb, &key, slot); 8282 ret = btrfs_comp_cpu_keys(&key, 8283 &wc->update_progress); 8284 if (ret < 0) 8285 continue; 8286 } else { 8287 if (wc->level == 1 && 8288 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8289 continue; 8290 } 8291 reada: 8292 readahead_tree_block(root, bytenr); 8293 nread++; 8294 } 8295 wc->reada_slot = slot; 8296 } 8297 8298 /* 8299 * These may not be seen by the usual inc/dec ref code so we have to 8300 * add them here. 8301 */ 8302 static int record_one_subtree_extent(struct btrfs_trans_handle *trans, 8303 struct btrfs_root *root, u64 bytenr, 8304 u64 num_bytes) 8305 { 8306 struct btrfs_qgroup_extent_record *qrecord; 8307 struct btrfs_delayed_ref_root *delayed_refs; 8308 8309 qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); 8310 if (!qrecord) 8311 return -ENOMEM; 8312 8313 qrecord->bytenr = bytenr; 8314 qrecord->num_bytes = num_bytes; 8315 qrecord->old_roots = NULL; 8316 8317 delayed_refs = &trans->transaction->delayed_refs; 8318 spin_lock(&delayed_refs->lock); 8319 if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) 8320 kfree(qrecord); 8321 spin_unlock(&delayed_refs->lock); 8322 8323 return 0; 8324 } 8325 8326 static int account_leaf_items(struct btrfs_trans_handle *trans, 8327 struct btrfs_root *root, 8328 struct extent_buffer *eb) 8329 { 8330 int nr = btrfs_header_nritems(eb); 8331 int i, extent_type, ret; 8332 struct btrfs_key key; 8333 struct btrfs_file_extent_item *fi; 8334 u64 bytenr, num_bytes; 8335 8336 /* We can be called directly from walk_up_proc() */ 8337 if (!root->fs_info->quota_enabled) 8338 return 0; 8339 8340 for (i = 0; i < nr; i++) { 8341 btrfs_item_key_to_cpu(eb, &key, i); 8342 8343 if (key.type != BTRFS_EXTENT_DATA_KEY) 8344 continue; 8345 8346 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 8347 /* filter out non qgroup-accountable extents */ 8348 extent_type = btrfs_file_extent_type(eb, fi); 8349 8350 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 8351 continue; 8352 8353 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 8354 if (!bytenr) 8355 continue; 8356 8357 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 8358 8359 ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); 8360 if (ret) 8361 return ret; 8362 } 8363 return 0; 8364 } 8365 8366 /* 8367 * Walk up the tree from the bottom, freeing leaves and any interior 8368 * nodes which have had all slots visited. 
If a node (leaf or 8369 * interior) is freed, the node above it will have it's slot 8370 * incremented. The root node will never be freed. 8371 * 8372 * At the end of this function, we should have a path which has all 8373 * slots incremented to the next position for a search. If we need to 8374 * read a new node it will be NULL and the node above it will have the 8375 * correct slot selected for a later read. 8376 * 8377 * If we increment the root nodes slot counter past the number of 8378 * elements, 1 is returned to signal completion of the search. 8379 */ 8380 static int adjust_slots_upwards(struct btrfs_root *root, 8381 struct btrfs_path *path, int root_level) 8382 { 8383 int level = 0; 8384 int nr, slot; 8385 struct extent_buffer *eb; 8386 8387 if (root_level == 0) 8388 return 1; 8389 8390 while (level <= root_level) { 8391 eb = path->nodes[level]; 8392 nr = btrfs_header_nritems(eb); 8393 path->slots[level]++; 8394 slot = path->slots[level]; 8395 if (slot >= nr || level == 0) { 8396 /* 8397 * Don't free the root - we will detect this 8398 * condition after our loop and return a 8399 * positive value for caller to stop walking the tree. 8400 */ 8401 if (level != root_level) { 8402 btrfs_tree_unlock_rw(eb, path->locks[level]); 8403 path->locks[level] = 0; 8404 8405 free_extent_buffer(eb); 8406 path->nodes[level] = NULL; 8407 path->slots[level] = 0; 8408 } 8409 } else { 8410 /* 8411 * We have a valid slot to walk back down 8412 * from. Stop here so caller can process these 8413 * new nodes. 8414 */ 8415 break; 8416 } 8417 8418 level++; 8419 } 8420 8421 eb = path->nodes[root_level]; 8422 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 8423 return 1; 8424 8425 return 0; 8426 } 8427 8428 /* 8429 * root_eb is the subtree root and is locked before this function is called. 8430 */ 8431 static int account_shared_subtree(struct btrfs_trans_handle *trans, 8432 struct btrfs_root *root, 8433 struct extent_buffer *root_eb, 8434 u64 root_gen, 8435 int root_level) 8436 { 8437 int ret = 0; 8438 int level; 8439 struct extent_buffer *eb = root_eb; 8440 struct btrfs_path *path = NULL; 8441 8442 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); 8443 BUG_ON(root_eb == NULL); 8444 8445 if (!root->fs_info->quota_enabled) 8446 return 0; 8447 8448 if (!extent_buffer_uptodate(root_eb)) { 8449 ret = btrfs_read_buffer(root_eb, root_gen); 8450 if (ret) 8451 goto out; 8452 } 8453 8454 if (root_level == 0) { 8455 ret = account_leaf_items(trans, root, root_eb); 8456 goto out; 8457 } 8458 8459 path = btrfs_alloc_path(); 8460 if (!path) 8461 return -ENOMEM; 8462 8463 /* 8464 * Walk down the tree. Missing extent blocks are filled in as 8465 * we go. Metadata is accounted every time we read a new 8466 * extent block. 8467 * 8468 * When we reach a leaf, we account for file extent items in it, 8469 * walk back up the tree (adjusting slot pointers as we go) 8470 * and restart the search process. 8471 */ 8472 extent_buffer_get(root_eb); /* For path */ 8473 path->nodes[root_level] = root_eb; 8474 path->slots[root_level] = 0; 8475 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 8476 walk_down: 8477 level = root_level; 8478 while (level >= 0) { 8479 if (path->nodes[level] == NULL) { 8480 int parent_slot; 8481 u64 child_gen; 8482 u64 child_bytenr; 8483 8484 /* We need to get child blockptr/gen from 8485 * parent before we can read it. 
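* Once read, the block is read-locked and recorded for qgroup accounting via record_one_subtree_extent().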
*/ 8486 eb = path->nodes[level + 1]; 8487 parent_slot = path->slots[level + 1]; 8488 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 8489 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 8490 8491 eb = read_tree_block(root, child_bytenr, child_gen); 8492 if (IS_ERR(eb)) { 8493 ret = PTR_ERR(eb); 8494 goto out; 8495 } else if (!extent_buffer_uptodate(eb)) { 8496 free_extent_buffer(eb); 8497 ret = -EIO; 8498 goto out; 8499 } 8500 8501 path->nodes[level] = eb; 8502 path->slots[level] = 0; 8503 8504 btrfs_tree_read_lock(eb); 8505 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 8506 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 8507 8508 ret = record_one_subtree_extent(trans, root, child_bytenr, 8509 root->nodesize); 8510 if (ret) 8511 goto out; 8512 } 8513 8514 if (level == 0) { 8515 ret = account_leaf_items(trans, root, path->nodes[level]); 8516 if (ret) 8517 goto out; 8518 8519 /* Nonzero return here means we completed our search */ 8520 ret = adjust_slots_upwards(root, path, root_level); 8521 if (ret) 8522 break; 8523 8524 /* Restart search with new slots */ 8525 goto walk_down; 8526 } 8527 8528 level--; 8529 } 8530 8531 ret = 0; 8532 out: 8533 btrfs_free_path(path); 8534 8535 return ret; 8536 } 8537 8538 /* 8539 * helper to process tree block while walking down the tree. 8540 * 8541 * when wc->stage == UPDATE_BACKREF, this function updates 8542 * back refs for pointers in the block. 8543 * 8544 * NOTE: return value 1 means we should stop walking down. 8545 */ 8546 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8547 struct btrfs_root *root, 8548 struct btrfs_path *path, 8549 struct walk_control *wc, int lookup_info) 8550 { 8551 int level = wc->level; 8552 struct extent_buffer *eb = path->nodes[level]; 8553 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8554 int ret; 8555 8556 if (wc->stage == UPDATE_BACKREF && 8557 btrfs_header_owner(eb) != root->root_key.objectid) 8558 return 1; 8559 8560 /* 8561 * when reference count of tree block is 1, it won't increase 8562 * again. once full backref flag is set, we never clear it. 8563 */ 8564 if (lookup_info && 8565 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8566 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8567 BUG_ON(!path->locks[level]); 8568 ret = btrfs_lookup_extent_info(trans, root, 8569 eb->start, level, 1, 8570 &wc->refs[level], 8571 &wc->flags[level]); 8572 BUG_ON(ret == -ENOMEM); 8573 if (ret) 8574 return ret; 8575 BUG_ON(wc->refs[level] == 0); 8576 } 8577 8578 if (wc->stage == DROP_REFERENCE) { 8579 if (wc->refs[level] > 1) 8580 return 1; 8581 8582 if (path->locks[level] && !wc->keep_locks) { 8583 btrfs_tree_unlock_rw(eb, path->locks[level]); 8584 path->locks[level] = 0; 8585 } 8586 return 0; 8587 } 8588 8589 /* wc->stage == UPDATE_BACKREF */ 8590 if (!(wc->flags[level] & flag)) { 8591 BUG_ON(!path->locks[level]); 8592 ret = btrfs_inc_ref(trans, root, eb, 1); 8593 BUG_ON(ret); /* -ENOMEM */ 8594 ret = btrfs_dec_ref(trans, root, eb, 0); 8595 BUG_ON(ret); /* -ENOMEM */ 8596 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 8597 eb->len, flag, 8598 btrfs_header_level(eb), 0); 8599 BUG_ON(ret); /* -ENOMEM */ 8600 wc->flags[level] |= flag; 8601 } 8602 8603 /* 8604 * the block is shared by multiple trees, so it's not good to 8605 * keep the tree lock 8606 */ 8607 if (path->locks[level] && level > 0) { 8608 btrfs_tree_unlock_rw(eb, path->locks[level]); 8609 path->locks[level] = 0; 8610 } 8611 return 0; 8612 } 8613 8614 /* 8615 * helper to process tree block pointer. 
8616 * 8617 * when wc->stage == DROP_REFERENCE, this function checks 8618 * reference count of the block pointed to. if the block 8619 * is shared and we need update back refs for the subtree 8620 * rooted at the block, this function changes wc->stage to 8621 * UPDATE_BACKREF. if the block is shared and there is no 8622 * need to update back, this function drops the reference 8623 * to the block. 8624 * 8625 * NOTE: return value 1 means we should stop walking down. 8626 */ 8627 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8628 struct btrfs_root *root, 8629 struct btrfs_path *path, 8630 struct walk_control *wc, int *lookup_info) 8631 { 8632 u64 bytenr; 8633 u64 generation; 8634 u64 parent; 8635 u32 blocksize; 8636 struct btrfs_key key; 8637 struct extent_buffer *next; 8638 int level = wc->level; 8639 int reada = 0; 8640 int ret = 0; 8641 bool need_account = false; 8642 8643 generation = btrfs_node_ptr_generation(path->nodes[level], 8644 path->slots[level]); 8645 /* 8646 * if the lower level block was created before the snapshot 8647 * was created, we know there is no need to update back refs 8648 * for the subtree 8649 */ 8650 if (wc->stage == UPDATE_BACKREF && 8651 generation <= root->root_key.offset) { 8652 *lookup_info = 1; 8653 return 1; 8654 } 8655 8656 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8657 blocksize = root->nodesize; 8658 8659 next = btrfs_find_tree_block(root->fs_info, bytenr); 8660 if (!next) { 8661 next = btrfs_find_create_tree_block(root, bytenr); 8662 if (!next) 8663 return -ENOMEM; 8664 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 8665 level - 1); 8666 reada = 1; 8667 } 8668 btrfs_tree_lock(next); 8669 btrfs_set_lock_blocking(next); 8670 8671 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 8672 &wc->refs[level - 1], 8673 &wc->flags[level - 1]); 8674 if (ret < 0) { 8675 btrfs_tree_unlock(next); 8676 return ret; 8677 } 8678 8679 if (unlikely(wc->refs[level - 1] == 0)) { 8680 btrfs_err(root->fs_info, "Missing references."); 8681 BUG(); 8682 } 8683 *lookup_info = 0; 8684 8685 if (wc->stage == DROP_REFERENCE) { 8686 if (wc->refs[level - 1] > 1) { 8687 need_account = true; 8688 if (level == 1 && 8689 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8690 goto skip; 8691 8692 if (!wc->update_ref || 8693 generation <= root->root_key.offset) 8694 goto skip; 8695 8696 btrfs_node_key_to_cpu(path->nodes[level], &key, 8697 path->slots[level]); 8698 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 8699 if (ret < 0) 8700 goto skip; 8701 8702 wc->stage = UPDATE_BACKREF; 8703 wc->shared_level = level - 1; 8704 } 8705 } else { 8706 if (level == 1 && 8707 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8708 goto skip; 8709 } 8710 8711 if (!btrfs_buffer_uptodate(next, generation, 0)) { 8712 btrfs_tree_unlock(next); 8713 free_extent_buffer(next); 8714 next = NULL; 8715 *lookup_info = 1; 8716 } 8717 8718 if (!next) { 8719 if (reada && level == 1) 8720 reada_walk_down(trans, root, wc, path); 8721 next = read_tree_block(root, bytenr, generation); 8722 if (IS_ERR(next)) { 8723 return PTR_ERR(next); 8724 } else if (!extent_buffer_uptodate(next)) { 8725 free_extent_buffer(next); 8726 return -EIO; 8727 } 8728 btrfs_tree_lock(next); 8729 btrfs_set_lock_blocking(next); 8730 } 8731 8732 level--; 8733 BUG_ON(level != btrfs_header_level(next)); 8734 path->nodes[level] = next; 8735 path->slots[level] = 0; 8736 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8737 wc->level = level; 8738 if (wc->level == 1) 8739 
wc->reada_slot = 0; 8740 return 0; 8741 skip: 8742 wc->refs[level - 1] = 0; 8743 wc->flags[level - 1] = 0; 8744 if (wc->stage == DROP_REFERENCE) { 8745 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 8746 parent = path->nodes[level]->start; 8747 } else { 8748 BUG_ON(root->root_key.objectid != 8749 btrfs_header_owner(path->nodes[level])); 8750 parent = 0; 8751 } 8752 8753 if (need_account) { 8754 ret = account_shared_subtree(trans, root, next, 8755 generation, level - 1); 8756 if (ret) { 8757 btrfs_err_rl(root->fs_info, 8758 "Error " 8759 "%d accounting shared subtree. Quota " 8760 "is out of sync, rescan required.", 8761 ret); 8762 } 8763 } 8764 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 8765 root->root_key.objectid, level - 1, 0); 8766 BUG_ON(ret); /* -ENOMEM */ 8767 } 8768 btrfs_tree_unlock(next); 8769 free_extent_buffer(next); 8770 *lookup_info = 1; 8771 return 1; 8772 } 8773 8774 /* 8775 * helper to process tree block while walking up the tree. 8776 * 8777 * when wc->stage == DROP_REFERENCE, this function drops 8778 * reference count on the block. 8779 * 8780 * when wc->stage == UPDATE_BACKREF, this function changes 8781 * wc->stage back to DROP_REFERENCE if we changed wc->stage 8782 * to UPDATE_BACKREF previously while processing the block. 8783 * 8784 * NOTE: return value 1 means we should stop walking up. 8785 */ 8786 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 8787 struct btrfs_root *root, 8788 struct btrfs_path *path, 8789 struct walk_control *wc) 8790 { 8791 int ret; 8792 int level = wc->level; 8793 struct extent_buffer *eb = path->nodes[level]; 8794 u64 parent = 0; 8795 8796 if (wc->stage == UPDATE_BACKREF) { 8797 BUG_ON(wc->shared_level < level); 8798 if (level < wc->shared_level) 8799 goto out; 8800 8801 ret = find_next_key(path, level + 1, &wc->update_progress); 8802 if (ret > 0) 8803 wc->update_ref = 0; 8804 8805 wc->stage = DROP_REFERENCE; 8806 wc->shared_level = -1; 8807 path->slots[level] = 0; 8808 8809 /* 8810 * check reference count again if the block isn't locked. 8811 * we should start walking down the tree again if reference 8812 * count is one. 8813 */ 8814 if (!path->locks[level]) { 8815 BUG_ON(level == 0); 8816 btrfs_tree_lock(eb); 8817 btrfs_set_lock_blocking(eb); 8818 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8819 8820 ret = btrfs_lookup_extent_info(trans, root, 8821 eb->start, level, 1, 8822 &wc->refs[level], 8823 &wc->flags[level]); 8824 if (ret < 0) { 8825 btrfs_tree_unlock_rw(eb, path->locks[level]); 8826 path->locks[level] = 0; 8827 return ret; 8828 } 8829 BUG_ON(wc->refs[level] == 0); 8830 if (wc->refs[level] == 1) { 8831 btrfs_tree_unlock_rw(eb, path->locks[level]); 8832 path->locks[level] = 0; 8833 return 1; 8834 } 8835 } 8836 } 8837 8838 /* wc->stage == DROP_REFERENCE */ 8839 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 8840 8841 if (wc->refs[level] == 1) { 8842 if (level == 0) { 8843 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8844 ret = btrfs_dec_ref(trans, root, eb, 1); 8845 else 8846 ret = btrfs_dec_ref(trans, root, eb, 0); 8847 BUG_ON(ret); /* -ENOMEM */ 8848 ret = account_leaf_items(trans, root, eb); 8849 if (ret) { 8850 btrfs_err_rl(root->fs_info, 8851 "error " 8852 "%d accounting leaf items. 
Quota " 8853 "is out of sync, rescan required.", 8854 ret); 8855 } 8856 } 8857 /* make block locked assertion in clean_tree_block happy */ 8858 if (!path->locks[level] && 8859 btrfs_header_generation(eb) == trans->transid) { 8860 btrfs_tree_lock(eb); 8861 btrfs_set_lock_blocking(eb); 8862 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8863 } 8864 clean_tree_block(trans, root->fs_info, eb); 8865 } 8866 8867 if (eb == root->node) { 8868 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8869 parent = eb->start; 8870 else 8871 BUG_ON(root->root_key.objectid != 8872 btrfs_header_owner(eb)); 8873 } else { 8874 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8875 parent = path->nodes[level + 1]->start; 8876 else 8877 BUG_ON(root->root_key.objectid != 8878 btrfs_header_owner(path->nodes[level + 1])); 8879 } 8880 8881 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 8882 out: 8883 wc->refs[level] = 0; 8884 wc->flags[level] = 0; 8885 return 0; 8886 } 8887 8888 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 8889 struct btrfs_root *root, 8890 struct btrfs_path *path, 8891 struct walk_control *wc) 8892 { 8893 int level = wc->level; 8894 int lookup_info = 1; 8895 int ret; 8896 8897 while (level >= 0) { 8898 ret = walk_down_proc(trans, root, path, wc, lookup_info); 8899 if (ret > 0) 8900 break; 8901 8902 if (level == 0) 8903 break; 8904 8905 if (path->slots[level] >= 8906 btrfs_header_nritems(path->nodes[level])) 8907 break; 8908 8909 ret = do_walk_down(trans, root, path, wc, &lookup_info); 8910 if (ret > 0) { 8911 path->slots[level]++; 8912 continue; 8913 } else if (ret < 0) 8914 return ret; 8915 level = wc->level; 8916 } 8917 return 0; 8918 } 8919 8920 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 8921 struct btrfs_root *root, 8922 struct btrfs_path *path, 8923 struct walk_control *wc, int max_level) 8924 { 8925 int level = wc->level; 8926 int ret; 8927 8928 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 8929 while (level < max_level && path->nodes[level]) { 8930 wc->level = level; 8931 if (path->slots[level] + 1 < 8932 btrfs_header_nritems(path->nodes[level])) { 8933 path->slots[level]++; 8934 return 0; 8935 } else { 8936 ret = walk_up_proc(trans, root, path, wc); 8937 if (ret > 0) 8938 return 0; 8939 8940 if (path->locks[level]) { 8941 btrfs_tree_unlock_rw(path->nodes[level], 8942 path->locks[level]); 8943 path->locks[level] = 0; 8944 } 8945 free_extent_buffer(path->nodes[level]); 8946 path->nodes[level] = NULL; 8947 level++; 8948 } 8949 } 8950 return 1; 8951 } 8952 8953 /* 8954 * drop a subvolume tree. 8955 * 8956 * this function traverses the tree freeing any blocks that only 8957 * referenced by the tree. 8958 * 8959 * when a shared tree block is found. this function decreases its 8960 * reference count by one. if update_ref is true, this function 8961 * also make sure backrefs for the shared block and all lower level 8962 * blocks are properly updated. 
8963 * 8964 * If called with for_reloc == 0, may exit early with -EAGAIN 8965 */ 8966 int btrfs_drop_snapshot(struct btrfs_root *root, 8967 struct btrfs_block_rsv *block_rsv, int update_ref, 8968 int for_reloc) 8969 { 8970 struct btrfs_path *path; 8971 struct btrfs_trans_handle *trans; 8972 struct btrfs_root *tree_root = root->fs_info->tree_root; 8973 struct btrfs_root_item *root_item = &root->root_item; 8974 struct walk_control *wc; 8975 struct btrfs_key key; 8976 int err = 0; 8977 int ret; 8978 int level; 8979 bool root_dropped = false; 8980 8981 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); 8982 8983 path = btrfs_alloc_path(); 8984 if (!path) { 8985 err = -ENOMEM; 8986 goto out; 8987 } 8988 8989 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8990 if (!wc) { 8991 btrfs_free_path(path); 8992 err = -ENOMEM; 8993 goto out; 8994 } 8995 8996 trans = btrfs_start_transaction(tree_root, 0); 8997 if (IS_ERR(trans)) { 8998 err = PTR_ERR(trans); 8999 goto out_free; 9000 } 9001 9002 if (block_rsv) 9003 trans->block_rsv = block_rsv; 9004 9005 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9006 level = btrfs_header_level(root->node); 9007 path->nodes[level] = btrfs_lock_root_node(root); 9008 btrfs_set_lock_blocking(path->nodes[level]); 9009 path->slots[level] = 0; 9010 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9011 memset(&wc->update_progress, 0, 9012 sizeof(wc->update_progress)); 9013 } else { 9014 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9015 memcpy(&wc->update_progress, &key, 9016 sizeof(wc->update_progress)); 9017 9018 level = root_item->drop_level; 9019 BUG_ON(level == 0); 9020 path->lowest_level = level; 9021 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9022 path->lowest_level = 0; 9023 if (ret < 0) { 9024 err = ret; 9025 goto out_end_trans; 9026 } 9027 WARN_ON(ret > 0); 9028 9029 /* 9030 * unlock our path, this is safe because only this 9031 * function is allowed to delete this snapshot 9032 */ 9033 btrfs_unlock_up_safe(path, 0); 9034 9035 level = btrfs_header_level(root->node); 9036 while (1) { 9037 btrfs_tree_lock(path->nodes[level]); 9038 btrfs_set_lock_blocking(path->nodes[level]); 9039 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9040 9041 ret = btrfs_lookup_extent_info(trans, root, 9042 path->nodes[level]->start, 9043 level, 1, &wc->refs[level], 9044 &wc->flags[level]); 9045 if (ret < 0) { 9046 err = ret; 9047 goto out_end_trans; 9048 } 9049 BUG_ON(wc->refs[level] == 0); 9050 9051 if (level == root_item->drop_level) 9052 break; 9053 9054 btrfs_tree_unlock(path->nodes[level]); 9055 path->locks[level] = 0; 9056 WARN_ON(wc->refs[level] != 1); 9057 level--; 9058 } 9059 } 9060 9061 wc->level = level; 9062 wc->shared_level = -1; 9063 wc->stage = DROP_REFERENCE; 9064 wc->update_ref = update_ref; 9065 wc->keep_locks = 0; 9066 wc->for_reloc = for_reloc; 9067 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 9068 9069 while (1) { 9070 9071 ret = walk_down_tree(trans, root, path, wc); 9072 if (ret < 0) { 9073 err = ret; 9074 break; 9075 } 9076 9077 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9078 if (ret < 0) { 9079 err = ret; 9080 break; 9081 } 9082 9083 if (ret > 0) { 9084 BUG_ON(wc->stage != DROP_REFERENCE); 9085 break; 9086 } 9087 9088 if (wc->stage == DROP_REFERENCE) { 9089 level = wc->level; 9090 btrfs_node_key(path->nodes[level], 9091 &root_item->drop_progress, 9092 path->slots[level]); 9093 root_item->drop_level = level; 9094 } 9095 9096 BUG_ON(wc->level == 0); 9097 if (btrfs_should_end_transaction(trans, tree_root) 
|| 9098 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 9099 ret = btrfs_update_root(trans, tree_root, 9100 &root->root_key, 9101 root_item); 9102 if (ret) { 9103 btrfs_abort_transaction(trans, tree_root, ret); 9104 err = ret; 9105 goto out_end_trans; 9106 } 9107 9108 btrfs_end_transaction_throttle(trans, tree_root); 9109 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 9110 pr_debug("BTRFS: drop snapshot early exit\n"); 9111 err = -EAGAIN; 9112 goto out_free; 9113 } 9114 9115 trans = btrfs_start_transaction(tree_root, 0); 9116 if (IS_ERR(trans)) { 9117 err = PTR_ERR(trans); 9118 goto out_free; 9119 } 9120 if (block_rsv) 9121 trans->block_rsv = block_rsv; 9122 } 9123 } 9124 btrfs_release_path(path); 9125 if (err) 9126 goto out_end_trans; 9127 9128 ret = btrfs_del_root(trans, tree_root, &root->root_key); 9129 if (ret) { 9130 btrfs_abort_transaction(trans, tree_root, ret); 9131 goto out_end_trans; 9132 } 9133 9134 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9135 ret = btrfs_find_root(tree_root, &root->root_key, path, 9136 NULL, NULL); 9137 if (ret < 0) { 9138 btrfs_abort_transaction(trans, tree_root, ret); 9139 err = ret; 9140 goto out_end_trans; 9141 } else if (ret > 0) { 9142 /* if we fail to delete the orphan item this time 9143 * around, it'll get picked up the next time. 9144 * 9145 * The most common failure here is just -ENOENT. 9146 */ 9147 btrfs_del_orphan_item(trans, tree_root, 9148 root->root_key.objectid); 9149 } 9150 } 9151 9152 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9153 btrfs_add_dropped_root(trans, root); 9154 } else { 9155 free_extent_buffer(root->node); 9156 free_extent_buffer(root->commit_root); 9157 btrfs_put_fs_root(root); 9158 } 9159 root_dropped = true; 9160 out_end_trans: 9161 btrfs_end_transaction_throttle(trans, tree_root); 9162 out_free: 9163 kfree(wc); 9164 btrfs_free_path(path); 9165 out: 9166 /* 9167 * So if we need to stop dropping the snapshot for whatever reason we 9168 * need to make sure to add it back to the dead root list so that we 9169 * keep trying to do the work later. This also cleans up roots if we 9170 * don't have it in the radix (like when we recover after a power fail 9171 * or unmount) so we don't leak memory. 9172 */ 9173 if (!for_reloc && root_dropped == false) 9174 btrfs_add_dead_root(root); 9175 if (err && err != -EAGAIN) 9176 btrfs_handle_fs_error(root->fs_info, err, NULL); 9177 return err; 9178 } 9179 9180 /* 9181 * drop subtree rooted at tree block 'node'. 
9182 * 9183 * NOTE: this function will unlock and release tree block 'node' 9184 * only used by relocation code 9185 */ 9186 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9187 struct btrfs_root *root, 9188 struct extent_buffer *node, 9189 struct extent_buffer *parent) 9190 { 9191 struct btrfs_path *path; 9192 struct walk_control *wc; 9193 int level; 9194 int parent_level; 9195 int ret = 0; 9196 int wret; 9197 9198 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9199 9200 path = btrfs_alloc_path(); 9201 if (!path) 9202 return -ENOMEM; 9203 9204 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9205 if (!wc) { 9206 btrfs_free_path(path); 9207 return -ENOMEM; 9208 } 9209 9210 btrfs_assert_tree_locked(parent); 9211 parent_level = btrfs_header_level(parent); 9212 extent_buffer_get(parent); 9213 path->nodes[parent_level] = parent; 9214 path->slots[parent_level] = btrfs_header_nritems(parent); 9215 9216 btrfs_assert_tree_locked(node); 9217 level = btrfs_header_level(node); 9218 path->nodes[level] = node; 9219 path->slots[level] = 0; 9220 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9221 9222 wc->refs[parent_level] = 1; 9223 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9224 wc->level = level; 9225 wc->shared_level = -1; 9226 wc->stage = DROP_REFERENCE; 9227 wc->update_ref = 0; 9228 wc->keep_locks = 1; 9229 wc->for_reloc = 1; 9230 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 9231 9232 while (1) { 9233 wret = walk_down_tree(trans, root, path, wc); 9234 if (wret < 0) { 9235 ret = wret; 9236 break; 9237 } 9238 9239 wret = walk_up_tree(trans, root, path, wc, parent_level); 9240 if (wret < 0) 9241 ret = wret; 9242 if (wret != 0) 9243 break; 9244 } 9245 9246 kfree(wc); 9247 btrfs_free_path(path); 9248 return ret; 9249 } 9250 9251 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 9252 { 9253 u64 num_devices; 9254 u64 stripped; 9255 9256 /* 9257 * if restripe for this chunk_type is on pick target profile and 9258 * return, otherwise do the usual balance 9259 */ 9260 stripped = get_restripe_target(root->fs_info, flags); 9261 if (stripped) 9262 return extended_to_chunk(stripped); 9263 9264 num_devices = root->fs_info->fs_devices->rw_devices; 9265 9266 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9267 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9268 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9269 9270 if (num_devices == 1) { 9271 stripped |= BTRFS_BLOCK_GROUP_DUP; 9272 stripped = flags & ~stripped; 9273 9274 /* turn raid0 into single device chunks */ 9275 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9276 return stripped; 9277 9278 /* turn mirroring into duplication */ 9279 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9280 BTRFS_BLOCK_GROUP_RAID10)) 9281 return stripped | BTRFS_BLOCK_GROUP_DUP; 9282 } else { 9283 /* they already had raid on here, just return */ 9284 if (flags & stripped) 9285 return flags; 9286 9287 stripped |= BTRFS_BLOCK_GROUP_DUP; 9288 stripped = flags & ~stripped; 9289 9290 /* switch duplicated blocks with raid1 */ 9291 if (flags & BTRFS_BLOCK_GROUP_DUP) 9292 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9293 9294 /* this is drive concat, leave it alone */ 9295 } 9296 9297 return flags; 9298 } 9299 9300 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9301 { 9302 struct btrfs_space_info *sinfo = cache->space_info; 9303 u64 num_bytes; 9304 u64 min_allocable_bytes; 9305 int ret = -ENOSPC; 9306 9307 /* 9308 * We need some metadata space and system metadata space for 9309 * allocating chunks in some corner cases until 
we force to set 9310 * it to be readonly. 9311 */ 9312 if ((sinfo->flags & 9313 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9314 !force) 9315 min_allocable_bytes = SZ_1M; 9316 else 9317 min_allocable_bytes = 0; 9318 9319 spin_lock(&sinfo->lock); 9320 spin_lock(&cache->lock); 9321 9322 if (cache->ro) { 9323 cache->ro++; 9324 ret = 0; 9325 goto out; 9326 } 9327 9328 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9329 cache->bytes_super - btrfs_block_group_used(&cache->item); 9330 9331 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 9332 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 9333 min_allocable_bytes <= sinfo->total_bytes) { 9334 sinfo->bytes_readonly += num_bytes; 9335 cache->ro++; 9336 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9337 ret = 0; 9338 } 9339 out: 9340 spin_unlock(&cache->lock); 9341 spin_unlock(&sinfo->lock); 9342 return ret; 9343 } 9344 9345 int btrfs_inc_block_group_ro(struct btrfs_root *root, 9346 struct btrfs_block_group_cache *cache) 9347 9348 { 9349 struct btrfs_trans_handle *trans; 9350 u64 alloc_flags; 9351 int ret; 9352 9353 again: 9354 trans = btrfs_join_transaction(root); 9355 if (IS_ERR(trans)) 9356 return PTR_ERR(trans); 9357 9358 /* 9359 * we're not allowed to set block groups readonly after the dirty 9360 * block groups cache has started writing. If it already started, 9361 * back off and let this transaction commit 9362 */ 9363 mutex_lock(&root->fs_info->ro_block_group_mutex); 9364 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9365 u64 transid = trans->transid; 9366 9367 mutex_unlock(&root->fs_info->ro_block_group_mutex); 9368 btrfs_end_transaction(trans, root); 9369 9370 ret = btrfs_wait_for_commit(root, transid); 9371 if (ret) 9372 return ret; 9373 goto again; 9374 } 9375 9376 /* 9377 * if we are changing raid levels, try to allocate a corresponding 9378 * block group with the new raid level. 9379 */ 9380 alloc_flags = update_block_group_flags(root, cache->flags); 9381 if (alloc_flags != cache->flags) { 9382 ret = do_chunk_alloc(trans, root, alloc_flags, 9383 CHUNK_ALLOC_FORCE); 9384 /* 9385 * ENOSPC is allowed here, we may have enough space 9386 * already allocated at the new raid level to 9387 * carry on 9388 */ 9389 if (ret == -ENOSPC) 9390 ret = 0; 9391 if (ret < 0) 9392 goto out; 9393 } 9394 9395 ret = inc_block_group_ro(cache, 0); 9396 if (!ret) 9397 goto out; 9398 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 9399 ret = do_chunk_alloc(trans, root, alloc_flags, 9400 CHUNK_ALLOC_FORCE); 9401 if (ret < 0) 9402 goto out; 9403 ret = inc_block_group_ro(cache, 0); 9404 out: 9405 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9406 alloc_flags = update_block_group_flags(root, cache->flags); 9407 lock_chunks(root->fs_info->chunk_root); 9408 check_system_chunk(trans, root, alloc_flags); 9409 unlock_chunks(root->fs_info->chunk_root); 9410 } 9411 mutex_unlock(&root->fs_info->ro_block_group_mutex); 9412 9413 btrfs_end_transaction(trans, root); 9414 return ret; 9415 } 9416 9417 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 9418 struct btrfs_root *root, u64 type) 9419 { 9420 u64 alloc_flags = get_alloc_profile(root, type); 9421 return do_chunk_alloc(trans, root, alloc_flags, 9422 CHUNK_ALLOC_FORCE); 9423 } 9424 9425 /* 9426 * helper to account the unused space of all the readonly block group in the 9427 * space_info. takes mirrors into account. 
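* (RAID1, RAID10 and DUP keep two copies, so their unused bytes are counted twice below)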
9428 */ 9429 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9430 { 9431 struct btrfs_block_group_cache *block_group; 9432 u64 free_bytes = 0; 9433 int factor; 9434 9435 /* It's df, we don't care if it's racy */ 9436 if (list_empty(&sinfo->ro_bgs)) 9437 return 0; 9438 9439 spin_lock(&sinfo->lock); 9440 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9441 spin_lock(&block_group->lock); 9442 9443 if (!block_group->ro) { 9444 spin_unlock(&block_group->lock); 9445 continue; 9446 } 9447 9448 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 9449 BTRFS_BLOCK_GROUP_RAID10 | 9450 BTRFS_BLOCK_GROUP_DUP)) 9451 factor = 2; 9452 else 9453 factor = 1; 9454 9455 free_bytes += (block_group->key.offset - 9456 btrfs_block_group_used(&block_group->item)) * 9457 factor; 9458 9459 spin_unlock(&block_group->lock); 9460 } 9461 spin_unlock(&sinfo->lock); 9462 9463 return free_bytes; 9464 } 9465 9466 void btrfs_dec_block_group_ro(struct btrfs_root *root, 9467 struct btrfs_block_group_cache *cache) 9468 { 9469 struct btrfs_space_info *sinfo = cache->space_info; 9470 u64 num_bytes; 9471 9472 BUG_ON(!cache->ro); 9473 9474 spin_lock(&sinfo->lock); 9475 spin_lock(&cache->lock); 9476 if (!--cache->ro) { 9477 num_bytes = cache->key.offset - cache->reserved - 9478 cache->pinned - cache->bytes_super - 9479 btrfs_block_group_used(&cache->item); 9480 sinfo->bytes_readonly -= num_bytes; 9481 list_del_init(&cache->ro_list); 9482 } 9483 spin_unlock(&cache->lock); 9484 spin_unlock(&sinfo->lock); 9485 } 9486 9487 /* 9488 * checks to see if its even possible to relocate this block group. 9489 * 9490 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9491 * ok to go ahead and try. 9492 */ 9493 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 9494 { 9495 struct btrfs_block_group_cache *block_group; 9496 struct btrfs_space_info *space_info; 9497 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 9498 struct btrfs_device *device; 9499 struct btrfs_trans_handle *trans; 9500 u64 min_free; 9501 u64 dev_min = 1; 9502 u64 dev_nr = 0; 9503 u64 target; 9504 int debug; 9505 int index; 9506 int full = 0; 9507 int ret = 0; 9508 9509 debug = btrfs_test_opt(root, ENOSPC_DEBUG); 9510 9511 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 9512 9513 /* odd, couldn't find the block group, leave it alone */ 9514 if (!block_group) { 9515 if (debug) 9516 btrfs_warn(root->fs_info, 9517 "can't find block group for bytenr %llu", 9518 bytenr); 9519 return -1; 9520 } 9521 9522 min_free = btrfs_block_group_used(&block_group->item); 9523 9524 /* no bytes used, we're good */ 9525 if (!min_free) 9526 goto out; 9527 9528 space_info = block_group->space_info; 9529 spin_lock(&space_info->lock); 9530 9531 full = space_info->full; 9532 9533 /* 9534 * if this is the last block group we have in this space, we can't 9535 * relocate it unless we're able to allocate a new chunk below. 9536 * 9537 * Otherwise, we need to make sure we have room in the space to handle 9538 * all of the extents from this block group. 
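* (i.e. used + reserved + pinned + readonly plus this group's used bytes must still fit under total_bytes)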
If we can, we're good 9539 */ 9540 if ((space_info->total_bytes != block_group->key.offset) && 9541 (space_info->bytes_used + space_info->bytes_reserved + 9542 space_info->bytes_pinned + space_info->bytes_readonly + 9543 min_free < space_info->total_bytes)) { 9544 spin_unlock(&space_info->lock); 9545 goto out; 9546 } 9547 spin_unlock(&space_info->lock); 9548 9549 /* 9550 * ok we don't have enough space, but maybe we have free space on our 9551 * devices to allocate new chunks for relocation, so loop through our 9552 * alloc devices and guess if we have enough space. if this block 9553 * group is going to be restriped, run checks against the target 9554 * profile instead of the current one. 9555 */ 9556 ret = -1; 9557 9558 /* 9559 * index: 9560 * 0: raid10 9561 * 1: raid1 9562 * 2: dup 9563 * 3: raid0 9564 * 4: single 9565 */ 9566 target = get_restripe_target(root->fs_info, block_group->flags); 9567 if (target) { 9568 index = __get_raid_index(extended_to_chunk(target)); 9569 } else { 9570 /* 9571 * this is just a balance, so if we were marked as full 9572 * we know there is no space for a new chunk 9573 */ 9574 if (full) { 9575 if (debug) 9576 btrfs_warn(root->fs_info, 9577 "no space to alloc new chunk for block group %llu", 9578 block_group->key.objectid); 9579 goto out; 9580 } 9581 9582 index = get_block_group_index(block_group); 9583 } 9584 9585 if (index == BTRFS_RAID_RAID10) { 9586 dev_min = 4; 9587 /* Divide by 2 */ 9588 min_free >>= 1; 9589 } else if (index == BTRFS_RAID_RAID1) { 9590 dev_min = 2; 9591 } else if (index == BTRFS_RAID_DUP) { 9592 /* Multiply by 2 */ 9593 min_free <<= 1; 9594 } else if (index == BTRFS_RAID_RAID0) { 9595 dev_min = fs_devices->rw_devices; 9596 min_free = div64_u64(min_free, dev_min); 9597 } 9598 9599 /* We need to do this so that we can look at pending chunks */ 9600 trans = btrfs_join_transaction(root); 9601 if (IS_ERR(trans)) { 9602 ret = PTR_ERR(trans); 9603 goto out; 9604 } 9605 9606 mutex_lock(&root->fs_info->chunk_mutex); 9607 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 9608 u64 dev_offset; 9609 9610 /* 9611 * check to make sure we can actually find a chunk with enough 9612 * space to fit our block group in. 
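* A device only counts if it isn't a dev-replace target and still has more than min_free bytes unallocated.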
9613 */ 9614 if (device->total_bytes > device->bytes_used + min_free && 9615 !device->is_tgtdev_for_dev_replace) { 9616 ret = find_free_dev_extent(trans, device, min_free, 9617 &dev_offset, NULL); 9618 if (!ret) 9619 dev_nr++; 9620 9621 if (dev_nr >= dev_min) 9622 break; 9623 9624 ret = -1; 9625 } 9626 } 9627 if (debug && ret == -1) 9628 btrfs_warn(root->fs_info, 9629 "no space to allocate a new chunk for block group %llu", 9630 block_group->key.objectid); 9631 mutex_unlock(&root->fs_info->chunk_mutex); 9632 btrfs_end_transaction(trans, root); 9633 out: 9634 btrfs_put_block_group(block_group); 9635 return ret; 9636 } 9637 9638 static int find_first_block_group(struct btrfs_root *root, 9639 struct btrfs_path *path, struct btrfs_key *key) 9640 { 9641 int ret = 0; 9642 struct btrfs_key found_key; 9643 struct extent_buffer *leaf; 9644 int slot; 9645 9646 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 9647 if (ret < 0) 9648 goto out; 9649 9650 while (1) { 9651 slot = path->slots[0]; 9652 leaf = path->nodes[0]; 9653 if (slot >= btrfs_header_nritems(leaf)) { 9654 ret = btrfs_next_leaf(root, path); 9655 if (ret == 0) 9656 continue; 9657 if (ret < 0) 9658 goto out; 9659 break; 9660 } 9661 btrfs_item_key_to_cpu(leaf, &found_key, slot); 9662 9663 if (found_key.objectid >= key->objectid && 9664 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 9665 ret = 0; 9666 goto out; 9667 } 9668 path->slots[0]++; 9669 } 9670 out: 9671 return ret; 9672 } 9673 9674 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 9675 { 9676 struct btrfs_block_group_cache *block_group; 9677 u64 last = 0; 9678 9679 while (1) { 9680 struct inode *inode; 9681 9682 block_group = btrfs_lookup_first_block_group(info, last); 9683 while (block_group) { 9684 spin_lock(&block_group->lock); 9685 if (block_group->iref) 9686 break; 9687 spin_unlock(&block_group->lock); 9688 block_group = next_block_group(info->tree_root, 9689 block_group); 9690 } 9691 if (!block_group) { 9692 if (last == 0) 9693 break; 9694 last = 0; 9695 continue; 9696 } 9697 9698 inode = block_group->inode; 9699 block_group->iref = 0; 9700 block_group->inode = NULL; 9701 spin_unlock(&block_group->lock); 9702 iput(inode); 9703 last = block_group->key.objectid + block_group->key.offset; 9704 btrfs_put_block_group(block_group); 9705 } 9706 } 9707 9708 int btrfs_free_block_groups(struct btrfs_fs_info *info) 9709 { 9710 struct btrfs_block_group_cache *block_group; 9711 struct btrfs_space_info *space_info; 9712 struct btrfs_caching_control *caching_ctl; 9713 struct rb_node *n; 9714 9715 down_write(&info->commit_root_sem); 9716 while (!list_empty(&info->caching_block_groups)) { 9717 caching_ctl = list_entry(info->caching_block_groups.next, 9718 struct btrfs_caching_control, list); 9719 list_del(&caching_ctl->list); 9720 put_caching_control(caching_ctl); 9721 } 9722 up_write(&info->commit_root_sem); 9723 9724 spin_lock(&info->unused_bgs_lock); 9725 while (!list_empty(&info->unused_bgs)) { 9726 block_group = list_first_entry(&info->unused_bgs, 9727 struct btrfs_block_group_cache, 9728 bg_list); 9729 list_del_init(&block_group->bg_list); 9730 btrfs_put_block_group(block_group); 9731 } 9732 spin_unlock(&info->unused_bgs_lock); 9733 9734 spin_lock(&info->block_group_cache_lock); 9735 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 9736 block_group = rb_entry(n, struct btrfs_block_group_cache, 9737 cache_node); 9738 rb_erase(&block_group->cache_node, 9739 &info->block_group_cache_tree); 9740 RB_CLEAR_NODE(&block_group->cache_node); 9741 
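	/*
	 * Drop the cache lock for the teardown below: waiting for caching to
	 * finish and tearing down the free space cache can block, and the
	 * lock is re-taken before grabbing the next group from the rbtree.
	 */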
spin_unlock(&info->block_group_cache_lock); 9742 9743 down_write(&block_group->space_info->groups_sem); 9744 list_del(&block_group->list); 9745 up_write(&block_group->space_info->groups_sem); 9746 9747 if (block_group->cached == BTRFS_CACHE_STARTED) 9748 wait_block_group_cache_done(block_group); 9749 9750 /* 9751 * We haven't cached this block group, which means we could 9752 * possibly have excluded extents on this block group. 9753 */ 9754 if (block_group->cached == BTRFS_CACHE_NO || 9755 block_group->cached == BTRFS_CACHE_ERROR) 9756 free_excluded_extents(info->extent_root, block_group); 9757 9758 btrfs_remove_free_space_cache(block_group); 9759 btrfs_put_block_group(block_group); 9760 9761 spin_lock(&info->block_group_cache_lock); 9762 } 9763 spin_unlock(&info->block_group_cache_lock); 9764 9765 /* now that all the block groups are freed, go through and 9766 * free all the space_info structs. This is only called during 9767 * the final stages of unmount, and so we know nobody is 9768 * using them. We call synchronize_rcu() once before we start, 9769 * just to be on the safe side. 9770 */ 9771 synchronize_rcu(); 9772 9773 release_global_block_rsv(info); 9774 9775 while (!list_empty(&info->space_info)) { 9776 int i; 9777 9778 space_info = list_entry(info->space_info.next, 9779 struct btrfs_space_info, 9780 list); 9781 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { 9782 if (WARN_ON(space_info->bytes_pinned > 0 || 9783 space_info->bytes_reserved > 0 || 9784 space_info->bytes_may_use > 0)) { 9785 dump_space_info(space_info, 0, 0); 9786 } 9787 } 9788 list_del(&space_info->list); 9789 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 9790 struct kobject *kobj; 9791 kobj = space_info->block_group_kobjs[i]; 9792 space_info->block_group_kobjs[i] = NULL; 9793 if (kobj) { 9794 kobject_del(kobj); 9795 kobject_put(kobj); 9796 } 9797 } 9798 kobject_del(&space_info->kobj); 9799 kobject_put(&space_info->kobj); 9800 } 9801 return 0; 9802 } 9803 9804 static void __link_block_group(struct btrfs_space_info *space_info, 9805 struct btrfs_block_group_cache *cache) 9806 { 9807 int index = get_block_group_index(cache); 9808 bool first = false; 9809 9810 down_write(&space_info->groups_sem); 9811 if (list_empty(&space_info->block_groups[index])) 9812 first = true; 9813 list_add_tail(&cache->list, &space_info->block_groups[index]); 9814 up_write(&space_info->groups_sem); 9815 9816 if (first) { 9817 struct raid_kobject *rkobj; 9818 int ret; 9819 9820 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 9821 if (!rkobj) 9822 goto out_err; 9823 rkobj->raid_type = index; 9824 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 9825 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 9826 "%s", get_raid_name(index)); 9827 if (ret) { 9828 kobject_put(&rkobj->kobj); 9829 goto out_err; 9830 } 9831 space_info->block_group_kobjs[index] = &rkobj->kobj; 9832 } 9833 9834 return; 9835 out_err: 9836 pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n"); 9837 } 9838 9839 static struct btrfs_block_group_cache * 9840 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) 9841 { 9842 struct btrfs_block_group_cache *cache; 9843 9844 cache = kzalloc(sizeof(*cache), GFP_NOFS); 9845 if (!cache) 9846 return NULL; 9847 9848 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 9849 GFP_NOFS); 9850 if (!cache->free_space_ctl) { 9851 kfree(cache); 9852 return NULL; 9853 } 9854 9855 cache->key.objectid = start; 9856 cache->key.offset = size; 9857 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9858 9859 cache->sectorsize = root->sectorsize; 9860 cache->fs_info = root->fs_info; 9861 cache->full_stripe_len = btrfs_full_stripe_len(root, 9862 &root->fs_info->mapping_tree, 9863 start); 9864 set_free_space_tree_thresholds(cache); 9865 9866 atomic_set(&cache->count, 1); 9867 spin_lock_init(&cache->lock); 9868 init_rwsem(&cache->data_rwsem); 9869 INIT_LIST_HEAD(&cache->list); 9870 INIT_LIST_HEAD(&cache->cluster_list); 9871 INIT_LIST_HEAD(&cache->bg_list); 9872 INIT_LIST_HEAD(&cache->ro_list); 9873 INIT_LIST_HEAD(&cache->dirty_list); 9874 INIT_LIST_HEAD(&cache->io_list); 9875 btrfs_init_free_space_ctl(cache); 9876 atomic_set(&cache->trimming, 0); 9877 mutex_init(&cache->free_space_lock); 9878 9879 return cache; 9880 } 9881 9882 int btrfs_read_block_groups(struct btrfs_root *root) 9883 { 9884 struct btrfs_path *path; 9885 int ret; 9886 struct btrfs_block_group_cache *cache; 9887 struct btrfs_fs_info *info = root->fs_info; 9888 struct btrfs_space_info *space_info; 9889 struct btrfs_key key; 9890 struct btrfs_key found_key; 9891 struct extent_buffer *leaf; 9892 int need_clear = 0; 9893 u64 cache_gen; 9894 9895 root = info->extent_root; 9896 key.objectid = 0; 9897 key.offset = 0; 9898 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9899 path = btrfs_alloc_path(); 9900 if (!path) 9901 return -ENOMEM; 9902 path->reada = READA_FORWARD; 9903 9904 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 9905 if (btrfs_test_opt(root, SPACE_CACHE) && 9906 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 9907 need_clear = 1; 9908 if (btrfs_test_opt(root, CLEAR_CACHE)) 9909 need_clear = 1; 9910 9911 while (1) { 9912 ret = find_first_block_group(root, path, &key); 9913 if (ret > 0) 9914 break; 9915 if (ret != 0) 9916 goto error; 9917 9918 leaf = path->nodes[0]; 9919 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 9920 9921 cache = btrfs_create_block_group_cache(root, found_key.objectid, 9922 found_key.offset); 9923 if (!cache) { 9924 ret = -ENOMEM; 9925 goto error; 9926 } 9927 9928 if (need_clear) { 9929 /* 9930 * When we mount with old space cache, we need to 9931 * set BTRFS_DC_CLEAR and set dirty flag. 9932 * 9933 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 9934 * truncate the old free space cache inode and 9935 * setup a new one. 9936 * b) Setting 'dirty flag' makes sure that we flush 9937 * the new space cache info onto disk. 9938 */ 9939 if (btrfs_test_opt(root, SPACE_CACHE)) 9940 cache->disk_cache_state = BTRFS_DC_CLEAR; 9941 } 9942 9943 read_extent_buffer(leaf, &cache->item, 9944 btrfs_item_ptr_offset(leaf, path->slots[0]), 9945 sizeof(cache->item)); 9946 cache->flags = btrfs_block_group_flags(&cache->item); 9947 9948 key.objectid = found_key.objectid + found_key.offset; 9949 btrfs_release_path(path); 9950 9951 /* 9952 * We need to exclude the super stripes now so that the space 9953 * info has super bytes accounted for, otherwise we'll think 9954 * we have more space than we actually do. 
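 * (the excluded bytes are tracked in cache->bytes_super and added to the
 * space_info's bytes_readonly further down)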
9955 */ 9956 ret = exclude_super_stripes(root, cache); 9957 if (ret) { 9958 /* 9959 * We may have excluded something, so call this just in 9960 * case. 9961 */ 9962 free_excluded_extents(root, cache); 9963 btrfs_put_block_group(cache); 9964 goto error; 9965 } 9966 9967 /* 9968 * check for two cases, either we are full, and therefore 9969 * don't need to bother with the caching work since we won't 9970 * find any space, or we are empty, and we can just add all 9971 * the space in and be done with it. This saves us _alot_ of 9972 * time, particularly in the full case. 9973 */ 9974 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 9975 cache->last_byte_to_unpin = (u64)-1; 9976 cache->cached = BTRFS_CACHE_FINISHED; 9977 free_excluded_extents(root, cache); 9978 } else if (btrfs_block_group_used(&cache->item) == 0) { 9979 cache->last_byte_to_unpin = (u64)-1; 9980 cache->cached = BTRFS_CACHE_FINISHED; 9981 add_new_free_space(cache, root->fs_info, 9982 found_key.objectid, 9983 found_key.objectid + 9984 found_key.offset); 9985 free_excluded_extents(root, cache); 9986 } 9987 9988 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9989 if (ret) { 9990 btrfs_remove_free_space_cache(cache); 9991 btrfs_put_block_group(cache); 9992 goto error; 9993 } 9994 9995 ret = update_space_info(info, cache->flags, found_key.offset, 9996 btrfs_block_group_used(&cache->item), 9997 &space_info); 9998 if (ret) { 9999 btrfs_remove_free_space_cache(cache); 10000 spin_lock(&info->block_group_cache_lock); 10001 rb_erase(&cache->cache_node, 10002 &info->block_group_cache_tree); 10003 RB_CLEAR_NODE(&cache->cache_node); 10004 spin_unlock(&info->block_group_cache_lock); 10005 btrfs_put_block_group(cache); 10006 goto error; 10007 } 10008 10009 cache->space_info = space_info; 10010 spin_lock(&cache->space_info->lock); 10011 cache->space_info->bytes_readonly += cache->bytes_super; 10012 spin_unlock(&cache->space_info->lock); 10013 10014 __link_block_group(space_info, cache); 10015 10016 set_avail_alloc_bits(root->fs_info, cache->flags); 10017 if (btrfs_chunk_readonly(root, cache->key.objectid)) { 10018 inc_block_group_ro(cache, 1); 10019 } else if (btrfs_block_group_used(&cache->item) == 0) { 10020 spin_lock(&info->unused_bgs_lock); 10021 /* Should always be true but just in case. */ 10022 if (list_empty(&cache->bg_list)) { 10023 btrfs_get_block_group(cache); 10024 list_add_tail(&cache->bg_list, 10025 &info->unused_bgs); 10026 } 10027 spin_unlock(&info->unused_bgs_lock); 10028 } 10029 } 10030 10031 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 10032 if (!(get_alloc_profile(root, space_info->flags) & 10033 (BTRFS_BLOCK_GROUP_RAID10 | 10034 BTRFS_BLOCK_GROUP_RAID1 | 10035 BTRFS_BLOCK_GROUP_RAID5 | 10036 BTRFS_BLOCK_GROUP_RAID6 | 10037 BTRFS_BLOCK_GROUP_DUP))) 10038 continue; 10039 /* 10040 * avoid allocating from un-mirrored block group if there are 10041 * mirrored block groups. 
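 * (done below by flipping any raid0 and single block groups read-only)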
10042 */ 10043 list_for_each_entry(cache, 10044 &space_info->block_groups[BTRFS_RAID_RAID0], 10045 list) 10046 inc_block_group_ro(cache, 1); 10047 list_for_each_entry(cache, 10048 &space_info->block_groups[BTRFS_RAID_SINGLE], 10049 list) 10050 inc_block_group_ro(cache, 1); 10051 } 10052 10053 init_global_block_rsv(info); 10054 ret = 0; 10055 error: 10056 btrfs_free_path(path); 10057 return ret; 10058 } 10059 10060 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 10061 struct btrfs_root *root) 10062 { 10063 struct btrfs_block_group_cache *block_group, *tmp; 10064 struct btrfs_root *extent_root = root->fs_info->extent_root; 10065 struct btrfs_block_group_item item; 10066 struct btrfs_key key; 10067 int ret = 0; 10068 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 10069 10070 trans->can_flush_pending_bgs = false; 10071 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 10072 if (ret) 10073 goto next; 10074 10075 spin_lock(&block_group->lock); 10076 memcpy(&item, &block_group->item, sizeof(item)); 10077 memcpy(&key, &block_group->key, sizeof(key)); 10078 spin_unlock(&block_group->lock); 10079 10080 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10081 sizeof(item)); 10082 if (ret) 10083 btrfs_abort_transaction(trans, extent_root, ret); 10084 ret = btrfs_finish_chunk_alloc(trans, extent_root, 10085 key.objectid, key.offset); 10086 if (ret) 10087 btrfs_abort_transaction(trans, extent_root, ret); 10088 add_block_group_free_space(trans, root->fs_info, block_group); 10089 /* already aborted the transaction if it failed. */ 10090 next: 10091 list_del_init(&block_group->bg_list); 10092 } 10093 trans->can_flush_pending_bgs = can_flush_pending_bgs; 10094 } 10095 10096 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 10097 struct btrfs_root *root, u64 bytes_used, 10098 u64 type, u64 chunk_objectid, u64 chunk_offset, 10099 u64 size) 10100 { 10101 int ret; 10102 struct btrfs_root *extent_root; 10103 struct btrfs_block_group_cache *cache; 10104 10105 extent_root = root->fs_info->extent_root; 10106 10107 btrfs_set_log_full_commit(root->fs_info, trans); 10108 10109 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 10110 if (!cache) 10111 return -ENOMEM; 10112 10113 btrfs_set_block_group_used(&cache->item, bytes_used); 10114 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 10115 btrfs_set_block_group_flags(&cache->item, type); 10116 10117 cache->flags = type; 10118 cache->last_byte_to_unpin = (u64)-1; 10119 cache->cached = BTRFS_CACHE_FINISHED; 10120 cache->needs_free_space = 1; 10121 ret = exclude_super_stripes(root, cache); 10122 if (ret) { 10123 /* 10124 * We may have excluded something, so call this just in 10125 * case. 10126 */ 10127 free_excluded_extents(root, cache); 10128 btrfs_put_block_group(cache); 10129 return ret; 10130 } 10131 10132 add_new_free_space(cache, root->fs_info, chunk_offset, 10133 chunk_offset + size); 10134 10135 free_excluded_extents(root, cache); 10136 10137 #ifdef CONFIG_BTRFS_DEBUG 10138 if (btrfs_should_fragment_free_space(root, cache)) { 10139 u64 new_bytes_used = size - bytes_used; 10140 10141 bytes_used += new_bytes_used >> 1; 10142 fragment_free_space(root, cache); 10143 } 10144 #endif 10145 /* 10146 * Call to ensure the corresponding space_info object is created and 10147 * assigned to our block group, but don't update its counters just yet. 10148 * We want our bg to be added to the rbtree with its ->space_info set. 
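 * (the second update_space_info() call below, made once the group is in
 * the rbtree, supplies the real size and bytes_used)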
10149 */ 10150 ret = update_space_info(root->fs_info, cache->flags, 0, 0, 10151 &cache->space_info); 10152 if (ret) { 10153 btrfs_remove_free_space_cache(cache); 10154 btrfs_put_block_group(cache); 10155 return ret; 10156 } 10157 10158 ret = btrfs_add_block_group_cache(root->fs_info, cache); 10159 if (ret) { 10160 btrfs_remove_free_space_cache(cache); 10161 btrfs_put_block_group(cache); 10162 return ret; 10163 } 10164 10165 /* 10166 * Now that our block group has its ->space_info set and is inserted in 10167 * the rbtree, update the space info's counters. 10168 */ 10169 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 10170 &cache->space_info); 10171 if (ret) { 10172 btrfs_remove_free_space_cache(cache); 10173 spin_lock(&root->fs_info->block_group_cache_lock); 10174 rb_erase(&cache->cache_node, 10175 &root->fs_info->block_group_cache_tree); 10176 RB_CLEAR_NODE(&cache->cache_node); 10177 spin_unlock(&root->fs_info->block_group_cache_lock); 10178 btrfs_put_block_group(cache); 10179 return ret; 10180 } 10181 update_global_block_rsv(root->fs_info); 10182 10183 spin_lock(&cache->space_info->lock); 10184 cache->space_info->bytes_readonly += cache->bytes_super; 10185 spin_unlock(&cache->space_info->lock); 10186 10187 __link_block_group(cache->space_info, cache); 10188 10189 list_add_tail(&cache->bg_list, &trans->new_bgs); 10190 10191 set_avail_alloc_bits(extent_root->fs_info, type); 10192 10193 return 0; 10194 } 10195 10196 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 10197 { 10198 u64 extra_flags = chunk_to_extended(flags) & 10199 BTRFS_EXTENDED_PROFILE_MASK; 10200 10201 write_seqlock(&fs_info->profiles_lock); 10202 if (flags & BTRFS_BLOCK_GROUP_DATA) 10203 fs_info->avail_data_alloc_bits &= ~extra_flags; 10204 if (flags & BTRFS_BLOCK_GROUP_METADATA) 10205 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 10206 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 10207 fs_info->avail_system_alloc_bits &= ~extra_flags; 10208 write_sequnlock(&fs_info->profiles_lock); 10209 } 10210 10211 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 10212 struct btrfs_root *root, u64 group_start, 10213 struct extent_map *em) 10214 { 10215 struct btrfs_path *path; 10216 struct btrfs_block_group_cache *block_group; 10217 struct btrfs_free_cluster *cluster; 10218 struct btrfs_root *tree_root = root->fs_info->tree_root; 10219 struct btrfs_key key; 10220 struct inode *inode; 10221 struct kobject *kobj = NULL; 10222 int ret; 10223 int index; 10224 int factor; 10225 struct btrfs_caching_control *caching_ctl = NULL; 10226 bool remove_em; 10227 10228 root = root->fs_info->extent_root; 10229 10230 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 10231 BUG_ON(!block_group); 10232 BUG_ON(!block_group->ro); 10233 10234 /* 10235 * Free the reserved super bytes from this block group before 10236 * remove it. 
	 */
	free_excluded_extents(root, block_group);

	memcpy(&key, &block_group->key, sizeof(key));
	index = get_block_group_index(block_group);
	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
				  BTRFS_BLOCK_GROUP_RAID1 |
				  BTRFS_BLOCK_GROUP_RAID10))
		factor = 2;
	else
		factor = 1;

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &root->fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &root->fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(tree_root, block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(root, trans, block_group,
				    &block_group->io_ctl, path,
				    block_group->key.objectid);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	if (!IS_ERR(inode)) {
		ret = btrfs_orphan_add(trans, inode);
		if (ret) {
			btrfs_add_delayed_iput(inode);
			goto out;
		}
		clear_nlink(inode);
		/* One for the block groups ref */
		spin_lock(&block_group->lock);
		if (block_group->iref) {
			block_group->iref = 0;
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		/* One for our lookup ref */
		btrfs_add_delayed_iput(inode);
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0)
		btrfs_release_path(path);
	if (ret == 0) {
		ret = btrfs_del_item(trans, tree_root, path);
		if (ret)
			goto out;
		btrfs_release_path(path);
	}

	spin_lock(&root->fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &root->fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	if (root->fs_info->first_logical_byte == block_group->key.objectid)
		root->fs_info->first_logical_byte = (u64)-1;
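	/*
	 * The group can no longer be found through the block group cache
	 * rbtree; anyone still holding it got their reference before the
	 * rb_erase above.
	 */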
	spin_unlock(&root->fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(root->fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		down_write(&root->fs_info->commit_root_sem);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
				    &root->fs_info->caching_block_groups, list)
				if (ctl->block_group == block_group) {
					caching_ctl = ctl;
					atomic_inc(&caching_ctl->count);
					break;
				}
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		up_write(&root->fs_info->commit_root_sem);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			put_caching_control(caching_ctl);
			put_caching_control(caching_ctl);
		}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->key.offset);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->key.offset);
		WARN_ON(block_group->space_info->disk_total
			< block_group->key.offset * factor);
	}
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	block_group->space_info->disk_total -= block_group->key.offset * factor;

	spin_unlock(&block_group->space_info->lock);

	memcpy(&key, &block_group->key, sizeof(key));

	lock_chunks(root);
	if (!list_empty(&em->list)) {
		/* We're in the transaction->pending_chunks list. */
		free_extent_map(em);
	}
	spin_lock(&block_group->lock);
	block_group->removed = 1;
	/*
	 * At this point trimming can't start on this block group, because we
	 * removed the block group from the fs_info->block_group_cache_tree
	 * rbtree, so no one can find it anymore, and even if someone already
	 * got this block group before we removed it from the rbtree, they
	 * have already incremented block_group->trimming - if they didn't,
	 * they won't find any free space entries because we already removed
	 * them all when we called btrfs_remove_free_space_cache().
10436 * 10437 * And we must not remove the extent map from the fs_info->mapping_tree 10438 * to prevent the same logical address range and physical device space 10439 * ranges from being reused for a new block group. This is because our 10440 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10441 * completely transactionless, so while it is trimming a range the 10442 * currently running transaction might finish and a new one start, 10443 * allowing for new block groups to be created that can reuse the same 10444 * physical device locations unless we take this special care. 10445 * 10446 * There may also be an implicit trim operation if the file system 10447 * is mounted with -odiscard. The same protections must remain 10448 * in place until the extents have been discarded completely when 10449 * the transaction commit has completed. 10450 */ 10451 remove_em = (atomic_read(&block_group->trimming) == 0); 10452 /* 10453 * Make sure a trimmer task always sees the em in the pinned_chunks list 10454 * if it sees block_group->removed == 1 (needs to lock block_group->lock 10455 * before checking block_group->removed). 10456 */ 10457 if (!remove_em) { 10458 /* 10459 * Our em might be in trans->transaction->pending_chunks which 10460 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 10461 * and so is the fs_info->pinned_chunks list. 10462 * 10463 * So at this point we must be holding the chunk_mutex to avoid 10464 * any races with chunk allocation (more specifically at 10465 * volumes.c:contains_pending_extent()), to ensure it always 10466 * sees the em, either in the pending_chunks list or in the 10467 * pinned_chunks list. 10468 */ 10469 list_move_tail(&em->list, &root->fs_info->pinned_chunks); 10470 } 10471 spin_unlock(&block_group->lock); 10472 10473 if (remove_em) { 10474 struct extent_map_tree *em_tree; 10475 10476 em_tree = &root->fs_info->mapping_tree.map_tree; 10477 write_lock(&em_tree->lock); 10478 /* 10479 * The em might be in the pending_chunks list, so make sure the 10480 * chunk mutex is locked, since remove_extent_mapping() will 10481 * delete us from that list. 
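 * (lock_chunks() taken further up is still held here and is only released
 * after the extent map has been handled)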
10482 */ 10483 remove_extent_mapping(em_tree, em); 10484 write_unlock(&em_tree->lock); 10485 /* once for the tree */ 10486 free_extent_map(em); 10487 } 10488 10489 unlock_chunks(root); 10490 10491 ret = remove_block_group_free_space(trans, root->fs_info, block_group); 10492 if (ret) 10493 goto out; 10494 10495 btrfs_put_block_group(block_group); 10496 btrfs_put_block_group(block_group); 10497 10498 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 10499 if (ret > 0) 10500 ret = -EIO; 10501 if (ret < 0) 10502 goto out; 10503 10504 ret = btrfs_del_item(trans, root, path); 10505 out: 10506 btrfs_free_path(path); 10507 return ret; 10508 } 10509 10510 struct btrfs_trans_handle * 10511 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 10512 const u64 chunk_offset) 10513 { 10514 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10515 struct extent_map *em; 10516 struct map_lookup *map; 10517 unsigned int num_items; 10518 10519 read_lock(&em_tree->lock); 10520 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 10521 read_unlock(&em_tree->lock); 10522 ASSERT(em && em->start == chunk_offset); 10523 10524 /* 10525 * We need to reserve 3 + N units from the metadata space info in order 10526 * to remove a block group (done at btrfs_remove_chunk() and at 10527 * btrfs_remove_block_group()), which are used for: 10528 * 10529 * 1 unit for adding the free space inode's orphan (located in the tree 10530 * of tree roots). 10531 * 1 unit for deleting the block group item (located in the extent 10532 * tree). 10533 * 1 unit for deleting the free space item (located in tree of tree 10534 * roots). 10535 * N units for deleting N device extent items corresponding to each 10536 * stripe (located in the device tree). 10537 * 10538 * In order to remove a block group we also need to reserve units in the 10539 * system space info in order to update the chunk tree (update one or 10540 * more device items and remove one chunk item), but this is done at 10541 * btrfs_remove_chunk() through a call to check_system_chunk(). 10542 */ 10543 map = em->map_lookup; 10544 num_items = 3 + map->num_stripes; 10545 free_extent_map(em); 10546 10547 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10548 num_items, 1); 10549 } 10550 10551 /* 10552 * Process the unused_bgs list and remove any that don't have any allocated 10553 * space inside of them. 
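 * (block groups are queued on fs_info->unused_bgs once their used byte
 * count drops to zero; this is typically driven from the cleaner thread)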
10554 */ 10555 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 10556 { 10557 struct btrfs_block_group_cache *block_group; 10558 struct btrfs_space_info *space_info; 10559 struct btrfs_root *root = fs_info->extent_root; 10560 struct btrfs_trans_handle *trans; 10561 int ret = 0; 10562 10563 if (!fs_info->open) 10564 return; 10565 10566 spin_lock(&fs_info->unused_bgs_lock); 10567 while (!list_empty(&fs_info->unused_bgs)) { 10568 u64 start, end; 10569 int trimming; 10570 10571 block_group = list_first_entry(&fs_info->unused_bgs, 10572 struct btrfs_block_group_cache, 10573 bg_list); 10574 list_del_init(&block_group->bg_list); 10575 10576 space_info = block_group->space_info; 10577 10578 if (ret || btrfs_mixed_space_info(space_info)) { 10579 btrfs_put_block_group(block_group); 10580 continue; 10581 } 10582 spin_unlock(&fs_info->unused_bgs_lock); 10583 10584 mutex_lock(&fs_info->delete_unused_bgs_mutex); 10585 10586 /* Don't want to race with allocators so take the groups_sem */ 10587 down_write(&space_info->groups_sem); 10588 spin_lock(&block_group->lock); 10589 if (block_group->reserved || 10590 btrfs_block_group_used(&block_group->item) || 10591 block_group->ro || 10592 list_is_singular(&block_group->list)) { 10593 /* 10594 * We want to bail if we made new allocations or have 10595 * outstanding allocations in this block group. We do 10596 * the ro check in case balance is currently acting on 10597 * this block group. 10598 */ 10599 spin_unlock(&block_group->lock); 10600 up_write(&space_info->groups_sem); 10601 goto next; 10602 } 10603 spin_unlock(&block_group->lock); 10604 10605 /* We don't want to force the issue, only flip if it's ok. */ 10606 ret = inc_block_group_ro(block_group, 0); 10607 up_write(&space_info->groups_sem); 10608 if (ret < 0) { 10609 ret = 0; 10610 goto next; 10611 } 10612 10613 /* 10614 * Want to do this before we do anything else so we can recover 10615 * properly if we fail to join the transaction. 10616 */ 10617 trans = btrfs_start_trans_remove_block_group(fs_info, 10618 block_group->key.objectid); 10619 if (IS_ERR(trans)) { 10620 btrfs_dec_block_group_ro(root, block_group); 10621 ret = PTR_ERR(trans); 10622 goto next; 10623 } 10624 10625 /* 10626 * We could have pending pinned extents for this block group, 10627 * just delete them, we don't care about them anymore. 10628 */ 10629 start = block_group->key.objectid; 10630 end = start + block_group->key.offset - 1; 10631 /* 10632 * Hold the unused_bg_unpin_mutex lock to avoid racing with 10633 * btrfs_finish_extent_commit(). If we are at transaction N, 10634 * another task might be running finish_extent_commit() for the 10635 * previous transaction N - 1, and have seen a range belonging 10636 * to the block group in freed_extents[] before we were able to 10637 * clear the whole block group range from freed_extents[]. This 10638 * means that task can lookup for the block group after we 10639 * unpinned it from freed_extents[] and removed it, leading to 10640 * a BUG_ON() at btrfs_unpin_extent_range(). 
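 * (that is why both freed_extents ranges are cleared below while
 * unused_bg_unpin_mutex is held)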
10641 */ 10642 mutex_lock(&fs_info->unused_bg_unpin_mutex); 10643 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 10644 EXTENT_DIRTY); 10645 if (ret) { 10646 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10647 btrfs_dec_block_group_ro(root, block_group); 10648 goto end_trans; 10649 } 10650 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 10651 EXTENT_DIRTY); 10652 if (ret) { 10653 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10654 btrfs_dec_block_group_ro(root, block_group); 10655 goto end_trans; 10656 } 10657 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10658 10659 /* Reset pinned so btrfs_put_block_group doesn't complain */ 10660 spin_lock(&space_info->lock); 10661 spin_lock(&block_group->lock); 10662 10663 space_info->bytes_pinned -= block_group->pinned; 10664 space_info->bytes_readonly += block_group->pinned; 10665 percpu_counter_add(&space_info->total_bytes_pinned, 10666 -block_group->pinned); 10667 block_group->pinned = 0; 10668 10669 spin_unlock(&block_group->lock); 10670 spin_unlock(&space_info->lock); 10671 10672 /* DISCARD can flip during remount */ 10673 trimming = btrfs_test_opt(root, DISCARD); 10674 10675 /* Implicit trim during transaction commit. */ 10676 if (trimming) 10677 btrfs_get_block_group_trimming(block_group); 10678 10679 /* 10680 * Btrfs_remove_chunk will abort the transaction if things go 10681 * horribly wrong. 10682 */ 10683 ret = btrfs_remove_chunk(trans, root, 10684 block_group->key.objectid); 10685 10686 if (ret) { 10687 if (trimming) 10688 btrfs_put_block_group_trimming(block_group); 10689 goto end_trans; 10690 } 10691 10692 /* 10693 * If we're not mounted with -odiscard, we can just forget 10694 * about this block group. Otherwise we'll need to wait 10695 * until transaction commit to do the actual discard. 10696 */ 10697 if (trimming) { 10698 spin_lock(&fs_info->unused_bgs_lock); 10699 /* 10700 * A concurrent scrub might have added us to the list 10701 * fs_info->unused_bgs, so use a list_move operation 10702 * to add the block group to the deleted_bgs list. 
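 * (the extra block group reference taken below keeps the group alive until
 * the deferred discard happens at transaction commit)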
10703 */ 10704 list_move(&block_group->bg_list, 10705 &trans->transaction->deleted_bgs); 10706 spin_unlock(&fs_info->unused_bgs_lock); 10707 btrfs_get_block_group(block_group); 10708 } 10709 end_trans: 10710 btrfs_end_transaction(trans, root); 10711 next: 10712 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 10713 btrfs_put_block_group(block_group); 10714 spin_lock(&fs_info->unused_bgs_lock); 10715 } 10716 spin_unlock(&fs_info->unused_bgs_lock); 10717 } 10718 10719 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 10720 { 10721 struct btrfs_space_info *space_info; 10722 struct btrfs_super_block *disk_super; 10723 u64 features; 10724 u64 flags; 10725 int mixed = 0; 10726 int ret; 10727 10728 disk_super = fs_info->super_copy; 10729 if (!btrfs_super_root(disk_super)) 10730 return -EINVAL; 10731 10732 features = btrfs_super_incompat_flags(disk_super); 10733 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 10734 mixed = 1; 10735 10736 flags = BTRFS_BLOCK_GROUP_SYSTEM; 10737 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 10738 if (ret) 10739 goto out; 10740 10741 if (mixed) { 10742 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 10743 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 10744 } else { 10745 flags = BTRFS_BLOCK_GROUP_METADATA; 10746 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 10747 if (ret) 10748 goto out; 10749 10750 flags = BTRFS_BLOCK_GROUP_DATA; 10751 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 10752 } 10753 out: 10754 return ret; 10755 } 10756 10757 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 10758 { 10759 return unpin_extent_range(root, start, end, false); 10760 } 10761 10762 /* 10763 * It used to be that old block groups would be left around forever. 10764 * Iterating over them would be enough to trim unused space. Since we 10765 * now automatically remove them, we also need to iterate over unallocated 10766 * space. 10767 * 10768 * We don't want a transaction for this since the discard may take a 10769 * substantial amount of time. We don't require that a transaction be 10770 * running, but we do need to take a running transaction into account 10771 * to ensure that we're not discarding chunks that were released in 10772 * the current transaction. 10773 * 10774 * Holding the chunks lock will prevent other threads from allocating 10775 * or releasing chunks, but it won't prevent a running transaction 10776 * from committing and releasing the memory that the pending chunks 10777 * list head uses. For that, we need to take a reference to the 10778 * transaction. 10779 */ 10780 static int btrfs_trim_free_extents(struct btrfs_device *device, 10781 u64 minlen, u64 *trimmed) 10782 { 10783 u64 start = 0, len = 0; 10784 int ret; 10785 10786 *trimmed = 0; 10787 10788 /* Not writeable = nothing to do. */ 10789 if (!device->writeable) 10790 return 0; 10791 10792 /* No free space = nothing to do. 
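 * (device->bytes_used counts space already allocated to chunks, so
 * total_bytes <= bytes_used leaves nothing to discard)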
*/ 10793 if (device->total_bytes <= device->bytes_used) 10794 return 0; 10795 10796 ret = 0; 10797 10798 while (1) { 10799 struct btrfs_fs_info *fs_info = device->dev_root->fs_info; 10800 struct btrfs_transaction *trans; 10801 u64 bytes; 10802 10803 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 10804 if (ret) 10805 return ret; 10806 10807 down_read(&fs_info->commit_root_sem); 10808 10809 spin_lock(&fs_info->trans_lock); 10810 trans = fs_info->running_transaction; 10811 if (trans) 10812 atomic_inc(&trans->use_count); 10813 spin_unlock(&fs_info->trans_lock); 10814 10815 ret = find_free_dev_extent_start(trans, device, minlen, start, 10816 &start, &len); 10817 if (trans) 10818 btrfs_put_transaction(trans); 10819 10820 if (ret) { 10821 up_read(&fs_info->commit_root_sem); 10822 mutex_unlock(&fs_info->chunk_mutex); 10823 if (ret == -ENOSPC) 10824 ret = 0; 10825 break; 10826 } 10827 10828 ret = btrfs_issue_discard(device->bdev, start, len, &bytes); 10829 up_read(&fs_info->commit_root_sem); 10830 mutex_unlock(&fs_info->chunk_mutex); 10831 10832 if (ret) 10833 break; 10834 10835 start += len; 10836 *trimmed += bytes; 10837 10838 if (fatal_signal_pending(current)) { 10839 ret = -ERESTARTSYS; 10840 break; 10841 } 10842 10843 cond_resched(); 10844 } 10845 10846 return ret; 10847 } 10848 10849 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 10850 { 10851 struct btrfs_fs_info *fs_info = root->fs_info; 10852 struct btrfs_block_group_cache *cache = NULL; 10853 struct btrfs_device *device; 10854 struct list_head *devices; 10855 u64 group_trimmed; 10856 u64 start; 10857 u64 end; 10858 u64 trimmed = 0; 10859 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 10860 int ret = 0; 10861 10862 /* 10863 * try to trim all FS space, our block group may start from non-zero. 
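 * (for a whole-filesystem request, i.e. len == total_bytes, start from the
 * first block group at or after range->start, since no block group may
 * actually contain byte 0)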
10864 */ 10865 if (range->len == total_bytes) 10866 cache = btrfs_lookup_first_block_group(fs_info, range->start); 10867 else 10868 cache = btrfs_lookup_block_group(fs_info, range->start); 10869 10870 while (cache) { 10871 if (cache->key.objectid >= (range->start + range->len)) { 10872 btrfs_put_block_group(cache); 10873 break; 10874 } 10875 10876 start = max(range->start, cache->key.objectid); 10877 end = min(range->start + range->len, 10878 cache->key.objectid + cache->key.offset); 10879 10880 if (end - start >= range->minlen) { 10881 if (!block_group_cache_done(cache)) { 10882 ret = cache_block_group(cache, 0); 10883 if (ret) { 10884 btrfs_put_block_group(cache); 10885 break; 10886 } 10887 ret = wait_block_group_cache_done(cache); 10888 if (ret) { 10889 btrfs_put_block_group(cache); 10890 break; 10891 } 10892 } 10893 ret = btrfs_trim_block_group(cache, 10894 &group_trimmed, 10895 start, 10896 end, 10897 range->minlen); 10898 10899 trimmed += group_trimmed; 10900 if (ret) { 10901 btrfs_put_block_group(cache); 10902 break; 10903 } 10904 } 10905 10906 cache = next_block_group(fs_info->tree_root, cache); 10907 } 10908 10909 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 10910 devices = &root->fs_info->fs_devices->alloc_list; 10911 list_for_each_entry(device, devices, dev_alloc_list) { 10912 ret = btrfs_trim_free_extents(device, range->minlen, 10913 &group_trimmed); 10914 if (ret) 10915 break; 10916 10917 trimmed += group_trimmed; 10918 } 10919 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 10920 10921 range->len = trimmed; 10922 return ret; 10923 } 10924 10925 /* 10926 * btrfs_{start,end}_write_no_snapshoting() are similar to 10927 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing 10928 * data into the page cache through nocow before the subvolume is snapshoted, 10929 * but flush the data into disk after the snapshot creation, or to prevent 10930 * operations while snapshoting is ongoing and that cause the snapshot to be 10931 * inconsistent (writes followed by expanding truncates for example). 10932 */ 10933 void btrfs_end_write_no_snapshoting(struct btrfs_root *root) 10934 { 10935 percpu_counter_dec(&root->subv_writers->counter); 10936 /* 10937 * Make sure counter is updated before we wake up waiters. 10938 */ 10939 smp_mb(); 10940 if (waitqueue_active(&root->subv_writers->wait)) 10941 wake_up(&root->subv_writers->wait); 10942 } 10943 10944 int btrfs_start_write_no_snapshoting(struct btrfs_root *root) 10945 { 10946 if (atomic_read(&root->will_be_snapshoted)) 10947 return 0; 10948 10949 percpu_counter_inc(&root->subv_writers->counter); 10950 /* 10951 * Make sure counter is updated before we check for snapshot creation. 10952 */ 10953 smp_mb(); 10954 if (atomic_read(&root->will_be_snapshoted)) { 10955 btrfs_end_write_no_snapshoting(root); 10956 return 0; 10957 } 10958 return 1; 10959 } 10960 10961 static int wait_snapshoting_atomic_t(atomic_t *a) 10962 { 10963 schedule(); 10964 return 0; 10965 } 10966 10967 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) 10968 { 10969 while (true) { 10970 int ret; 10971 10972 ret = btrfs_start_write_no_snapshoting(root); 10973 if (ret) 10974 break; 10975 wait_on_atomic_t(&root->will_be_snapshoted, 10976 wait_snapshoting_atomic_t, 10977 TASK_UNINTERRUPTIBLE); 10978 } 10979 } 10980