/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * Control flags for do_chunk_alloc's force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try to allocate one if we have very few
 * chunks already allocated.  This is used as part of the clustering code to
 * help make sure we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *	ENOSPC accounting.
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *	bytes_may_use as the ENOSPC accounting is done elsewhere.
 */
enum {
	RESERVE_FREE = 0,
	RESERVE_ALLOC = 1,
	RESERVE_ALLOC_NO_ACCOUNT = 2,
};

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op,
			       int no_quota);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins,
				     int no_quota);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
				       u64 num_bytes, int reserve,
				       int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache.
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

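	/*
	 * What follows is the standard <linux/rbtree.h> insertion idiom:
	 * link the new node at the slot found above, then rebalance.  As an
	 * illustrative sketch (hypothetical node/key names, not code from
	 * this file), any keyed rbtree insert looks roughly like:
	 *
	 *	struct rb_node **p = &root->rb_node, *parent = NULL;
	 *
	 *	while (*p) {
	 *		parent = *p;
	 *		p = key < entry_key(parent) ? &(*p)->rb_left
	 *					    : &(*p)->rb_right;
	 *	}
	 *	rb_link_node(&new->node, parent, p);
	 *	rb_insert_color(&new->node, root);
	 */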
rb_link_node(&block_group->cache_node, parent, p); 173 rb_insert_color(&block_group->cache_node, 174 &info->block_group_cache_tree); 175 176 if (info->first_logical_byte > block_group->key.objectid) 177 info->first_logical_byte = block_group->key.objectid; 178 179 spin_unlock(&info->block_group_cache_lock); 180 181 return 0; 182 } 183 184 /* 185 * This will return the block group at or after bytenr if contains is 0, else 186 * it will return the block group that contains the bytenr 187 */ 188 static struct btrfs_block_group_cache * 189 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 190 int contains) 191 { 192 struct btrfs_block_group_cache *cache, *ret = NULL; 193 struct rb_node *n; 194 u64 end, start; 195 196 spin_lock(&info->block_group_cache_lock); 197 n = info->block_group_cache_tree.rb_node; 198 199 while (n) { 200 cache = rb_entry(n, struct btrfs_block_group_cache, 201 cache_node); 202 end = cache->key.objectid + cache->key.offset - 1; 203 start = cache->key.objectid; 204 205 if (bytenr < start) { 206 if (!contains && (!ret || start < ret->key.objectid)) 207 ret = cache; 208 n = n->rb_left; 209 } else if (bytenr > start) { 210 if (contains && bytenr <= end) { 211 ret = cache; 212 break; 213 } 214 n = n->rb_right; 215 } else { 216 ret = cache; 217 break; 218 } 219 } 220 if (ret) { 221 btrfs_get_block_group(ret); 222 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 223 info->first_logical_byte = ret->key.objectid; 224 } 225 spin_unlock(&info->block_group_cache_lock); 226 227 return ret; 228 } 229 230 static int add_excluded_extent(struct btrfs_root *root, 231 u64 start, u64 num_bytes) 232 { 233 u64 end = start + num_bytes - 1; 234 set_extent_bits(&root->fs_info->freed_extents[0], 235 start, end, EXTENT_UPTODATE, GFP_NOFS); 236 set_extent_bits(&root->fs_info->freed_extents[1], 237 start, end, EXTENT_UPTODATE, GFP_NOFS); 238 return 0; 239 } 240 241 static void free_excluded_extents(struct btrfs_root *root, 242 struct btrfs_block_group_cache *cache) 243 { 244 u64 start, end; 245 246 start = cache->key.objectid; 247 end = start + cache->key.offset - 1; 248 249 clear_extent_bits(&root->fs_info->freed_extents[0], 250 start, end, EXTENT_UPTODATE, GFP_NOFS); 251 clear_extent_bits(&root->fs_info->freed_extents[1], 252 start, end, EXTENT_UPTODATE, GFP_NOFS); 253 } 254 255 static int exclude_super_stripes(struct btrfs_root *root, 256 struct btrfs_block_group_cache *cache) 257 { 258 u64 bytenr; 259 u64 *logical; 260 int stripe_len; 261 int i, nr, ret; 262 263 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 264 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 265 cache->bytes_super += stripe_len; 266 ret = add_excluded_extent(root, cache->key.objectid, 267 stripe_len); 268 if (ret) 269 return ret; 270 } 271 272 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 273 bytenr = btrfs_sb_offset(i); 274 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 275 cache->key.objectid, bytenr, 276 0, &logical, &nr, &stripe_len); 277 if (ret) 278 return ret; 279 280 while (nr--) { 281 u64 start, len; 282 283 if (logical[nr] > cache->key.objectid + 284 cache->key.offset) 285 continue; 286 287 if (logical[nr] + stripe_len <= cache->key.objectid) 288 continue; 289 290 start = logical[nr]; 291 if (start < cache->key.objectid) { 292 start = cache->key.objectid; 293 len = (logical[nr] + stripe_len) - start; 294 } else { 295 len = min_t(u64, stripe_len, 296 cache->key.objectid + 297 cache->key.offset - start); 298 } 299 300 cache->bytes_super += len; 301 ret = 
add_excluded_extent(root, start, len); 302 if (ret) { 303 kfree(logical); 304 return ret; 305 } 306 } 307 308 kfree(logical); 309 } 310 return 0; 311 } 312 313 static struct btrfs_caching_control * 314 get_caching_control(struct btrfs_block_group_cache *cache) 315 { 316 struct btrfs_caching_control *ctl; 317 318 spin_lock(&cache->lock); 319 if (!cache->caching_ctl) { 320 spin_unlock(&cache->lock); 321 return NULL; 322 } 323 324 ctl = cache->caching_ctl; 325 atomic_inc(&ctl->count); 326 spin_unlock(&cache->lock); 327 return ctl; 328 } 329 330 static void put_caching_control(struct btrfs_caching_control *ctl) 331 { 332 if (atomic_dec_and_test(&ctl->count)) 333 kfree(ctl); 334 } 335 336 /* 337 * this is only called by cache_block_group, since we could have freed extents 338 * we need to check the pinned_extents for any extents that can't be used yet 339 * since their free space will be released as soon as the transaction commits. 340 */ 341 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 342 struct btrfs_fs_info *info, u64 start, u64 end) 343 { 344 u64 extent_start, extent_end, size, total_added = 0; 345 int ret; 346 347 while (start < end) { 348 ret = find_first_extent_bit(info->pinned_extents, start, 349 &extent_start, &extent_end, 350 EXTENT_DIRTY | EXTENT_UPTODATE, 351 NULL); 352 if (ret) 353 break; 354 355 if (extent_start <= start) { 356 start = extent_end + 1; 357 } else if (extent_start > start && extent_start < end) { 358 size = extent_start - start; 359 total_added += size; 360 ret = btrfs_add_free_space(block_group, start, 361 size); 362 BUG_ON(ret); /* -ENOMEM or logic error */ 363 start = extent_end + 1; 364 } else { 365 break; 366 } 367 } 368 369 if (start < end) { 370 size = end - start; 371 total_added += size; 372 ret = btrfs_add_free_space(block_group, start, size); 373 BUG_ON(ret); /* -ENOMEM or logic error */ 374 } 375 376 return total_added; 377 } 378 379 static noinline void caching_thread(struct btrfs_work *work) 380 { 381 struct btrfs_block_group_cache *block_group; 382 struct btrfs_fs_info *fs_info; 383 struct btrfs_caching_control *caching_ctl; 384 struct btrfs_root *extent_root; 385 struct btrfs_path *path; 386 struct extent_buffer *leaf; 387 struct btrfs_key key; 388 u64 total_found = 0; 389 u64 last = 0; 390 u32 nritems; 391 int ret = -ENOMEM; 392 393 caching_ctl = container_of(work, struct btrfs_caching_control, work); 394 block_group = caching_ctl->block_group; 395 fs_info = block_group->fs_info; 396 extent_root = fs_info->extent_root; 397 398 path = btrfs_alloc_path(); 399 if (!path) 400 goto out; 401 402 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 403 404 /* 405 * We don't want to deadlock with somebody trying to allocate a new 406 * extent for the extent root while also trying to search the extent 407 * root to add free space. 
So we skip locking and search the commit 408 * root, since its read-only 409 */ 410 path->skip_locking = 1; 411 path->search_commit_root = 1; 412 path->reada = 1; 413 414 key.objectid = last; 415 key.offset = 0; 416 key.type = BTRFS_EXTENT_ITEM_KEY; 417 again: 418 mutex_lock(&caching_ctl->mutex); 419 /* need to make sure the commit_root doesn't disappear */ 420 down_read(&fs_info->commit_root_sem); 421 422 next: 423 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 424 if (ret < 0) 425 goto err; 426 427 leaf = path->nodes[0]; 428 nritems = btrfs_header_nritems(leaf); 429 430 while (1) { 431 if (btrfs_fs_closing(fs_info) > 1) { 432 last = (u64)-1; 433 break; 434 } 435 436 if (path->slots[0] < nritems) { 437 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 438 } else { 439 ret = find_next_key(path, 0, &key); 440 if (ret) 441 break; 442 443 if (need_resched() || 444 rwsem_is_contended(&fs_info->commit_root_sem)) { 445 caching_ctl->progress = last; 446 btrfs_release_path(path); 447 up_read(&fs_info->commit_root_sem); 448 mutex_unlock(&caching_ctl->mutex); 449 cond_resched(); 450 goto again; 451 } 452 453 ret = btrfs_next_leaf(extent_root, path); 454 if (ret < 0) 455 goto err; 456 if (ret) 457 break; 458 leaf = path->nodes[0]; 459 nritems = btrfs_header_nritems(leaf); 460 continue; 461 } 462 463 if (key.objectid < last) { 464 key.objectid = last; 465 key.offset = 0; 466 key.type = BTRFS_EXTENT_ITEM_KEY; 467 468 caching_ctl->progress = last; 469 btrfs_release_path(path); 470 goto next; 471 } 472 473 if (key.objectid < block_group->key.objectid) { 474 path->slots[0]++; 475 continue; 476 } 477 478 if (key.objectid >= block_group->key.objectid + 479 block_group->key.offset) 480 break; 481 482 if (key.type == BTRFS_EXTENT_ITEM_KEY || 483 key.type == BTRFS_METADATA_ITEM_KEY) { 484 total_found += add_new_free_space(block_group, 485 fs_info, last, 486 key.objectid); 487 if (key.type == BTRFS_METADATA_ITEM_KEY) 488 last = key.objectid + 489 fs_info->tree_root->nodesize; 490 else 491 last = key.objectid + key.offset; 492 493 if (total_found > (1024 * 1024 * 2)) { 494 total_found = 0; 495 wake_up(&caching_ctl->wait); 496 } 497 } 498 path->slots[0]++; 499 } 500 ret = 0; 501 502 total_found += add_new_free_space(block_group, fs_info, last, 503 block_group->key.objectid + 504 block_group->key.offset); 505 caching_ctl->progress = (u64)-1; 506 507 spin_lock(&block_group->lock); 508 block_group->caching_ctl = NULL; 509 block_group->cached = BTRFS_CACHE_FINISHED; 510 spin_unlock(&block_group->lock); 511 512 err: 513 btrfs_free_path(path); 514 up_read(&fs_info->commit_root_sem); 515 516 free_excluded_extents(extent_root, block_group); 517 518 mutex_unlock(&caching_ctl->mutex); 519 out: 520 if (ret) { 521 spin_lock(&block_group->lock); 522 block_group->caching_ctl = NULL; 523 block_group->cached = BTRFS_CACHE_ERROR; 524 spin_unlock(&block_group->lock); 525 } 526 wake_up(&caching_ctl->wait); 527 528 put_caching_control(caching_ctl); 529 btrfs_put_block_group(block_group); 530 } 531 532 static int cache_block_group(struct btrfs_block_group_cache *cache, 533 int load_cache_only) 534 { 535 DEFINE_WAIT(wait); 536 struct btrfs_fs_info *fs_info = cache->fs_info; 537 struct btrfs_caching_control *caching_ctl; 538 int ret = 0; 539 540 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 541 if (!caching_ctl) 542 return -ENOMEM; 543 544 INIT_LIST_HEAD(&caching_ctl->list); 545 mutex_init(&caching_ctl->mutex); 546 init_waitqueue_head(&caching_ctl->wait); 547 caching_ctl->block_group = cache; 548 
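	/*
	 * Note: the control starts with a reference count of one (set just
	 * below); any other reader must take its own reference.  A minimal
	 * sketch of how a waiter would pin it, assuming the get/put helpers
	 * defined earlier in this file (hypothetical caller):
	 *
	 *	struct btrfs_caching_control *ctl = get_caching_control(cache);
	 *
	 *	if (ctl) {
	 *		wait_event(ctl->wait, block_group_cache_done(cache));
	 *		put_caching_control(ctl);
	 *	}
	 */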
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info, and then some other thread
	 * starts a transaction commit which tries to do an allocation while
	 * the first thread is still loading the space cache info.  The
	 * previous loop should have kept us from choosing this block group,
	 * but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load
	 * here, so we can wait for it to finish.  Otherwise we could end up
	 * allocating from a block group whose cache gets evicted for one
	 * reason or another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
		mutex_unlock(&caching_ctl->mutex);

		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We are not going to do the fast caching, set cached to the
		 * appropriate value and wake up any waiters.
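		 *
		 * (For reference, an illustrative summary of the cache states
		 * used here: BTRFS_CACHE_NO -> BTRFS_CACHE_FAST while the free
		 * space cache is probed, then either BTRFS_CACHE_FINISHED on a
		 * successful fast load, back to BTRFS_CACHE_NO for
		 * load_cache_only, or BTRFS_CACHE_STARTED once the caching
		 * thread is queued; caching_thread() ends in
		 * BTRFS_CACHE_FINISHED or BTRFS_CACHE_ERROR.)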
623 */ 624 spin_lock(&cache->lock); 625 if (load_cache_only) { 626 cache->caching_ctl = NULL; 627 cache->cached = BTRFS_CACHE_NO; 628 } else { 629 cache->cached = BTRFS_CACHE_STARTED; 630 cache->has_caching_ctl = 1; 631 } 632 spin_unlock(&cache->lock); 633 wake_up(&caching_ctl->wait); 634 } 635 636 if (load_cache_only) { 637 put_caching_control(caching_ctl); 638 return 0; 639 } 640 641 down_write(&fs_info->commit_root_sem); 642 atomic_inc(&caching_ctl->count); 643 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 644 up_write(&fs_info->commit_root_sem); 645 646 btrfs_get_block_group(cache); 647 648 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 649 650 return ret; 651 } 652 653 /* 654 * return the block group that starts at or after bytenr 655 */ 656 static struct btrfs_block_group_cache * 657 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 658 { 659 struct btrfs_block_group_cache *cache; 660 661 cache = block_group_cache_tree_search(info, bytenr, 0); 662 663 return cache; 664 } 665 666 /* 667 * return the block group that contains the given bytenr 668 */ 669 struct btrfs_block_group_cache *btrfs_lookup_block_group( 670 struct btrfs_fs_info *info, 671 u64 bytenr) 672 { 673 struct btrfs_block_group_cache *cache; 674 675 cache = block_group_cache_tree_search(info, bytenr, 1); 676 677 return cache; 678 } 679 680 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 681 u64 flags) 682 { 683 struct list_head *head = &info->space_info; 684 struct btrfs_space_info *found; 685 686 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 687 688 rcu_read_lock(); 689 list_for_each_entry_rcu(found, head, list) { 690 if (found->flags & flags) { 691 rcu_read_unlock(); 692 return found; 693 } 694 } 695 rcu_read_unlock(); 696 return NULL; 697 } 698 699 /* 700 * after adding space to the filesystem, we need to clear the full flags 701 * on all the space infos. 702 */ 703 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 704 { 705 struct list_head *head = &info->space_info; 706 struct btrfs_space_info *found; 707 708 rcu_read_lock(); 709 list_for_each_entry_rcu(found, head, list) 710 found->full = 0; 711 rcu_read_unlock(); 712 } 713 714 /* simple helper to search for an existing data extent at a given offset */ 715 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) 716 { 717 int ret; 718 struct btrfs_key key; 719 struct btrfs_path *path; 720 721 path = btrfs_alloc_path(); 722 if (!path) 723 return -ENOMEM; 724 725 key.objectid = start; 726 key.offset = len; 727 key.type = BTRFS_EXTENT_ITEM_KEY; 728 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 729 0, 0); 730 btrfs_free_path(path); 731 return ret; 732 } 733 734 /* 735 * helper function to lookup reference count and flags of a tree block. 736 * 737 * the head node for delayed ref is used to store the sum of all the 738 * reference count modifications queued up in the rbtree. the head 739 * node may also store the extent flags to set. This way you can check 740 * to see what the reference count and extent flags would be if all of 741 * the delayed refs are not processed. 
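 *
 * An illustrative caller sketch for a tree block eb (hypothetical caller,
 * example values only):
 *
 *	u64 refs = 0;
 *	u64 flags = 0;
 *
 *	ret = btrfs_lookup_extent_info(trans, root, eb->start,
 *				       btrfs_header_level(eb), 1,
 *				       &refs, &flags);
 *	if (!ret && (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 *		...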
742 */ 743 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 744 struct btrfs_root *root, u64 bytenr, 745 u64 offset, int metadata, u64 *refs, u64 *flags) 746 { 747 struct btrfs_delayed_ref_head *head; 748 struct btrfs_delayed_ref_root *delayed_refs; 749 struct btrfs_path *path; 750 struct btrfs_extent_item *ei; 751 struct extent_buffer *leaf; 752 struct btrfs_key key; 753 u32 item_size; 754 u64 num_refs; 755 u64 extent_flags; 756 int ret; 757 758 /* 759 * If we don't have skinny metadata, don't bother doing anything 760 * different 761 */ 762 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { 763 offset = root->nodesize; 764 metadata = 0; 765 } 766 767 path = btrfs_alloc_path(); 768 if (!path) 769 return -ENOMEM; 770 771 if (!trans) { 772 path->skip_locking = 1; 773 path->search_commit_root = 1; 774 } 775 776 search_again: 777 key.objectid = bytenr; 778 key.offset = offset; 779 if (metadata) 780 key.type = BTRFS_METADATA_ITEM_KEY; 781 else 782 key.type = BTRFS_EXTENT_ITEM_KEY; 783 784 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 785 &key, path, 0, 0); 786 if (ret < 0) 787 goto out_free; 788 789 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 790 if (path->slots[0]) { 791 path->slots[0]--; 792 btrfs_item_key_to_cpu(path->nodes[0], &key, 793 path->slots[0]); 794 if (key.objectid == bytenr && 795 key.type == BTRFS_EXTENT_ITEM_KEY && 796 key.offset == root->nodesize) 797 ret = 0; 798 } 799 } 800 801 if (ret == 0) { 802 leaf = path->nodes[0]; 803 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 804 if (item_size >= sizeof(*ei)) { 805 ei = btrfs_item_ptr(leaf, path->slots[0], 806 struct btrfs_extent_item); 807 num_refs = btrfs_extent_refs(leaf, ei); 808 extent_flags = btrfs_extent_flags(leaf, ei); 809 } else { 810 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 811 struct btrfs_extent_item_v0 *ei0; 812 BUG_ON(item_size != sizeof(*ei0)); 813 ei0 = btrfs_item_ptr(leaf, path->slots[0], 814 struct btrfs_extent_item_v0); 815 num_refs = btrfs_extent_refs_v0(leaf, ei0); 816 /* FIXME: this isn't correct for data */ 817 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 818 #else 819 BUG(); 820 #endif 821 } 822 BUG_ON(num_refs == 0); 823 } else { 824 num_refs = 0; 825 extent_flags = 0; 826 ret = 0; 827 } 828 829 if (!trans) 830 goto out; 831 832 delayed_refs = &trans->transaction->delayed_refs; 833 spin_lock(&delayed_refs->lock); 834 head = btrfs_find_delayed_ref_head(trans, bytenr); 835 if (head) { 836 if (!mutex_trylock(&head->mutex)) { 837 atomic_inc(&head->node.refs); 838 spin_unlock(&delayed_refs->lock); 839 840 btrfs_release_path(path); 841 842 /* 843 * Mutex was contended, block until it's released and try 844 * again 845 */ 846 mutex_lock(&head->mutex); 847 mutex_unlock(&head->mutex); 848 btrfs_put_delayed_ref(&head->node); 849 goto search_again; 850 } 851 spin_lock(&head->lock); 852 if (head->extent_op && head->extent_op->update_flags) 853 extent_flags |= head->extent_op->flags_to_set; 854 else 855 BUG_ON(num_refs == 0); 856 857 num_refs += head->node.ref_mod; 858 spin_unlock(&head->lock); 859 mutex_unlock(&head->mutex); 860 } 861 spin_unlock(&delayed_refs->lock); 862 out: 863 WARN_ON(num_refs == 0); 864 if (refs) 865 *refs = num_refs; 866 if (flags) 867 *flags = extent_flags; 868 out_free: 869 btrfs_free_path(path); 870 return ret; 871 } 872 873 /* 874 * Back reference rules. 
 * Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized for
 * pointers in non-shared tree blocks.  For a given pointer in a block, back
 * refs of this kind provide information about the block's owner tree and
 * the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Full back refs are actually generic and can
 * be used in all cases where implicit back refs are used.  Their major
 * shortcoming is the overhead: every time a tree block gets COWed, we have
 * to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the back
 * refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the block's
 * owner tree.  In this case, full back refs are used for the pointers in
 * the block.  Remove these full back refs and add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is the
 * block's owner tree.  In this case, implicit back refs are used for the
 * pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are inherited by the new block.
 *
 * The reference count of the block is greater than one and the tree is not
 * the block's owner tree.  Add implicit back refs for every pointer in the
 * new block and increase the lower level extents' reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent, the key
 * type is used to differentiate between types of back refs, and the key
 * offset has a different meaning for each type of back ref.
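 *
 * For example, the back ref keys work out to these shapes (an illustrative
 * summary of the cases described below, example values only):
 *
 *	(extent bytenr, BTRFS_TREE_BLOCK_REF_KEY,   owner root objectid)
 *	(extent bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent block bytenr)
 *	(extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,  hash(root, inode, offset))
 *	(extent bytenr, BTRFS_SHARED_DATA_REF_KEY,  parent leaf bytenr)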
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, implicit back refs are used and
 * the fields are filled in as:
 *
 * (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 * (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit and the full back refs for tree blocks consist of a
 * key only.  The key offset for the implicit back refs is the objectid of
 * the block's owner tree.  The key offset for the full back refs is the
 * first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * the level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(root, path, new_size);

	leaf =
path->nodes[0]; 1041 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1042 btrfs_set_extent_refs(leaf, item, refs); 1043 /* FIXME: get real generation */ 1044 btrfs_set_extent_generation(leaf, item, 0); 1045 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1046 btrfs_set_extent_flags(leaf, item, 1047 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1048 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1049 bi = (struct btrfs_tree_block_info *)(item + 1); 1050 /* FIXME: get first key of the block */ 1051 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 1052 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1053 } else { 1054 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1055 } 1056 btrfs_mark_buffer_dirty(leaf); 1057 return 0; 1058 } 1059 #endif 1060 1061 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1062 { 1063 u32 high_crc = ~(u32)0; 1064 u32 low_crc = ~(u32)0; 1065 __le64 lenum; 1066 1067 lenum = cpu_to_le64(root_objectid); 1068 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1069 lenum = cpu_to_le64(owner); 1070 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1071 lenum = cpu_to_le64(offset); 1072 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1073 1074 return ((u64)high_crc << 31) ^ (u64)low_crc; 1075 } 1076 1077 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1078 struct btrfs_extent_data_ref *ref) 1079 { 1080 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1081 btrfs_extent_data_ref_objectid(leaf, ref), 1082 btrfs_extent_data_ref_offset(leaf, ref)); 1083 } 1084 1085 static int match_extent_data_ref(struct extent_buffer *leaf, 1086 struct btrfs_extent_data_ref *ref, 1087 u64 root_objectid, u64 owner, u64 offset) 1088 { 1089 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1090 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1091 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1092 return 0; 1093 return 1; 1094 } 1095 1096 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1097 struct btrfs_root *root, 1098 struct btrfs_path *path, 1099 u64 bytenr, u64 parent, 1100 u64 root_objectid, 1101 u64 owner, u64 offset) 1102 { 1103 struct btrfs_key key; 1104 struct btrfs_extent_data_ref *ref; 1105 struct extent_buffer *leaf; 1106 u32 nritems; 1107 int ret; 1108 int recow; 1109 int err = -ENOENT; 1110 1111 key.objectid = bytenr; 1112 if (parent) { 1113 key.type = BTRFS_SHARED_DATA_REF_KEY; 1114 key.offset = parent; 1115 } else { 1116 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1117 key.offset = hash_extent_data_ref(root_objectid, 1118 owner, offset); 1119 } 1120 again: 1121 recow = 0; 1122 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1123 if (ret < 0) { 1124 err = ret; 1125 goto fail; 1126 } 1127 1128 if (parent) { 1129 if (!ret) 1130 return 0; 1131 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1132 key.type = BTRFS_EXTENT_REF_V0_KEY; 1133 btrfs_release_path(path); 1134 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1135 if (ret < 0) { 1136 err = ret; 1137 goto fail; 1138 } 1139 if (!ret) 1140 return 0; 1141 #endif 1142 goto fail; 1143 } 1144 1145 leaf = path->nodes[0]; 1146 nritems = btrfs_header_nritems(leaf); 1147 while (1) { 1148 if (path->slots[0] >= nritems) { 1149 ret = btrfs_next_leaf(root, path); 1150 if (ret < 0) 1151 err = ret; 1152 if (ret) 1153 goto fail; 1154 1155 leaf = path->nodes[0]; 1156 nritems = btrfs_header_nritems(leaf); 1157 recow = 1; 1158 } 1159 1160 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1161 if (key.objectid 
!= bytenr || 1162 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1163 goto fail; 1164 1165 ref = btrfs_item_ptr(leaf, path->slots[0], 1166 struct btrfs_extent_data_ref); 1167 1168 if (match_extent_data_ref(leaf, ref, root_objectid, 1169 owner, offset)) { 1170 if (recow) { 1171 btrfs_release_path(path); 1172 goto again; 1173 } 1174 err = 0; 1175 break; 1176 } 1177 path->slots[0]++; 1178 } 1179 fail: 1180 return err; 1181 } 1182 1183 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1184 struct btrfs_root *root, 1185 struct btrfs_path *path, 1186 u64 bytenr, u64 parent, 1187 u64 root_objectid, u64 owner, 1188 u64 offset, int refs_to_add) 1189 { 1190 struct btrfs_key key; 1191 struct extent_buffer *leaf; 1192 u32 size; 1193 u32 num_refs; 1194 int ret; 1195 1196 key.objectid = bytenr; 1197 if (parent) { 1198 key.type = BTRFS_SHARED_DATA_REF_KEY; 1199 key.offset = parent; 1200 size = sizeof(struct btrfs_shared_data_ref); 1201 } else { 1202 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1203 key.offset = hash_extent_data_ref(root_objectid, 1204 owner, offset); 1205 size = sizeof(struct btrfs_extent_data_ref); 1206 } 1207 1208 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1209 if (ret && ret != -EEXIST) 1210 goto fail; 1211 1212 leaf = path->nodes[0]; 1213 if (parent) { 1214 struct btrfs_shared_data_ref *ref; 1215 ref = btrfs_item_ptr(leaf, path->slots[0], 1216 struct btrfs_shared_data_ref); 1217 if (ret == 0) { 1218 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1219 } else { 1220 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1221 num_refs += refs_to_add; 1222 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1223 } 1224 } else { 1225 struct btrfs_extent_data_ref *ref; 1226 while (ret == -EEXIST) { 1227 ref = btrfs_item_ptr(leaf, path->slots[0], 1228 struct btrfs_extent_data_ref); 1229 if (match_extent_data_ref(leaf, ref, root_objectid, 1230 owner, offset)) 1231 break; 1232 btrfs_release_path(path); 1233 key.offset++; 1234 ret = btrfs_insert_empty_item(trans, root, path, &key, 1235 size); 1236 if (ret && ret != -EEXIST) 1237 goto fail; 1238 1239 leaf = path->nodes[0]; 1240 } 1241 ref = btrfs_item_ptr(leaf, path->slots[0], 1242 struct btrfs_extent_data_ref); 1243 if (ret == 0) { 1244 btrfs_set_extent_data_ref_root(leaf, ref, 1245 root_objectid); 1246 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1247 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1248 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1249 } else { 1250 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1251 num_refs += refs_to_add; 1252 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1253 } 1254 } 1255 btrfs_mark_buffer_dirty(leaf); 1256 ret = 0; 1257 fail: 1258 btrfs_release_path(path); 1259 return ret; 1260 } 1261 1262 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1263 struct btrfs_root *root, 1264 struct btrfs_path *path, 1265 int refs_to_drop, int *last_ref) 1266 { 1267 struct btrfs_key key; 1268 struct btrfs_extent_data_ref *ref1 = NULL; 1269 struct btrfs_shared_data_ref *ref2 = NULL; 1270 struct extent_buffer *leaf; 1271 u32 num_refs = 0; 1272 int ret = 0; 1273 1274 leaf = path->nodes[0]; 1275 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1276 1277 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1278 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1279 struct btrfs_extent_data_ref); 1280 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1281 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1282 ref2 = 
btrfs_item_ptr(leaf, path->slots[0], 1283 struct btrfs_shared_data_ref); 1284 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1285 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1286 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1287 struct btrfs_extent_ref_v0 *ref0; 1288 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1289 struct btrfs_extent_ref_v0); 1290 num_refs = btrfs_ref_count_v0(leaf, ref0); 1291 #endif 1292 } else { 1293 BUG(); 1294 } 1295 1296 BUG_ON(num_refs < refs_to_drop); 1297 num_refs -= refs_to_drop; 1298 1299 if (num_refs == 0) { 1300 ret = btrfs_del_item(trans, root, path); 1301 *last_ref = 1; 1302 } else { 1303 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1304 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1305 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1306 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1307 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1308 else { 1309 struct btrfs_extent_ref_v0 *ref0; 1310 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1311 struct btrfs_extent_ref_v0); 1312 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1313 } 1314 #endif 1315 btrfs_mark_buffer_dirty(leaf); 1316 } 1317 return ret; 1318 } 1319 1320 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1321 struct btrfs_path *path, 1322 struct btrfs_extent_inline_ref *iref) 1323 { 1324 struct btrfs_key key; 1325 struct extent_buffer *leaf; 1326 struct btrfs_extent_data_ref *ref1; 1327 struct btrfs_shared_data_ref *ref2; 1328 u32 num_refs = 0; 1329 1330 leaf = path->nodes[0]; 1331 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1332 if (iref) { 1333 if (btrfs_extent_inline_ref_type(leaf, iref) == 1334 BTRFS_EXTENT_DATA_REF_KEY) { 1335 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1336 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1337 } else { 1338 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1339 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1340 } 1341 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1342 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1343 struct btrfs_extent_data_ref); 1344 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1345 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1346 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1347 struct btrfs_shared_data_ref); 1348 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1349 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1350 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1351 struct btrfs_extent_ref_v0 *ref0; 1352 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1353 struct btrfs_extent_ref_v0); 1354 num_refs = btrfs_ref_count_v0(leaf, ref0); 1355 #endif 1356 } else { 1357 WARN_ON(1); 1358 } 1359 return num_refs; 1360 } 1361 1362 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1363 struct btrfs_root *root, 1364 struct btrfs_path *path, 1365 u64 bytenr, u64 parent, 1366 u64 root_objectid) 1367 { 1368 struct btrfs_key key; 1369 int ret; 1370 1371 key.objectid = bytenr; 1372 if (parent) { 1373 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1374 key.offset = parent; 1375 } else { 1376 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1377 key.offset = root_objectid; 1378 } 1379 1380 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1381 if (ret > 0) 1382 ret = -ENOENT; 1383 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1384 if (ret == -ENOENT && parent) { 1385 btrfs_release_path(path); 1386 key.type = BTRFS_EXTENT_REF_V0_KEY; 1387 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1388 if (ret > 0) 1389 ret = -ENOENT; 1390 } 1391 #endif 1392 return ret; 1393 } 1394 1395 static noinline 
int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1396 struct btrfs_root *root, 1397 struct btrfs_path *path, 1398 u64 bytenr, u64 parent, 1399 u64 root_objectid) 1400 { 1401 struct btrfs_key key; 1402 int ret; 1403 1404 key.objectid = bytenr; 1405 if (parent) { 1406 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1407 key.offset = parent; 1408 } else { 1409 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1410 key.offset = root_objectid; 1411 } 1412 1413 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1414 btrfs_release_path(path); 1415 return ret; 1416 } 1417 1418 static inline int extent_ref_type(u64 parent, u64 owner) 1419 { 1420 int type; 1421 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1422 if (parent > 0) 1423 type = BTRFS_SHARED_BLOCK_REF_KEY; 1424 else 1425 type = BTRFS_TREE_BLOCK_REF_KEY; 1426 } else { 1427 if (parent > 0) 1428 type = BTRFS_SHARED_DATA_REF_KEY; 1429 else 1430 type = BTRFS_EXTENT_DATA_REF_KEY; 1431 } 1432 return type; 1433 } 1434 1435 static int find_next_key(struct btrfs_path *path, int level, 1436 struct btrfs_key *key) 1437 1438 { 1439 for (; level < BTRFS_MAX_LEVEL; level++) { 1440 if (!path->nodes[level]) 1441 break; 1442 if (path->slots[level] + 1 >= 1443 btrfs_header_nritems(path->nodes[level])) 1444 continue; 1445 if (level == 0) 1446 btrfs_item_key_to_cpu(path->nodes[level], key, 1447 path->slots[level] + 1); 1448 else 1449 btrfs_node_key_to_cpu(path->nodes[level], key, 1450 path->slots[level] + 1); 1451 return 0; 1452 } 1453 return 1; 1454 } 1455 1456 /* 1457 * look for inline back ref. if back ref is found, *ref_ret is set 1458 * to the address of inline back ref, and 0 is returned. 1459 * 1460 * if back ref isn't found, *ref_ret is set to the address where it 1461 * should be inserted, and -ENOENT is returned. 1462 * 1463 * if insert is true and there are too many inline back refs, the path 1464 * points to the extent item, and -EAGAIN is returned. 1465 * 1466 * NOTE: inline back refs are ordered in the same way that back ref 1467 * items in the tree are ordered. 1468 */ 1469 static noinline_for_stack 1470 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1471 struct btrfs_root *root, 1472 struct btrfs_path *path, 1473 struct btrfs_extent_inline_ref **ref_ret, 1474 u64 bytenr, u64 num_bytes, 1475 u64 parent, u64 root_objectid, 1476 u64 owner, u64 offset, int insert) 1477 { 1478 struct btrfs_key key; 1479 struct extent_buffer *leaf; 1480 struct btrfs_extent_item *ei; 1481 struct btrfs_extent_inline_ref *iref; 1482 u64 flags; 1483 u64 item_size; 1484 unsigned long ptr; 1485 unsigned long end; 1486 int extra_size; 1487 int type; 1488 int want; 1489 int ret; 1490 int err = 0; 1491 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 1492 SKINNY_METADATA); 1493 1494 key.objectid = bytenr; 1495 key.type = BTRFS_EXTENT_ITEM_KEY; 1496 key.offset = num_bytes; 1497 1498 want = extent_ref_type(parent, owner); 1499 if (insert) { 1500 extra_size = btrfs_extent_inline_ref_size(want); 1501 path->keep_locks = 1; 1502 } else 1503 extra_size = -1; 1504 1505 /* 1506 * Owner is our parent level, so we can just add one to get the level 1507 * for the block we are interested in. 
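	 *
	 * Illustrative key shapes for the two on-disk layouts (example
	 * values only):
	 *
	 *	(bytenr, BTRFS_METADATA_ITEM_KEY, level)      skinny metadata
	 *	(bytenr, BTRFS_EXTENT_ITEM_KEY,   num_bytes)  old "fat" layout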
1508 */ 1509 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1510 key.type = BTRFS_METADATA_ITEM_KEY; 1511 key.offset = owner; 1512 } 1513 1514 again: 1515 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1516 if (ret < 0) { 1517 err = ret; 1518 goto out; 1519 } 1520 1521 /* 1522 * We may be a newly converted file system which still has the old fat 1523 * extent entries for metadata, so try and see if we have one of those. 1524 */ 1525 if (ret > 0 && skinny_metadata) { 1526 skinny_metadata = false; 1527 if (path->slots[0]) { 1528 path->slots[0]--; 1529 btrfs_item_key_to_cpu(path->nodes[0], &key, 1530 path->slots[0]); 1531 if (key.objectid == bytenr && 1532 key.type == BTRFS_EXTENT_ITEM_KEY && 1533 key.offset == num_bytes) 1534 ret = 0; 1535 } 1536 if (ret) { 1537 key.objectid = bytenr; 1538 key.type = BTRFS_EXTENT_ITEM_KEY; 1539 key.offset = num_bytes; 1540 btrfs_release_path(path); 1541 goto again; 1542 } 1543 } 1544 1545 if (ret && !insert) { 1546 err = -ENOENT; 1547 goto out; 1548 } else if (WARN_ON(ret)) { 1549 err = -EIO; 1550 goto out; 1551 } 1552 1553 leaf = path->nodes[0]; 1554 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1555 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1556 if (item_size < sizeof(*ei)) { 1557 if (!insert) { 1558 err = -ENOENT; 1559 goto out; 1560 } 1561 ret = convert_extent_item_v0(trans, root, path, owner, 1562 extra_size); 1563 if (ret < 0) { 1564 err = ret; 1565 goto out; 1566 } 1567 leaf = path->nodes[0]; 1568 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1569 } 1570 #endif 1571 BUG_ON(item_size < sizeof(*ei)); 1572 1573 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1574 flags = btrfs_extent_flags(leaf, ei); 1575 1576 ptr = (unsigned long)(ei + 1); 1577 end = (unsigned long)ei + item_size; 1578 1579 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1580 ptr += sizeof(struct btrfs_tree_block_info); 1581 BUG_ON(ptr > end); 1582 } 1583 1584 err = -ENOENT; 1585 while (1) { 1586 if (ptr >= end) { 1587 WARN_ON(ptr > end); 1588 break; 1589 } 1590 iref = (struct btrfs_extent_inline_ref *)ptr; 1591 type = btrfs_extent_inline_ref_type(leaf, iref); 1592 if (want < type) 1593 break; 1594 if (want > type) { 1595 ptr += btrfs_extent_inline_ref_size(type); 1596 continue; 1597 } 1598 1599 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1600 struct btrfs_extent_data_ref *dref; 1601 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1602 if (match_extent_data_ref(leaf, dref, root_objectid, 1603 owner, offset)) { 1604 err = 0; 1605 break; 1606 } 1607 if (hash_extent_data_ref_item(leaf, dref) < 1608 hash_extent_data_ref(root_objectid, owner, offset)) 1609 break; 1610 } else { 1611 u64 ref_offset; 1612 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1613 if (parent > 0) { 1614 if (parent == ref_offset) { 1615 err = 0; 1616 break; 1617 } 1618 if (ref_offset < parent) 1619 break; 1620 } else { 1621 if (root_objectid == ref_offset) { 1622 err = 0; 1623 break; 1624 } 1625 if (ref_offset < root_objectid) 1626 break; 1627 } 1628 } 1629 ptr += btrfs_extent_inline_ref_size(type); 1630 } 1631 if (err == -ENOENT && insert) { 1632 if (item_size + extra_size >= 1633 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1634 err = -EAGAIN; 1635 goto out; 1636 } 1637 /* 1638 * To add new inline back ref, we have to make sure 1639 * there is no corresponding back ref item. 
1640 * For simplicity, we just do not add new inline back 1641 * ref if there is any kind of item for this block 1642 */ 1643 if (find_next_key(path, 0, &key) == 0 && 1644 key.objectid == bytenr && 1645 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1646 err = -EAGAIN; 1647 goto out; 1648 } 1649 } 1650 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1651 out: 1652 if (insert) { 1653 path->keep_locks = 0; 1654 btrfs_unlock_up_safe(path, 1); 1655 } 1656 return err; 1657 } 1658 1659 /* 1660 * helper to add new inline back ref 1661 */ 1662 static noinline_for_stack 1663 void setup_inline_extent_backref(struct btrfs_root *root, 1664 struct btrfs_path *path, 1665 struct btrfs_extent_inline_ref *iref, 1666 u64 parent, u64 root_objectid, 1667 u64 owner, u64 offset, int refs_to_add, 1668 struct btrfs_delayed_extent_op *extent_op) 1669 { 1670 struct extent_buffer *leaf; 1671 struct btrfs_extent_item *ei; 1672 unsigned long ptr; 1673 unsigned long end; 1674 unsigned long item_offset; 1675 u64 refs; 1676 int size; 1677 int type; 1678 1679 leaf = path->nodes[0]; 1680 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1681 item_offset = (unsigned long)iref - (unsigned long)ei; 1682 1683 type = extent_ref_type(parent, owner); 1684 size = btrfs_extent_inline_ref_size(type); 1685 1686 btrfs_extend_item(root, path, size); 1687 1688 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1689 refs = btrfs_extent_refs(leaf, ei); 1690 refs += refs_to_add; 1691 btrfs_set_extent_refs(leaf, ei, refs); 1692 if (extent_op) 1693 __run_delayed_extent_op(extent_op, leaf, ei); 1694 1695 ptr = (unsigned long)ei + item_offset; 1696 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1697 if (ptr < end - size) 1698 memmove_extent_buffer(leaf, ptr + size, ptr, 1699 end - size - ptr); 1700 1701 iref = (struct btrfs_extent_inline_ref *)ptr; 1702 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1703 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1704 struct btrfs_extent_data_ref *dref; 1705 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1706 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1707 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1708 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1709 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1710 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1711 struct btrfs_shared_data_ref *sref; 1712 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1713 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1714 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1715 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1716 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1717 } else { 1718 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1719 } 1720 btrfs_mark_buffer_dirty(leaf); 1721 } 1722 1723 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1724 struct btrfs_root *root, 1725 struct btrfs_path *path, 1726 struct btrfs_extent_inline_ref **ref_ret, 1727 u64 bytenr, u64 num_bytes, u64 parent, 1728 u64 root_objectid, u64 owner, u64 offset) 1729 { 1730 int ret; 1731 1732 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1733 bytenr, num_bytes, parent, 1734 root_objectid, owner, offset, 0); 1735 if (ret != -ENOENT) 1736 return ret; 1737 1738 btrfs_release_path(path); 1739 *ref_ret = NULL; 1740 1741 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1742 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1743 root_objectid); 1744 } else { 1745 ret 
= lookup_extent_data_ref(trans, root, path, bytenr, parent, 1746 root_objectid, owner, offset); 1747 } 1748 return ret; 1749 } 1750 1751 /* 1752 * helper to update/remove inline back ref 1753 */ 1754 static noinline_for_stack 1755 void update_inline_extent_backref(struct btrfs_root *root, 1756 struct btrfs_path *path, 1757 struct btrfs_extent_inline_ref *iref, 1758 int refs_to_mod, 1759 struct btrfs_delayed_extent_op *extent_op, 1760 int *last_ref) 1761 { 1762 struct extent_buffer *leaf; 1763 struct btrfs_extent_item *ei; 1764 struct btrfs_extent_data_ref *dref = NULL; 1765 struct btrfs_shared_data_ref *sref = NULL; 1766 unsigned long ptr; 1767 unsigned long end; 1768 u32 item_size; 1769 int size; 1770 int type; 1771 u64 refs; 1772 1773 leaf = path->nodes[0]; 1774 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1775 refs = btrfs_extent_refs(leaf, ei); 1776 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1777 refs += refs_to_mod; 1778 btrfs_set_extent_refs(leaf, ei, refs); 1779 if (extent_op) 1780 __run_delayed_extent_op(extent_op, leaf, ei); 1781 1782 type = btrfs_extent_inline_ref_type(leaf, iref); 1783 1784 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1785 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1786 refs = btrfs_extent_data_ref_count(leaf, dref); 1787 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1788 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1789 refs = btrfs_shared_data_ref_count(leaf, sref); 1790 } else { 1791 refs = 1; 1792 BUG_ON(refs_to_mod != -1); 1793 } 1794 1795 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1796 refs += refs_to_mod; 1797 1798 if (refs > 0) { 1799 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1800 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1801 else 1802 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1803 } else { 1804 *last_ref = 1; 1805 size = btrfs_extent_inline_ref_size(type); 1806 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1807 ptr = (unsigned long)iref; 1808 end = (unsigned long)ei + item_size; 1809 if (ptr + size < end) 1810 memmove_extent_buffer(leaf, ptr, ptr + size, 1811 end - ptr - size); 1812 item_size -= size; 1813 btrfs_truncate_item(root, path, item_size, 1); 1814 } 1815 btrfs_mark_buffer_dirty(leaf); 1816 } 1817 1818 static noinline_for_stack 1819 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1820 struct btrfs_root *root, 1821 struct btrfs_path *path, 1822 u64 bytenr, u64 num_bytes, u64 parent, 1823 u64 root_objectid, u64 owner, 1824 u64 offset, int refs_to_add, 1825 struct btrfs_delayed_extent_op *extent_op) 1826 { 1827 struct btrfs_extent_inline_ref *iref; 1828 int ret; 1829 1830 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1831 bytenr, num_bytes, parent, 1832 root_objectid, owner, offset, 1); 1833 if (ret == 0) { 1834 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1835 update_inline_extent_backref(root, path, iref, 1836 refs_to_add, extent_op, NULL); 1837 } else if (ret == -ENOENT) { 1838 setup_inline_extent_backref(root, path, iref, parent, 1839 root_objectid, owner, offset, 1840 refs_to_add, extent_op); 1841 ret = 0; 1842 } 1843 return ret; 1844 } 1845 1846 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1847 struct btrfs_root *root, 1848 struct btrfs_path *path, 1849 u64 bytenr, u64 parent, u64 root_objectid, 1850 u64 owner, u64 offset, int refs_to_add) 1851 { 1852 int ret; 1853 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1854 BUG_ON(refs_to_add != 1); 1855 ret = insert_tree_block_ref(trans, root, path, bytenr, 1856 parent, 
root_objectid); 1857 } else { 1858 ret = insert_extent_data_ref(trans, root, path, bytenr, 1859 parent, root_objectid, 1860 owner, offset, refs_to_add); 1861 } 1862 return ret; 1863 } 1864 1865 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1866 struct btrfs_root *root, 1867 struct btrfs_path *path, 1868 struct btrfs_extent_inline_ref *iref, 1869 int refs_to_drop, int is_data, int *last_ref) 1870 { 1871 int ret = 0; 1872 1873 BUG_ON(!is_data && refs_to_drop != 1); 1874 if (iref) { 1875 update_inline_extent_backref(root, path, iref, 1876 -refs_to_drop, NULL, last_ref); 1877 } else if (is_data) { 1878 ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1879 last_ref); 1880 } else { 1881 *last_ref = 1; 1882 ret = btrfs_del_item(trans, root, path); 1883 } 1884 return ret; 1885 } 1886 1887 static int btrfs_issue_discard(struct block_device *bdev, 1888 u64 start, u64 len) 1889 { 1890 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); 1891 } 1892 1893 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1894 u64 num_bytes, u64 *actual_bytes) 1895 { 1896 int ret; 1897 u64 discarded_bytes = 0; 1898 struct btrfs_bio *bbio = NULL; 1899 1900 1901 /* Tell the block device(s) that the sectors can be discarded */ 1902 ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1903 bytenr, &num_bytes, &bbio, 0); 1904 /* Error condition is -ENOMEM */ 1905 if (!ret) { 1906 struct btrfs_bio_stripe *stripe = bbio->stripes; 1907 int i; 1908 1909 1910 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1911 if (!stripe->dev->can_discard) 1912 continue; 1913 1914 ret = btrfs_issue_discard(stripe->dev->bdev, 1915 stripe->physical, 1916 stripe->length); 1917 if (!ret) 1918 discarded_bytes += stripe->length; 1919 else if (ret != -EOPNOTSUPP) 1920 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 1921 1922 /* 1923 * Just in case we get back EOPNOTSUPP for some reason, 1924 * just ignore the return value so we don't screw up 1925 * people calling discard_extent. 
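			 *
			 * (For reference, btrfs_issue_discard() above converts
			 * bytes to 512-byte sectors for blkdev_issue_discard();
			 * e.g. an illustrative 1MiB stripe at byte offset
			 * 1048576 is issued as sector 2048, 2048 sectors long.)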
1926 */ 1927 ret = 0; 1928 } 1929 btrfs_put_bbio(bbio); 1930 } 1931 1932 if (actual_bytes) 1933 *actual_bytes = discarded_bytes; 1934 1935 1936 if (ret == -EOPNOTSUPP) 1937 ret = 0; 1938 return ret; 1939 } 1940 1941 /* Can return -ENOMEM */ 1942 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1943 struct btrfs_root *root, 1944 u64 bytenr, u64 num_bytes, u64 parent, 1945 u64 root_objectid, u64 owner, u64 offset, 1946 int no_quota) 1947 { 1948 int ret; 1949 struct btrfs_fs_info *fs_info = root->fs_info; 1950 1951 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1952 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1953 1954 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1955 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1956 num_bytes, 1957 parent, root_objectid, (int)owner, 1958 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1959 } else { 1960 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1961 num_bytes, 1962 parent, root_objectid, owner, offset, 1963 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1964 } 1965 return ret; 1966 } 1967 1968 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1969 struct btrfs_root *root, 1970 u64 bytenr, u64 num_bytes, 1971 u64 parent, u64 root_objectid, 1972 u64 owner, u64 offset, int refs_to_add, 1973 int no_quota, 1974 struct btrfs_delayed_extent_op *extent_op) 1975 { 1976 struct btrfs_fs_info *fs_info = root->fs_info; 1977 struct btrfs_path *path; 1978 struct extent_buffer *leaf; 1979 struct btrfs_extent_item *item; 1980 struct btrfs_key key; 1981 u64 refs; 1982 int ret; 1983 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; 1984 1985 path = btrfs_alloc_path(); 1986 if (!path) 1987 return -ENOMEM; 1988 1989 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) 1990 no_quota = 1; 1991 1992 path->reada = 1; 1993 path->leave_spinning = 1; 1994 /* this will setup the path even if it fails to insert the back ref */ 1995 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, 1996 bytenr, num_bytes, parent, 1997 root_objectid, owner, offset, 1998 refs_to_add, extent_op); 1999 if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) 2000 goto out; 2001 /* 2002 * Ok we were able to insert an inline extent and it appears to be a new 2003 * reference, deal with the qgroup accounting. 2004 */ 2005 if (!ret && !no_quota) { 2006 ASSERT(root->fs_info->quota_enabled); 2007 leaf = path->nodes[0]; 2008 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2009 item = btrfs_item_ptr(leaf, path->slots[0], 2010 struct btrfs_extent_item); 2011 if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) 2012 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2013 btrfs_release_path(path); 2014 2015 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2016 bytenr, num_bytes, type, 0); 2017 goto out; 2018 } 2019 2020 /* 2021 * Ok we had -EAGAIN which means we didn't have space to insert and 2022 * inline extent ref, so just update the reference count and add a 2023 * normal backref. 
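 *
 * Roughly, the fallback below is a two step update: bump the ref
 * count on the extent item the failed lookup left us positioned at,
 * record the qgroup change if needed, and then insert a separate
 * keyed backref item via insert_extent_backref().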
2024 */ 2025 leaf = path->nodes[0]; 2026 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2027 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2028 refs = btrfs_extent_refs(leaf, item); 2029 if (refs) 2030 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2031 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2032 if (extent_op) 2033 __run_delayed_extent_op(extent_op, leaf, item); 2034 2035 btrfs_mark_buffer_dirty(leaf); 2036 btrfs_release_path(path); 2037 2038 if (!no_quota) { 2039 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2040 bytenr, num_bytes, type, 0); 2041 if (ret) 2042 goto out; 2043 } 2044 2045 path->reada = 1; 2046 path->leave_spinning = 1; 2047 /* now insert the actual backref */ 2048 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2049 path, bytenr, parent, root_objectid, 2050 owner, offset, refs_to_add); 2051 if (ret) 2052 btrfs_abort_transaction(trans, root, ret); 2053 out: 2054 btrfs_free_path(path); 2055 return ret; 2056 } 2057 2058 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2059 struct btrfs_root *root, 2060 struct btrfs_delayed_ref_node *node, 2061 struct btrfs_delayed_extent_op *extent_op, 2062 int insert_reserved) 2063 { 2064 int ret = 0; 2065 struct btrfs_delayed_data_ref *ref; 2066 struct btrfs_key ins; 2067 u64 parent = 0; 2068 u64 ref_root = 0; 2069 u64 flags = 0; 2070 2071 ins.objectid = node->bytenr; 2072 ins.offset = node->num_bytes; 2073 ins.type = BTRFS_EXTENT_ITEM_KEY; 2074 2075 ref = btrfs_delayed_node_to_data_ref(node); 2076 trace_run_delayed_data_ref(node, ref, node->action); 2077 2078 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2079 parent = ref->parent; 2080 ref_root = ref->root; 2081 2082 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2083 if (extent_op) 2084 flags |= extent_op->flags_to_set; 2085 ret = alloc_reserved_file_extent(trans, root, 2086 parent, ref_root, flags, 2087 ref->objectid, ref->offset, 2088 &ins, node->ref_mod); 2089 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2090 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2091 node->num_bytes, parent, 2092 ref_root, ref->objectid, 2093 ref->offset, node->ref_mod, 2094 node->no_quota, extent_op); 2095 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2096 ret = __btrfs_free_extent(trans, root, node->bytenr, 2097 node->num_bytes, parent, 2098 ref_root, ref->objectid, 2099 ref->offset, node->ref_mod, 2100 extent_op, node->no_quota); 2101 } else { 2102 BUG(); 2103 } 2104 return ret; 2105 } 2106 2107 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2108 struct extent_buffer *leaf, 2109 struct btrfs_extent_item *ei) 2110 { 2111 u64 flags = btrfs_extent_flags(leaf, ei); 2112 if (extent_op->update_flags) { 2113 flags |= extent_op->flags_to_set; 2114 btrfs_set_extent_flags(leaf, ei, flags); 2115 } 2116 2117 if (extent_op->update_key) { 2118 struct btrfs_tree_block_info *bi; 2119 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2120 bi = (struct btrfs_tree_block_info *)(ei + 1); 2121 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2122 } 2123 } 2124 2125 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2126 struct btrfs_root *root, 2127 struct btrfs_delayed_ref_node *node, 2128 struct btrfs_delayed_extent_op *extent_op) 2129 { 2130 struct btrfs_key key; 2131 struct btrfs_path *path; 2132 struct btrfs_extent_item *ei; 2133 struct extent_buffer *leaf; 2134 u32 item_size; 2135 int ret; 2136 int err = 0; 2137 int metadata = !extent_op->is_data; 2138 
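	/*
	 * Sketch of what follows: find the EXTENT_ITEM (or, on
	 * skinny-metadata filesystems, the METADATA_ITEM keyed by level)
	 * for node->bytenr and apply the deferred flag and/or key update
	 * carried in extent_op to it.
	 */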
2139 if (trans->aborted) 2140 return 0; 2141 2142 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2143 metadata = 0; 2144 2145 path = btrfs_alloc_path(); 2146 if (!path) 2147 return -ENOMEM; 2148 2149 key.objectid = node->bytenr; 2150 2151 if (metadata) { 2152 key.type = BTRFS_METADATA_ITEM_KEY; 2153 key.offset = extent_op->level; 2154 } else { 2155 key.type = BTRFS_EXTENT_ITEM_KEY; 2156 key.offset = node->num_bytes; 2157 } 2158 2159 again: 2160 path->reada = 1; 2161 path->leave_spinning = 1; 2162 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2163 path, 0, 1); 2164 if (ret < 0) { 2165 err = ret; 2166 goto out; 2167 } 2168 if (ret > 0) { 2169 if (metadata) { 2170 if (path->slots[0] > 0) { 2171 path->slots[0]--; 2172 btrfs_item_key_to_cpu(path->nodes[0], &key, 2173 path->slots[0]); 2174 if (key.objectid == node->bytenr && 2175 key.type == BTRFS_EXTENT_ITEM_KEY && 2176 key.offset == node->num_bytes) 2177 ret = 0; 2178 } 2179 if (ret > 0) { 2180 btrfs_release_path(path); 2181 metadata = 0; 2182 2183 key.objectid = node->bytenr; 2184 key.offset = node->num_bytes; 2185 key.type = BTRFS_EXTENT_ITEM_KEY; 2186 goto again; 2187 } 2188 } else { 2189 err = -EIO; 2190 goto out; 2191 } 2192 } 2193 2194 leaf = path->nodes[0]; 2195 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2196 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2197 if (item_size < sizeof(*ei)) { 2198 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2199 path, (u64)-1, 0); 2200 if (ret < 0) { 2201 err = ret; 2202 goto out; 2203 } 2204 leaf = path->nodes[0]; 2205 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2206 } 2207 #endif 2208 BUG_ON(item_size < sizeof(*ei)); 2209 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2210 __run_delayed_extent_op(extent_op, leaf, ei); 2211 2212 btrfs_mark_buffer_dirty(leaf); 2213 out: 2214 btrfs_free_path(path); 2215 return err; 2216 } 2217 2218 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2219 struct btrfs_root *root, 2220 struct btrfs_delayed_ref_node *node, 2221 struct btrfs_delayed_extent_op *extent_op, 2222 int insert_reserved) 2223 { 2224 int ret = 0; 2225 struct btrfs_delayed_tree_ref *ref; 2226 struct btrfs_key ins; 2227 u64 parent = 0; 2228 u64 ref_root = 0; 2229 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2230 SKINNY_METADATA); 2231 2232 ref = btrfs_delayed_node_to_tree_ref(node); 2233 trace_run_delayed_tree_ref(node, ref, node->action); 2234 2235 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2236 parent = ref->parent; 2237 ref_root = ref->root; 2238 2239 ins.objectid = node->bytenr; 2240 if (skinny_metadata) { 2241 ins.offset = ref->level; 2242 ins.type = BTRFS_METADATA_ITEM_KEY; 2243 } else { 2244 ins.offset = node->num_bytes; 2245 ins.type = BTRFS_EXTENT_ITEM_KEY; 2246 } 2247 2248 BUG_ON(node->ref_mod != 1); 2249 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2250 BUG_ON(!extent_op || !extent_op->update_flags); 2251 ret = alloc_reserved_tree_block(trans, root, 2252 parent, ref_root, 2253 extent_op->flags_to_set, 2254 &extent_op->key, 2255 ref->level, &ins, 2256 node->no_quota); 2257 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2258 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2259 node->num_bytes, parent, ref_root, 2260 ref->level, 0, 1, node->no_quota, 2261 extent_op); 2262 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2263 ret = __btrfs_free_extent(trans, root, node->bytenr, 2264 node->num_bytes, parent, ref_root, 2265 ref->level, 0, 1, 
extent_op, 2266 node->no_quota); 2267 } else { 2268 BUG(); 2269 } 2270 return ret; 2271 } 2272 2273 /* helper function to actually process a single delayed ref entry */ 2274 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2275 struct btrfs_root *root, 2276 struct btrfs_delayed_ref_node *node, 2277 struct btrfs_delayed_extent_op *extent_op, 2278 int insert_reserved) 2279 { 2280 int ret = 0; 2281 2282 if (trans->aborted) { 2283 if (insert_reserved) 2284 btrfs_pin_extent(root, node->bytenr, 2285 node->num_bytes, 1); 2286 return 0; 2287 } 2288 2289 if (btrfs_delayed_ref_is_head(node)) { 2290 struct btrfs_delayed_ref_head *head; 2291 /* 2292 * we've hit the end of the chain and we were supposed 2293 * to insert this extent into the tree. But, it got 2294 * deleted before we ever needed to insert it, so all 2295 * we have to do is clean up the accounting 2296 */ 2297 BUG_ON(extent_op); 2298 head = btrfs_delayed_node_to_head(node); 2299 trace_run_delayed_ref_head(node, head, node->action); 2300 2301 if (insert_reserved) { 2302 btrfs_pin_extent(root, node->bytenr, 2303 node->num_bytes, 1); 2304 if (head->is_data) { 2305 ret = btrfs_del_csums(trans, root, 2306 node->bytenr, 2307 node->num_bytes); 2308 } 2309 } 2310 return ret; 2311 } 2312 2313 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2314 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2315 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2316 insert_reserved); 2317 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2318 node->type == BTRFS_SHARED_DATA_REF_KEY) 2319 ret = run_delayed_data_ref(trans, root, node, extent_op, 2320 insert_reserved); 2321 else 2322 BUG(); 2323 return ret; 2324 } 2325 2326 static noinline struct btrfs_delayed_ref_node * 2327 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2328 { 2329 struct rb_node *node; 2330 struct btrfs_delayed_ref_node *ref, *last = NULL;; 2331 2332 /* 2333 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2334 * this prevents ref count from going down to zero when 2335 * there still are pending delayed ref. 2336 */ 2337 node = rb_first(&head->ref_root); 2338 while (node) { 2339 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2340 rb_node); 2341 if (ref->action == BTRFS_ADD_DELAYED_REF) 2342 return ref; 2343 else if (last == NULL) 2344 last = ref; 2345 node = rb_next(node); 2346 } 2347 return last; 2348 } 2349 2350 /* 2351 * Returns 0 on success or if called with an already aborted transaction. 2352 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
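 *
 * Very roughly, each pass of the loop below: locks one ref head,
 * merges matching add/drop refs queued against it, runs ADD refs
 * before DROP refs (see select_delayed_ref()) so the count cannot hit
 * zero early, and hands each ref to run_one_delayed_ref().  We stop
 * once about 'nr' updates have been processed.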
2353 */ 2354 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2355 struct btrfs_root *root, 2356 unsigned long nr) 2357 { 2358 struct btrfs_delayed_ref_root *delayed_refs; 2359 struct btrfs_delayed_ref_node *ref; 2360 struct btrfs_delayed_ref_head *locked_ref = NULL; 2361 struct btrfs_delayed_extent_op *extent_op; 2362 struct btrfs_fs_info *fs_info = root->fs_info; 2363 ktime_t start = ktime_get(); 2364 int ret; 2365 unsigned long count = 0; 2366 unsigned long actual_count = 0; 2367 int must_insert_reserved = 0; 2368 2369 delayed_refs = &trans->transaction->delayed_refs; 2370 while (1) { 2371 if (!locked_ref) { 2372 if (count >= nr) 2373 break; 2374 2375 spin_lock(&delayed_refs->lock); 2376 locked_ref = btrfs_select_ref_head(trans); 2377 if (!locked_ref) { 2378 spin_unlock(&delayed_refs->lock); 2379 break; 2380 } 2381 2382 /* grab the lock that says we are going to process 2383 * all the refs for this head */ 2384 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2385 spin_unlock(&delayed_refs->lock); 2386 /* 2387 * we may have dropped the spin lock to get the head 2388 * mutex lock, and that might have given someone else 2389 * time to free the head. If that's true, it has been 2390 * removed from our list and we can move on. 2391 */ 2392 if (ret == -EAGAIN) { 2393 locked_ref = NULL; 2394 count++; 2395 continue; 2396 } 2397 } 2398 2399 /* 2400 * We need to try and merge add/drops of the same ref since we 2401 * can run into issues with relocate dropping the implicit ref 2402 * and then it being added back again before the drop can 2403 * finish. If we merged anything we need to re-loop so we can 2404 * get a good ref. 2405 */ 2406 spin_lock(&locked_ref->lock); 2407 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2408 locked_ref); 2409 2410 /* 2411 * locked_ref is the head node, so we have to go one 2412 * node back for any delayed ref updates 2413 */ 2414 ref = select_delayed_ref(locked_ref); 2415 2416 if (ref && ref->seq && 2417 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2418 spin_unlock(&locked_ref->lock); 2419 btrfs_delayed_ref_unlock(locked_ref); 2420 spin_lock(&delayed_refs->lock); 2421 locked_ref->processing = 0; 2422 delayed_refs->num_heads_ready++; 2423 spin_unlock(&delayed_refs->lock); 2424 locked_ref = NULL; 2425 cond_resched(); 2426 count++; 2427 continue; 2428 } 2429 2430 /* 2431 * record the must insert reserved flag before we 2432 * drop the spin lock. 2433 */ 2434 must_insert_reserved = locked_ref->must_insert_reserved; 2435 locked_ref->must_insert_reserved = 0; 2436 2437 extent_op = locked_ref->extent_op; 2438 locked_ref->extent_op = NULL; 2439 2440 if (!ref) { 2441 2442 2443 /* All delayed refs have been processed, Go ahead 2444 * and send the head node to run_one_delayed_ref, 2445 * so that any accounting fixes can happen 2446 */ 2447 ref = &locked_ref->node; 2448 2449 if (extent_op && must_insert_reserved) { 2450 btrfs_free_delayed_extent_op(extent_op); 2451 extent_op = NULL; 2452 } 2453 2454 if (extent_op) { 2455 spin_unlock(&locked_ref->lock); 2456 ret = run_delayed_extent_op(trans, root, 2457 ref, extent_op); 2458 btrfs_free_delayed_extent_op(extent_op); 2459 2460 if (ret) { 2461 /* 2462 * Need to reset must_insert_reserved if 2463 * there was an error so the abort stuff 2464 * can cleanup the reserved space 2465 * properly. 
2466 */ 2467 if (must_insert_reserved) 2468 locked_ref->must_insert_reserved = 1; 2469 locked_ref->processing = 0; 2470 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2471 btrfs_delayed_ref_unlock(locked_ref); 2472 return ret; 2473 } 2474 continue; 2475 } 2476 2477 /* 2478 * Need to drop our head ref lock and re-aqcuire the 2479 * delayed ref lock and then re-check to make sure 2480 * nobody got added. 2481 */ 2482 spin_unlock(&locked_ref->lock); 2483 spin_lock(&delayed_refs->lock); 2484 spin_lock(&locked_ref->lock); 2485 if (rb_first(&locked_ref->ref_root) || 2486 locked_ref->extent_op) { 2487 spin_unlock(&locked_ref->lock); 2488 spin_unlock(&delayed_refs->lock); 2489 continue; 2490 } 2491 ref->in_tree = 0; 2492 delayed_refs->num_heads--; 2493 rb_erase(&locked_ref->href_node, 2494 &delayed_refs->href_root); 2495 spin_unlock(&delayed_refs->lock); 2496 } else { 2497 actual_count++; 2498 ref->in_tree = 0; 2499 rb_erase(&ref->rb_node, &locked_ref->ref_root); 2500 } 2501 atomic_dec(&delayed_refs->num_entries); 2502 2503 if (!btrfs_delayed_ref_is_head(ref)) { 2504 /* 2505 * when we play the delayed ref, also correct the 2506 * ref_mod on head 2507 */ 2508 switch (ref->action) { 2509 case BTRFS_ADD_DELAYED_REF: 2510 case BTRFS_ADD_DELAYED_EXTENT: 2511 locked_ref->node.ref_mod -= ref->ref_mod; 2512 break; 2513 case BTRFS_DROP_DELAYED_REF: 2514 locked_ref->node.ref_mod += ref->ref_mod; 2515 break; 2516 default: 2517 WARN_ON(1); 2518 } 2519 } 2520 spin_unlock(&locked_ref->lock); 2521 2522 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2523 must_insert_reserved); 2524 2525 btrfs_free_delayed_extent_op(extent_op); 2526 if (ret) { 2527 locked_ref->processing = 0; 2528 btrfs_delayed_ref_unlock(locked_ref); 2529 btrfs_put_delayed_ref(ref); 2530 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2531 return ret; 2532 } 2533 2534 /* 2535 * If this node is a head, that means all the refs in this head 2536 * have been dealt with, and we will pick the next head to deal 2537 * with, so we must unlock the head and drop it from the cluster 2538 * list before we release it. 2539 */ 2540 if (btrfs_delayed_ref_is_head(ref)) { 2541 btrfs_delayed_ref_unlock(locked_ref); 2542 locked_ref = NULL; 2543 } 2544 btrfs_put_delayed_ref(ref); 2545 count++; 2546 cond_resched(); 2547 } 2548 2549 /* 2550 * We don't want to include ref heads since we can have empty ref heads 2551 * and those will drastically skew our runtime down since we just do 2552 * accounting, no actual extent tree updates. 2553 */ 2554 if (actual_count > 0) { 2555 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2556 u64 avg; 2557 2558 /* 2559 * We weigh the current average higher than our current runtime 2560 * to avoid large swings in the average. 2561 */ 2562 spin_lock(&delayed_refs->lock); 2563 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2564 avg = div64_u64(avg, 4); 2565 fs_info->avg_delayed_ref_runtime = avg; 2566 spin_unlock(&delayed_refs->lock); 2567 } 2568 return 0; 2569 } 2570 2571 #ifdef SCRAMBLE_DELAYED_REFS 2572 /* 2573 * Normally delayed refs get processed in ascending bytenr order. This 2574 * correlates in most cases to the order added. 
To expose dependencies on this 2575 * order, we start to process the tree in the middle instead of the beginning 2576 */ 2577 static u64 find_middle(struct rb_root *root) 2578 { 2579 struct rb_node *n = root->rb_node; 2580 struct btrfs_delayed_ref_node *entry; 2581 int alt = 1; 2582 u64 middle; 2583 u64 first = 0, last = 0; 2584 2585 n = rb_first(root); 2586 if (n) { 2587 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2588 first = entry->bytenr; 2589 } 2590 n = rb_last(root); 2591 if (n) { 2592 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2593 last = entry->bytenr; 2594 } 2595 n = root->rb_node; 2596 2597 while (n) { 2598 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2599 WARN_ON(!entry->in_tree); 2600 2601 middle = entry->bytenr; 2602 2603 if (alt) 2604 n = n->rb_left; 2605 else 2606 n = n->rb_right; 2607 2608 alt = 1 - alt; 2609 } 2610 return middle; 2611 } 2612 #endif 2613 2614 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2615 { 2616 u64 num_bytes; 2617 2618 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2619 sizeof(struct btrfs_extent_inline_ref)); 2620 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2621 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2622 2623 /* 2624 * We don't ever fill up leaves all the way so multiply by 2 just to be 2625 * closer to what we're really going to want to use. 2626 */ 2627 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2628 } 2629 2630 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2631 struct btrfs_root *root) 2632 { 2633 struct btrfs_block_rsv *global_rsv; 2634 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2635 u64 num_bytes; 2636 int ret = 0; 2637 2638 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2639 num_heads = heads_to_leaves(root, num_heads); 2640 if (num_heads > 1) 2641 num_bytes += (num_heads - 1) * root->nodesize; 2642 num_bytes <<= 1; 2643 global_rsv = &root->fs_info->global_block_rsv; 2644 2645 /* 2646 * If we can't allocate any more chunks let's make sure we have _lots_ of 2647 * wiggle room since running delayed refs can create more delayed refs.
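 *
 * In rough terms the estimate above is one unit of transaction
 * metadata plus one node per additional leaf worth of pending heads,
 * already doubled once; it is doubled again below when no new chunks
 * can be allocated, and we report pressure (return 1) once the global
 * reserve no longer covers it.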
2648 */ 2649 if (global_rsv->space_info->full) 2650 num_bytes <<= 1; 2651 2652 spin_lock(&global_rsv->lock); 2653 if (global_rsv->reserved <= num_bytes) 2654 ret = 1; 2655 spin_unlock(&global_rsv->lock); 2656 return ret; 2657 } 2658 2659 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2660 struct btrfs_root *root) 2661 { 2662 struct btrfs_fs_info *fs_info = root->fs_info; 2663 u64 num_entries = 2664 atomic_read(&trans->transaction->delayed_refs.num_entries); 2665 u64 avg_runtime; 2666 u64 val; 2667 2668 smp_mb(); 2669 avg_runtime = fs_info->avg_delayed_ref_runtime; 2670 val = num_entries * avg_runtime; 2671 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2672 return 1; 2673 if (val >= NSEC_PER_SEC / 2) 2674 return 2; 2675 2676 return btrfs_check_space_for_delayed_refs(trans, root); 2677 } 2678 2679 struct async_delayed_refs { 2680 struct btrfs_root *root; 2681 int count; 2682 int error; 2683 int sync; 2684 struct completion wait; 2685 struct btrfs_work work; 2686 }; 2687 2688 static void delayed_ref_async_start(struct btrfs_work *work) 2689 { 2690 struct async_delayed_refs *async; 2691 struct btrfs_trans_handle *trans; 2692 int ret; 2693 2694 async = container_of(work, struct async_delayed_refs, work); 2695 2696 trans = btrfs_join_transaction(async->root); 2697 if (IS_ERR(trans)) { 2698 async->error = PTR_ERR(trans); 2699 goto done; 2700 } 2701 2702 /* 2703 * trans->sync means that when we call end_transaciton, we won't 2704 * wait on delayed refs 2705 */ 2706 trans->sync = true; 2707 ret = btrfs_run_delayed_refs(trans, async->root, async->count); 2708 if (ret) 2709 async->error = ret; 2710 2711 ret = btrfs_end_transaction(trans, async->root); 2712 if (ret && !async->error) 2713 async->error = ret; 2714 done: 2715 if (async->sync) 2716 complete(&async->wait); 2717 else 2718 kfree(async); 2719 } 2720 2721 int btrfs_async_run_delayed_refs(struct btrfs_root *root, 2722 unsigned long count, int wait) 2723 { 2724 struct async_delayed_refs *async; 2725 int ret; 2726 2727 async = kmalloc(sizeof(*async), GFP_NOFS); 2728 if (!async) 2729 return -ENOMEM; 2730 2731 async->root = root->fs_info->tree_root; 2732 async->count = count; 2733 async->error = 0; 2734 if (wait) 2735 async->sync = 1; 2736 else 2737 async->sync = 0; 2738 init_completion(&async->wait); 2739 2740 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2741 delayed_ref_async_start, NULL, NULL); 2742 2743 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2744 2745 if (wait) { 2746 wait_for_completion(&async->wait); 2747 ret = async->error; 2748 kfree(async); 2749 return ret; 2750 } 2751 return 0; 2752 } 2753 2754 /* 2755 * this starts processing the delayed reference count updates and 2756 * extent insertions we have queued up so far. count can be 2757 * 0, which means to process everything in the tree at the start 2758 * of the run (but not newly added entries), or it can be some target 2759 * number you'd like to process. 
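 *
 * For example, btrfs_run_delayed_refs(trans, root, 0) processes
 * roughly what was queued when the call was made, while a count of
 * (unsigned long)-1 keeps looping until the whole ref tree is empty.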
2760 * 2761 * Returns 0 on success or if called with an aborted transaction 2762 * Returns <0 on error and aborts the transaction 2763 */ 2764 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2765 struct btrfs_root *root, unsigned long count) 2766 { 2767 struct rb_node *node; 2768 struct btrfs_delayed_ref_root *delayed_refs; 2769 struct btrfs_delayed_ref_head *head; 2770 int ret; 2771 int run_all = count == (unsigned long)-1; 2772 2773 /* We'll clean this up in btrfs_cleanup_transaction */ 2774 if (trans->aborted) 2775 return 0; 2776 2777 if (root == root->fs_info->extent_root) 2778 root = root->fs_info->tree_root; 2779 2780 delayed_refs = &trans->transaction->delayed_refs; 2781 if (count == 0) 2782 count = atomic_read(&delayed_refs->num_entries) * 2; 2783 2784 again: 2785 #ifdef SCRAMBLE_DELAYED_REFS 2786 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2787 #endif 2788 ret = __btrfs_run_delayed_refs(trans, root, count); 2789 if (ret < 0) { 2790 btrfs_abort_transaction(trans, root, ret); 2791 return ret; 2792 } 2793 2794 if (run_all) { 2795 if (!list_empty(&trans->new_bgs)) 2796 btrfs_create_pending_block_groups(trans, root); 2797 2798 spin_lock(&delayed_refs->lock); 2799 node = rb_first(&delayed_refs->href_root); 2800 if (!node) { 2801 spin_unlock(&delayed_refs->lock); 2802 goto out; 2803 } 2804 count = (unsigned long)-1; 2805 2806 while (node) { 2807 head = rb_entry(node, struct btrfs_delayed_ref_head, 2808 href_node); 2809 if (btrfs_delayed_ref_is_head(&head->node)) { 2810 struct btrfs_delayed_ref_node *ref; 2811 2812 ref = &head->node; 2813 atomic_inc(&ref->refs); 2814 2815 spin_unlock(&delayed_refs->lock); 2816 /* 2817 * Mutex was contended, block until it's 2818 * released and try again 2819 */ 2820 mutex_lock(&head->mutex); 2821 mutex_unlock(&head->mutex); 2822 2823 btrfs_put_delayed_ref(ref); 2824 cond_resched(); 2825 goto again; 2826 } else { 2827 WARN_ON(1); 2828 } 2829 node = rb_next(node); 2830 } 2831 spin_unlock(&delayed_refs->lock); 2832 cond_resched(); 2833 goto again; 2834 } 2835 out: 2836 ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); 2837 if (ret) 2838 return ret; 2839 assert_qgroups_uptodate(trans); 2840 return 0; 2841 } 2842 2843 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2844 struct btrfs_root *root, 2845 u64 bytenr, u64 num_bytes, u64 flags, 2846 int level, int is_data) 2847 { 2848 struct btrfs_delayed_extent_op *extent_op; 2849 int ret; 2850 2851 extent_op = btrfs_alloc_delayed_extent_op(); 2852 if (!extent_op) 2853 return -ENOMEM; 2854 2855 extent_op->flags_to_set = flags; 2856 extent_op->update_flags = 1; 2857 extent_op->update_key = 0; 2858 extent_op->is_data = is_data ? 
1 : 0; 2859 extent_op->level = level; 2860 2861 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2862 num_bytes, extent_op); 2863 if (ret) 2864 btrfs_free_delayed_extent_op(extent_op); 2865 return ret; 2866 } 2867 2868 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2869 struct btrfs_root *root, 2870 struct btrfs_path *path, 2871 u64 objectid, u64 offset, u64 bytenr) 2872 { 2873 struct btrfs_delayed_ref_head *head; 2874 struct btrfs_delayed_ref_node *ref; 2875 struct btrfs_delayed_data_ref *data_ref; 2876 struct btrfs_delayed_ref_root *delayed_refs; 2877 struct rb_node *node; 2878 int ret = 0; 2879 2880 delayed_refs = &trans->transaction->delayed_refs; 2881 spin_lock(&delayed_refs->lock); 2882 head = btrfs_find_delayed_ref_head(trans, bytenr); 2883 if (!head) { 2884 spin_unlock(&delayed_refs->lock); 2885 return 0; 2886 } 2887 2888 if (!mutex_trylock(&head->mutex)) { 2889 atomic_inc(&head->node.refs); 2890 spin_unlock(&delayed_refs->lock); 2891 2892 btrfs_release_path(path); 2893 2894 /* 2895 * Mutex was contended, block until it's released and let 2896 * caller try again 2897 */ 2898 mutex_lock(&head->mutex); 2899 mutex_unlock(&head->mutex); 2900 btrfs_put_delayed_ref(&head->node); 2901 return -EAGAIN; 2902 } 2903 spin_unlock(&delayed_refs->lock); 2904 2905 spin_lock(&head->lock); 2906 node = rb_first(&head->ref_root); 2907 while (node) { 2908 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2909 node = rb_next(node); 2910 2911 /* If it's a shared ref we know a cross reference exists */ 2912 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2913 ret = 1; 2914 break; 2915 } 2916 2917 data_ref = btrfs_delayed_node_to_data_ref(ref); 2918 2919 /* 2920 * If our ref doesn't match the one we're currently looking at 2921 * then we have a cross reference. 
2922 */ 2923 if (data_ref->root != root->root_key.objectid || 2924 data_ref->objectid != objectid || 2925 data_ref->offset != offset) { 2926 ret = 1; 2927 break; 2928 } 2929 } 2930 spin_unlock(&head->lock); 2931 mutex_unlock(&head->mutex); 2932 return ret; 2933 } 2934 2935 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2936 struct btrfs_root *root, 2937 struct btrfs_path *path, 2938 u64 objectid, u64 offset, u64 bytenr) 2939 { 2940 struct btrfs_root *extent_root = root->fs_info->extent_root; 2941 struct extent_buffer *leaf; 2942 struct btrfs_extent_data_ref *ref; 2943 struct btrfs_extent_inline_ref *iref; 2944 struct btrfs_extent_item *ei; 2945 struct btrfs_key key; 2946 u32 item_size; 2947 int ret; 2948 2949 key.objectid = bytenr; 2950 key.offset = (u64)-1; 2951 key.type = BTRFS_EXTENT_ITEM_KEY; 2952 2953 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2954 if (ret < 0) 2955 goto out; 2956 BUG_ON(ret == 0); /* Corruption */ 2957 2958 ret = -ENOENT; 2959 if (path->slots[0] == 0) 2960 goto out; 2961 2962 path->slots[0]--; 2963 leaf = path->nodes[0]; 2964 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2965 2966 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2967 goto out; 2968 2969 ret = 1; 2970 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2971 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2972 if (item_size < sizeof(*ei)) { 2973 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2974 goto out; 2975 } 2976 #endif 2977 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2978 2979 if (item_size != sizeof(*ei) + 2980 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2981 goto out; 2982 2983 if (btrfs_extent_generation(leaf, ei) <= 2984 btrfs_root_last_snapshot(&root->root_item)) 2985 goto out; 2986 2987 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2988 if (btrfs_extent_inline_ref_type(leaf, iref) != 2989 BTRFS_EXTENT_DATA_REF_KEY) 2990 goto out; 2991 2992 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2993 if (btrfs_extent_refs(leaf, ei) != 2994 btrfs_extent_data_ref_count(leaf, ref) || 2995 btrfs_extent_data_ref_root(leaf, ref) != 2996 root->root_key.objectid || 2997 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2998 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2999 goto out; 3000 3001 ret = 0; 3002 out: 3003 return ret; 3004 } 3005 3006 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3007 struct btrfs_root *root, 3008 u64 objectid, u64 offset, u64 bytenr) 3009 { 3010 struct btrfs_path *path; 3011 int ret; 3012 int ret2; 3013 3014 path = btrfs_alloc_path(); 3015 if (!path) 3016 return -ENOENT; 3017 3018 do { 3019 ret = check_committed_ref(trans, root, path, objectid, 3020 offset, bytenr); 3021 if (ret && ret != -ENOENT) 3022 goto out; 3023 3024 ret2 = check_delayed_ref(trans, root, path, objectid, 3025 offset, bytenr); 3026 } while (ret2 == -EAGAIN); 3027 3028 if (ret2 && ret2 != -ENOENT) { 3029 ret = ret2; 3030 goto out; 3031 } 3032 3033 if (ret != -ENOENT || ret2 != -ENOENT) 3034 ret = 0; 3035 out: 3036 btrfs_free_path(path); 3037 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3038 WARN_ON(ret > 0); 3039 return ret; 3040 } 3041 3042 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3043 struct btrfs_root *root, 3044 struct extent_buffer *buf, 3045 int full_backref, int inc) 3046 { 3047 u64 bytenr; 3048 u64 num_bytes; 3049 u64 parent; 3050 u64 ref_root; 3051 u32 nritems; 3052 struct btrfs_key key; 3053 struct btrfs_file_extent_item 
*fi; 3054 int i; 3055 int level; 3056 int ret = 0; 3057 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3058 u64, u64, u64, u64, u64, u64, int); 3059 3060 3061 if (btrfs_test_is_dummy_root(root)) 3062 return 0; 3063 3064 ref_root = btrfs_header_owner(buf); 3065 nritems = btrfs_header_nritems(buf); 3066 level = btrfs_header_level(buf); 3067 3068 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3069 return 0; 3070 3071 if (inc) 3072 process_func = btrfs_inc_extent_ref; 3073 else 3074 process_func = btrfs_free_extent; 3075 3076 if (full_backref) 3077 parent = buf->start; 3078 else 3079 parent = 0; 3080 3081 for (i = 0; i < nritems; i++) { 3082 if (level == 0) { 3083 btrfs_item_key_to_cpu(buf, &key, i); 3084 if (key.type != BTRFS_EXTENT_DATA_KEY) 3085 continue; 3086 fi = btrfs_item_ptr(buf, i, 3087 struct btrfs_file_extent_item); 3088 if (btrfs_file_extent_type(buf, fi) == 3089 BTRFS_FILE_EXTENT_INLINE) 3090 continue; 3091 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3092 if (bytenr == 0) 3093 continue; 3094 3095 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3096 key.offset -= btrfs_file_extent_offset(buf, fi); 3097 ret = process_func(trans, root, bytenr, num_bytes, 3098 parent, ref_root, key.objectid, 3099 key.offset, 1); 3100 if (ret) 3101 goto fail; 3102 } else { 3103 bytenr = btrfs_node_blockptr(buf, i); 3104 num_bytes = root->nodesize; 3105 ret = process_func(trans, root, bytenr, num_bytes, 3106 parent, ref_root, level - 1, 0, 3107 1); 3108 if (ret) 3109 goto fail; 3110 } 3111 } 3112 return 0; 3113 fail: 3114 return ret; 3115 } 3116 3117 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3118 struct extent_buffer *buf, int full_backref) 3119 { 3120 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3121 } 3122 3123 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3124 struct extent_buffer *buf, int full_backref) 3125 { 3126 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3127 } 3128 3129 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3130 struct btrfs_root *root, 3131 struct btrfs_path *path, 3132 struct btrfs_block_group_cache *cache) 3133 { 3134 int ret; 3135 struct btrfs_root *extent_root = root->fs_info->extent_root; 3136 unsigned long bi; 3137 struct extent_buffer *leaf; 3138 3139 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3140 if (ret) { 3141 if (ret > 0) 3142 ret = -ENOENT; 3143 goto fail; 3144 } 3145 3146 leaf = path->nodes[0]; 3147 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3148 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3149 btrfs_mark_buffer_dirty(leaf); 3150 btrfs_release_path(path); 3151 fail: 3152 if (ret) 3153 btrfs_abort_transaction(trans, root, ret); 3154 return ret; 3155 3156 } 3157 3158 static struct btrfs_block_group_cache * 3159 next_block_group(struct btrfs_root *root, 3160 struct btrfs_block_group_cache *cache) 3161 { 3162 struct rb_node *node; 3163 3164 spin_lock(&root->fs_info->block_group_cache_lock); 3165 3166 /* If our block group was removed, we need a full search. 
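 * When that happens RB_EMPTY_NODE() is true for its cache_node, so
 * rather than follow a stale rb_next() we redo the lookup starting at
 * the first byte past the removed group (key.objectid + key.offset).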
*/ 3167 if (RB_EMPTY_NODE(&cache->cache_node)) { 3168 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3169 3170 spin_unlock(&root->fs_info->block_group_cache_lock); 3171 btrfs_put_block_group(cache); 3172 cache = btrfs_lookup_first_block_group(root->fs_info, 3173 next_bytenr); 3174 return cache; 3175 } 3176 node = rb_next(&cache->cache_node); 3177 btrfs_put_block_group(cache); 3178 if (node) { 3179 cache = rb_entry(node, struct btrfs_block_group_cache, 3180 cache_node); 3181 btrfs_get_block_group(cache); 3182 } else 3183 cache = NULL; 3184 spin_unlock(&root->fs_info->block_group_cache_lock); 3185 return cache; 3186 } 3187 3188 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3189 struct btrfs_trans_handle *trans, 3190 struct btrfs_path *path) 3191 { 3192 struct btrfs_root *root = block_group->fs_info->tree_root; 3193 struct inode *inode = NULL; 3194 u64 alloc_hint = 0; 3195 int dcs = BTRFS_DC_ERROR; 3196 int num_pages = 0; 3197 int retries = 0; 3198 int ret = 0; 3199 3200 /* 3201 * If this block group is smaller than 100 megs don't bother caching the 3202 * block group. 3203 */ 3204 if (block_group->key.offset < (100 * 1024 * 1024)) { 3205 spin_lock(&block_group->lock); 3206 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3207 spin_unlock(&block_group->lock); 3208 return 0; 3209 } 3210 3211 if (trans->aborted) 3212 return 0; 3213 again: 3214 inode = lookup_free_space_inode(root, block_group, path); 3215 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3216 ret = PTR_ERR(inode); 3217 btrfs_release_path(path); 3218 goto out; 3219 } 3220 3221 if (IS_ERR(inode)) { 3222 BUG_ON(retries); 3223 retries++; 3224 3225 if (block_group->ro) 3226 goto out_free; 3227 3228 ret = create_free_space_inode(root, trans, block_group, path); 3229 if (ret) 3230 goto out_free; 3231 goto again; 3232 } 3233 3234 /* We've already setup this transaction, go ahead and exit */ 3235 if (block_group->cache_generation == trans->transid && 3236 i_size_read(inode)) { 3237 dcs = BTRFS_DC_SETUP; 3238 goto out_put; 3239 } 3240 3241 /* 3242 * We want to set the generation to 0, that way if anything goes wrong 3243 * from here on out we know not to trust this cache when we load up next 3244 * time. 3245 */ 3246 BTRFS_I(inode)->generation = 0; 3247 ret = btrfs_update_inode(trans, root, inode); 3248 if (ret) { 3249 /* 3250 * So theoretically we could recover from this, simply set the 3251 * super cache generation to 0 so we know to invalidate the 3252 * cache, but then we'd have to keep track of the block groups 3253 * that fail this way so we know we _have_ to reset this cache 3254 * before the next commit or risk reading stale cache. So to 3255 * limit our exposure to horrible edge cases lets just abort the 3256 * transaction, this only happens in really bad situations 3257 * anyway. 3258 */ 3259 btrfs_abort_transaction(trans, root, ret); 3260 goto out_put; 3261 } 3262 WARN_ON(ret); 3263 3264 if (i_size_read(inode) > 0) { 3265 ret = btrfs_check_trunc_cache_free_space(root, 3266 &root->fs_info->global_block_rsv); 3267 if (ret) 3268 goto out_put; 3269 3270 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3271 if (ret) 3272 goto out_put; 3273 } 3274 3275 spin_lock(&block_group->lock); 3276 if (block_group->cached != BTRFS_CACHE_FINISHED || 3277 !btrfs_test_opt(root, SPACE_CACHE) || 3278 block_group->delalloc_bytes) { 3279 /* 3280 * don't bother trying to write stuff out _if_ 3281 * a) we're not cached, 3282 * b) we're with nospace_cache mount option. 
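 * c) this block group still has delalloc bytes outstanding
 *    (block_group->delalloc_bytes is non-zero).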
3283 */ 3284 dcs = BTRFS_DC_WRITTEN; 3285 spin_unlock(&block_group->lock); 3286 goto out_put; 3287 } 3288 spin_unlock(&block_group->lock); 3289 3290 /* 3291 * Try to preallocate enough space based on how big the block group is. 3292 * Keep in mind this has to include any pinned space which could end up 3293 * taking up quite a bit since it's not folded into the other space 3294 * cache. 3295 */ 3296 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3297 if (!num_pages) 3298 num_pages = 1; 3299 3300 num_pages *= 16; 3301 num_pages *= PAGE_CACHE_SIZE; 3302 3303 ret = btrfs_check_data_free_space(inode, num_pages); 3304 if (ret) 3305 goto out_put; 3306 3307 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3308 num_pages, num_pages, 3309 &alloc_hint); 3310 if (!ret) 3311 dcs = BTRFS_DC_SETUP; 3312 btrfs_free_reserved_data_space(inode, num_pages); 3313 3314 out_put: 3315 iput(inode); 3316 out_free: 3317 btrfs_release_path(path); 3318 out: 3319 spin_lock(&block_group->lock); 3320 if (!ret && dcs == BTRFS_DC_SETUP) 3321 block_group->cache_generation = trans->transid; 3322 block_group->disk_cache_state = dcs; 3323 spin_unlock(&block_group->lock); 3324 3325 return ret; 3326 } 3327 3328 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3329 struct btrfs_root *root) 3330 { 3331 struct btrfs_block_group_cache *cache; 3332 struct btrfs_transaction *cur_trans = trans->transaction; 3333 int ret = 0; 3334 struct btrfs_path *path; 3335 3336 if (list_empty(&cur_trans->dirty_bgs)) 3337 return 0; 3338 3339 path = btrfs_alloc_path(); 3340 if (!path) 3341 return -ENOMEM; 3342 3343 /* 3344 * We don't need the lock here since we are protected by the transaction 3345 * commit. We want to do the cache_save_setup first and then run the 3346 * delayed refs to make sure we have the best chance at doing this all 3347 * in one shot. 
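 *
 * For each dirty block group the loop below roughly does: set up the
 * free space cache inode while the state is still BTRFS_DC_CLEAR, run
 * all delayed refs, write out the space cache for BTRFS_DC_SETUP
 * groups, and finally update the block group item itself through
 * write_one_cache_group().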
3348 */ 3349 while (!list_empty(&cur_trans->dirty_bgs)) { 3350 cache = list_first_entry(&cur_trans->dirty_bgs, 3351 struct btrfs_block_group_cache, 3352 dirty_list); 3353 list_del_init(&cache->dirty_list); 3354 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3355 cache_save_setup(cache, trans, path); 3356 if (!ret) 3357 ret = btrfs_run_delayed_refs(trans, root, 3358 (unsigned long) -1); 3359 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) 3360 btrfs_write_out_cache(root, trans, cache, path); 3361 if (!ret) 3362 ret = write_one_cache_group(trans, root, path, cache); 3363 btrfs_put_block_group(cache); 3364 } 3365 3366 btrfs_free_path(path); 3367 return ret; 3368 } 3369 3370 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3371 { 3372 struct btrfs_block_group_cache *block_group; 3373 int readonly = 0; 3374 3375 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3376 if (!block_group || block_group->ro) 3377 readonly = 1; 3378 if (block_group) 3379 btrfs_put_block_group(block_group); 3380 return readonly; 3381 } 3382 3383 static const char *alloc_name(u64 flags) 3384 { 3385 switch (flags) { 3386 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3387 return "mixed"; 3388 case BTRFS_BLOCK_GROUP_METADATA: 3389 return "metadata"; 3390 case BTRFS_BLOCK_GROUP_DATA: 3391 return "data"; 3392 case BTRFS_BLOCK_GROUP_SYSTEM: 3393 return "system"; 3394 default: 3395 WARN_ON(1); 3396 return "invalid-combination"; 3397 }; 3398 } 3399 3400 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3401 u64 total_bytes, u64 bytes_used, 3402 struct btrfs_space_info **space_info) 3403 { 3404 struct btrfs_space_info *found; 3405 int i; 3406 int factor; 3407 int ret; 3408 3409 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3410 BTRFS_BLOCK_GROUP_RAID10)) 3411 factor = 2; 3412 else 3413 factor = 1; 3414 3415 found = __find_space_info(info, flags); 3416 if (found) { 3417 spin_lock(&found->lock); 3418 found->total_bytes += total_bytes; 3419 found->disk_total += total_bytes * factor; 3420 found->bytes_used += bytes_used; 3421 found->disk_used += bytes_used * factor; 3422 found->full = 0; 3423 spin_unlock(&found->lock); 3424 *space_info = found; 3425 return 0; 3426 } 3427 found = kzalloc(sizeof(*found), GFP_NOFS); 3428 if (!found) 3429 return -ENOMEM; 3430 3431 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3432 if (ret) { 3433 kfree(found); 3434 return ret; 3435 } 3436 3437 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3438 INIT_LIST_HEAD(&found->block_groups[i]); 3439 init_rwsem(&found->groups_sem); 3440 spin_lock_init(&found->lock); 3441 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3442 found->total_bytes = total_bytes; 3443 found->disk_total = total_bytes * factor; 3444 found->bytes_used = bytes_used; 3445 found->disk_used = bytes_used * factor; 3446 found->bytes_pinned = 0; 3447 found->bytes_reserved = 0; 3448 found->bytes_readonly = 0; 3449 found->bytes_may_use = 0; 3450 found->full = 0; 3451 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3452 found->chunk_alloc = 0; 3453 found->flush = 0; 3454 init_waitqueue_head(&found->wait); 3455 INIT_LIST_HEAD(&found->ro_bgs); 3456 3457 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3458 info->space_info_kobj, "%s", 3459 alloc_name(found->flags)); 3460 if (ret) { 3461 kfree(found); 3462 return ret; 3463 } 3464 3465 *space_info = found; 3466 list_add_rcu(&found->list, &info->space_info); 3467 if (flags & BTRFS_BLOCK_GROUP_DATA) 3468 info->data_sinfo = found; 3469 3470 return 
ret; 3471 } 3472 3473 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3474 { 3475 u64 extra_flags = chunk_to_extended(flags) & 3476 BTRFS_EXTENDED_PROFILE_MASK; 3477 3478 write_seqlock(&fs_info->profiles_lock); 3479 if (flags & BTRFS_BLOCK_GROUP_DATA) 3480 fs_info->avail_data_alloc_bits |= extra_flags; 3481 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3482 fs_info->avail_metadata_alloc_bits |= extra_flags; 3483 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3484 fs_info->avail_system_alloc_bits |= extra_flags; 3485 write_sequnlock(&fs_info->profiles_lock); 3486 } 3487 3488 /* 3489 * returns target flags in extended format or 0 if restripe for this 3490 * chunk_type is not in progress 3491 * 3492 * should be called with either volume_mutex or balance_lock held 3493 */ 3494 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3495 { 3496 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3497 u64 target = 0; 3498 3499 if (!bctl) 3500 return 0; 3501 3502 if (flags & BTRFS_BLOCK_GROUP_DATA && 3503 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3504 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3505 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3506 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3507 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3508 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3509 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3510 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3511 } 3512 3513 return target; 3514 } 3515 3516 /* 3517 * @flags: available profiles in extended format (see ctree.h) 3518 * 3519 * Returns reduced profile in chunk format. If profile changing is in 3520 * progress (either running or paused) picks the target profile (if it's 3521 * already available), otherwise falls back to plain reducing. 
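 *
 * As an illustrative example with no restripe target: on a filesystem
 * with two or more rw devices, DATA|RAID0|RAID1 reduces to DATA|RAID1,
 * because RAID1 comes before RAID0 in the preference order used below
 * (RAID6 > RAID5 > RAID10 > RAID1 > RAID0).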
3522 */ 3523 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3524 { 3525 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3526 u64 target; 3527 u64 tmp; 3528 3529 /* 3530 * see if restripe for this chunk_type is in progress, if so 3531 * try to reduce to the target profile 3532 */ 3533 spin_lock(&root->fs_info->balance_lock); 3534 target = get_restripe_target(root->fs_info, flags); 3535 if (target) { 3536 /* pick target profile only if it's already available */ 3537 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3538 spin_unlock(&root->fs_info->balance_lock); 3539 return extended_to_chunk(target); 3540 } 3541 } 3542 spin_unlock(&root->fs_info->balance_lock); 3543 3544 /* First, mask out the RAID levels which aren't possible */ 3545 if (num_devices == 1) 3546 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3547 BTRFS_BLOCK_GROUP_RAID5); 3548 if (num_devices < 3) 3549 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3550 if (num_devices < 4) 3551 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3552 3553 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3554 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3555 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3556 flags &= ~tmp; 3557 3558 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3559 tmp = BTRFS_BLOCK_GROUP_RAID6; 3560 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3561 tmp = BTRFS_BLOCK_GROUP_RAID5; 3562 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3563 tmp = BTRFS_BLOCK_GROUP_RAID10; 3564 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3565 tmp = BTRFS_BLOCK_GROUP_RAID1; 3566 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3567 tmp = BTRFS_BLOCK_GROUP_RAID0; 3568 3569 return extended_to_chunk(flags | tmp); 3570 } 3571 3572 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 3573 { 3574 unsigned seq; 3575 u64 flags; 3576 3577 do { 3578 flags = orig_flags; 3579 seq = read_seqbegin(&root->fs_info->profiles_lock); 3580 3581 if (flags & BTRFS_BLOCK_GROUP_DATA) 3582 flags |= root->fs_info->avail_data_alloc_bits; 3583 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3584 flags |= root->fs_info->avail_system_alloc_bits; 3585 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3586 flags |= root->fs_info->avail_metadata_alloc_bits; 3587 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3588 3589 return btrfs_reduce_alloc_profile(root, flags); 3590 } 3591 3592 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3593 { 3594 u64 flags; 3595 u64 ret; 3596 3597 if (data) 3598 flags = BTRFS_BLOCK_GROUP_DATA; 3599 else if (root == root->fs_info->chunk_root) 3600 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3601 else 3602 flags = BTRFS_BLOCK_GROUP_METADATA; 3603 3604 ret = get_alloc_profile(root, flags); 3605 return ret; 3606 } 3607 3608 /* 3609 * This will check the space that the inode allocates from to make sure we have 3610 * enough space for bytes. 
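 *
 * Roughly: if used + bytes would exceed total_bytes we first try to
 * allocate a new data chunk, then (when enough pinned bytes could be
 * freed) fall back to committing the transaction, and only after that
 * return -ENOSPC; on success the bytes are added to bytes_may_use.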
3611 */ 3612 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3613 { 3614 struct btrfs_space_info *data_sinfo; 3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3616 struct btrfs_fs_info *fs_info = root->fs_info; 3617 u64 used; 3618 int ret = 0, committed = 0, alloc_chunk = 1; 3619 3620 /* make sure bytes are sectorsize aligned */ 3621 bytes = ALIGN(bytes, root->sectorsize); 3622 3623 if (btrfs_is_free_space_inode(inode)) { 3624 committed = 1; 3625 ASSERT(current->journal_info); 3626 } 3627 3628 data_sinfo = fs_info->data_sinfo; 3629 if (!data_sinfo) 3630 goto alloc; 3631 3632 again: 3633 /* make sure we have enough space to handle the data first */ 3634 spin_lock(&data_sinfo->lock); 3635 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3636 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3637 data_sinfo->bytes_may_use; 3638 3639 if (used + bytes > data_sinfo->total_bytes) { 3640 struct btrfs_trans_handle *trans; 3641 3642 /* 3643 * if we don't have enough free bytes in this space then we need 3644 * to alloc a new chunk. 3645 */ 3646 if (!data_sinfo->full && alloc_chunk) { 3647 u64 alloc_target; 3648 3649 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3650 spin_unlock(&data_sinfo->lock); 3651 alloc: 3652 alloc_target = btrfs_get_alloc_profile(root, 1); 3653 /* 3654 * It is ugly that we don't call nolock join 3655 * transaction for the free space inode case here. 3656 * But it is safe because we only do the data space 3657 * reservation for the free space cache in the 3658 * transaction context, the common join transaction 3659 * just increase the counter of the current transaction 3660 * handler, doesn't try to acquire the trans_lock of 3661 * the fs. 3662 */ 3663 trans = btrfs_join_transaction(root); 3664 if (IS_ERR(trans)) 3665 return PTR_ERR(trans); 3666 3667 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3668 alloc_target, 3669 CHUNK_ALLOC_NO_FORCE); 3670 btrfs_end_transaction(trans, root); 3671 if (ret < 0) { 3672 if (ret != -ENOSPC) 3673 return ret; 3674 else 3675 goto commit_trans; 3676 } 3677 3678 if (!data_sinfo) 3679 data_sinfo = fs_info->data_sinfo; 3680 3681 goto again; 3682 } 3683 3684 /* 3685 * If we don't have enough pinned space to deal with this 3686 * allocation don't bother committing the transaction. 3687 */ 3688 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3689 bytes) < 0) 3690 committed = 1; 3691 spin_unlock(&data_sinfo->lock); 3692 3693 /* commit the current transaction and try again */ 3694 commit_trans: 3695 if (!committed && 3696 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3697 committed = 1; 3698 3699 trans = btrfs_join_transaction(root); 3700 if (IS_ERR(trans)) 3701 return PTR_ERR(trans); 3702 ret = btrfs_commit_transaction(trans, root); 3703 if (ret) 3704 return ret; 3705 goto again; 3706 } 3707 3708 trace_btrfs_space_reservation(root->fs_info, 3709 "space_info:enospc", 3710 data_sinfo->flags, bytes, 1); 3711 return -ENOSPC; 3712 } 3713 data_sinfo->bytes_may_use += bytes; 3714 trace_btrfs_space_reservation(root->fs_info, "space_info", 3715 data_sinfo->flags, bytes, 1); 3716 spin_unlock(&data_sinfo->lock); 3717 3718 return 0; 3719 } 3720 3721 /* 3722 * Called if we need to clear a data reservation for this inode. 
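 * It is the counterpart of btrfs_check_data_free_space(): the
 * sectorsize aligned byte count is simply subtracted from
 * bytes_may_use again, as cache_save_setup() above does after
 * preallocating the space cache file.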
3723 */ 3724 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3725 { 3726 struct btrfs_root *root = BTRFS_I(inode)->root; 3727 struct btrfs_space_info *data_sinfo; 3728 3729 /* make sure bytes are sectorsize aligned */ 3730 bytes = ALIGN(bytes, root->sectorsize); 3731 3732 data_sinfo = root->fs_info->data_sinfo; 3733 spin_lock(&data_sinfo->lock); 3734 WARN_ON(data_sinfo->bytes_may_use < bytes); 3735 data_sinfo->bytes_may_use -= bytes; 3736 trace_btrfs_space_reservation(root->fs_info, "space_info", 3737 data_sinfo->flags, bytes, 0); 3738 spin_unlock(&data_sinfo->lock); 3739 } 3740 3741 static void force_metadata_allocation(struct btrfs_fs_info *info) 3742 { 3743 struct list_head *head = &info->space_info; 3744 struct btrfs_space_info *found; 3745 3746 rcu_read_lock(); 3747 list_for_each_entry_rcu(found, head, list) { 3748 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3749 found->force_alloc = CHUNK_ALLOC_FORCE; 3750 } 3751 rcu_read_unlock(); 3752 } 3753 3754 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3755 { 3756 return (global->size << 1); 3757 } 3758 3759 static int should_alloc_chunk(struct btrfs_root *root, 3760 struct btrfs_space_info *sinfo, int force) 3761 { 3762 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3763 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3764 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3765 u64 thresh; 3766 3767 if (force == CHUNK_ALLOC_FORCE) 3768 return 1; 3769 3770 /* 3771 * We need to take into account the global rsv because for all intents 3772 * and purposes it's used space. Don't worry about locking the 3773 * global_rsv, it doesn't change except when the transaction commits. 3774 */ 3775 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3776 num_allocated += calc_global_rsv_need_space(global_rsv); 3777 3778 /* 3779 * in limited mode, we want to have some free space up to 3780 * about 1% of the FS size. 
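 *
 * As a rough example, on a 1TiB filesystem the threshold below works
 * out to max(64MiB, ~10GiB), so a CHUNK_ALLOC_LIMITED request is
 * allowed once less than roughly 10GiB of this space_info remains
 * unallocated.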
3781 */ 3782 if (force == CHUNK_ALLOC_LIMITED) { 3783 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3784 thresh = max_t(u64, 64 * 1024 * 1024, 3785 div_factor_fine(thresh, 1)); 3786 3787 if (num_bytes - num_allocated < thresh) 3788 return 1; 3789 } 3790 3791 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3792 return 0; 3793 return 1; 3794 } 3795 3796 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3797 { 3798 u64 num_dev; 3799 3800 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3801 BTRFS_BLOCK_GROUP_RAID0 | 3802 BTRFS_BLOCK_GROUP_RAID5 | 3803 BTRFS_BLOCK_GROUP_RAID6)) 3804 num_dev = root->fs_info->fs_devices->rw_devices; 3805 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3806 num_dev = 2; 3807 else 3808 num_dev = 1; /* DUP or single */ 3809 3810 /* metadata for updaing devices and chunk tree */ 3811 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3812 } 3813 3814 static void check_system_chunk(struct btrfs_trans_handle *trans, 3815 struct btrfs_root *root, u64 type) 3816 { 3817 struct btrfs_space_info *info; 3818 u64 left; 3819 u64 thresh; 3820 3821 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3822 spin_lock(&info->lock); 3823 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3824 info->bytes_reserved - info->bytes_readonly; 3825 spin_unlock(&info->lock); 3826 3827 thresh = get_system_chunk_thresh(root, type); 3828 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3829 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3830 left, thresh, type); 3831 dump_space_info(info, 0, 0); 3832 } 3833 3834 if (left < thresh) { 3835 u64 flags; 3836 3837 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3838 btrfs_alloc_chunk(trans, root, flags); 3839 } 3840 } 3841 3842 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3843 struct btrfs_root *extent_root, u64 flags, int force) 3844 { 3845 struct btrfs_space_info *space_info; 3846 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3847 int wait_for_alloc = 0; 3848 int ret = 0; 3849 3850 /* Don't re-enter if we're already allocating a chunk */ 3851 if (trans->allocating_chunk) 3852 return -ENOSPC; 3853 3854 space_info = __find_space_info(extent_root->fs_info, flags); 3855 if (!space_info) { 3856 ret = update_space_info(extent_root->fs_info, flags, 3857 0, 0, &space_info); 3858 BUG_ON(ret); /* -ENOMEM */ 3859 } 3860 BUG_ON(!space_info); /* Logic error */ 3861 3862 again: 3863 spin_lock(&space_info->lock); 3864 if (force < space_info->force_alloc) 3865 force = space_info->force_alloc; 3866 if (space_info->full) { 3867 if (should_alloc_chunk(extent_root, space_info, force)) 3868 ret = -ENOSPC; 3869 else 3870 ret = 0; 3871 spin_unlock(&space_info->lock); 3872 return ret; 3873 } 3874 3875 if (!should_alloc_chunk(extent_root, space_info, force)) { 3876 spin_unlock(&space_info->lock); 3877 return 0; 3878 } else if (space_info->chunk_alloc) { 3879 wait_for_alloc = 1; 3880 } else { 3881 space_info->chunk_alloc = 1; 3882 } 3883 3884 spin_unlock(&space_info->lock); 3885 3886 mutex_lock(&fs_info->chunk_mutex); 3887 3888 /* 3889 * The chunk_mutex is held throughout the entirety of a chunk 3890 * allocation, so once we've acquired the chunk_mutex we know that the 3891 * other guy is done and we need to recheck and see if we should 3892 * allocate. 
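 *
 * That is what the wait_for_alloc path below does: it drops the mutex
 * and jumps back to 'again' so the decision is re-made under
 * space_info->lock, where the other allocation will typically have
 * either marked the space_info full or cleared chunk_alloc.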
3893 */ 3894 if (wait_for_alloc) { 3895 mutex_unlock(&fs_info->chunk_mutex); 3896 wait_for_alloc = 0; 3897 goto again; 3898 } 3899 3900 trans->allocating_chunk = true; 3901 3902 /* 3903 * If we have mixed data/metadata chunks we want to make sure we keep 3904 * allocating mixed chunks instead of individual chunks. 3905 */ 3906 if (btrfs_mixed_space_info(space_info)) 3907 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3908 3909 /* 3910 * if we're doing a data chunk, go ahead and make sure that 3911 * we keep a reasonable number of metadata chunks allocated in the 3912 * FS as well. 3913 */ 3914 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3915 fs_info->data_chunk_allocations++; 3916 if (!(fs_info->data_chunk_allocations % 3917 fs_info->metadata_ratio)) 3918 force_metadata_allocation(fs_info); 3919 } 3920 3921 /* 3922 * Check if we have enough space in SYSTEM chunk because we may need 3923 * to update devices. 3924 */ 3925 check_system_chunk(trans, extent_root, flags); 3926 3927 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3928 trans->allocating_chunk = false; 3929 3930 spin_lock(&space_info->lock); 3931 if (ret < 0 && ret != -ENOSPC) 3932 goto out; 3933 if (ret) 3934 space_info->full = 1; 3935 else 3936 ret = 1; 3937 3938 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3939 out: 3940 space_info->chunk_alloc = 0; 3941 spin_unlock(&space_info->lock); 3942 mutex_unlock(&fs_info->chunk_mutex); 3943 return ret; 3944 } 3945 3946 static int can_overcommit(struct btrfs_root *root, 3947 struct btrfs_space_info *space_info, u64 bytes, 3948 enum btrfs_reserve_flush_enum flush) 3949 { 3950 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3951 u64 profile = btrfs_get_alloc_profile(root, 0); 3952 u64 space_size; 3953 u64 avail; 3954 u64 used; 3955 3956 used = space_info->bytes_used + space_info->bytes_reserved + 3957 space_info->bytes_pinned + space_info->bytes_readonly; 3958 3959 /* 3960 * We only want to allow over committing if we have lots of actual space 3961 * free, but if we don't have enough space to handle the global reserve 3962 * space then we could end up having a real enospc problem when trying 3963 * to allocate a chunk or some other such important allocation. 3964 */ 3965 spin_lock(&global_rsv->lock); 3966 space_size = calc_global_rsv_need_space(global_rsv); 3967 spin_unlock(&global_rsv->lock); 3968 if (used + space_size >= space_info->total_bytes) 3969 return 0; 3970 3971 used += space_info->bytes_may_use; 3972 3973 spin_lock(&root->fs_info->free_chunk_lock); 3974 avail = root->fs_info->free_chunk_space; 3975 spin_unlock(&root->fs_info->free_chunk_lock); 3976 3977 /* 3978 * If we have dup, raid1 or raid10 then only half of the free 3979 * space is actually useable. For raid56, the space info used 3980 * doesn't include the parity drive, so we don't have to 3981 * change the math 3982 */ 3983 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3984 BTRFS_BLOCK_GROUP_RAID1 | 3985 BTRFS_BLOCK_GROUP_RAID10)) 3986 avail >>= 1; 3987 3988 /* 3989 * If we aren't flushing all things, let us overcommit up to 3990 * 1/2th of the space. If we can flush, don't let us overcommit 3991 * too much, let it overcommit up to 1/8 of the space. 
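 * Worked example: with 8GiB of unallocated device space and a DUP or
 * RAID1 profile, avail was already halved to 4GiB above; a
 * BTRFS_RESERVE_FLUSH_ALL caller may then overcommit by up to 512MiB
 * (4GiB >> 3) on top of space_info->total_bytes, while a caller that
 * cannot flush everything gets up to 2GiB of headroom.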
3992 */ 3993 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3994 avail >>= 3; 3995 else 3996 avail >>= 1; 3997 3998 if (used + bytes < space_info->total_bytes + avail) 3999 return 1; 4000 return 0; 4001 } 4002 4003 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4004 unsigned long nr_pages, int nr_items) 4005 { 4006 struct super_block *sb = root->fs_info->sb; 4007 4008 if (down_read_trylock(&sb->s_umount)) { 4009 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4010 up_read(&sb->s_umount); 4011 } else { 4012 /* 4013 * We needn't worry the filesystem going from r/w to r/o though 4014 * we don't acquire ->s_umount mutex, because the filesystem 4015 * should guarantee the delalloc inodes list be empty after 4016 * the filesystem is readonly(all dirty pages are written to 4017 * the disk). 4018 */ 4019 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4020 if (!current->journal_info) 4021 btrfs_wait_ordered_roots(root->fs_info, nr_items); 4022 } 4023 } 4024 4025 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4026 { 4027 u64 bytes; 4028 int nr; 4029 4030 bytes = btrfs_calc_trans_metadata_size(root, 1); 4031 nr = (int)div64_u64(to_reclaim, bytes); 4032 if (!nr) 4033 nr = 1; 4034 return nr; 4035 } 4036 4037 #define EXTENT_SIZE_PER_ITEM (256 * 1024) 4038 4039 /* 4040 * shrink metadata reservation for delalloc 4041 */ 4042 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4043 bool wait_ordered) 4044 { 4045 struct btrfs_block_rsv *block_rsv; 4046 struct btrfs_space_info *space_info; 4047 struct btrfs_trans_handle *trans; 4048 u64 delalloc_bytes; 4049 u64 max_reclaim; 4050 long time_left; 4051 unsigned long nr_pages; 4052 int loops; 4053 int items; 4054 enum btrfs_reserve_flush_enum flush; 4055 4056 /* Calc the number of the pages we need flush for space reservation */ 4057 items = calc_reclaim_items_nr(root, to_reclaim); 4058 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4059 4060 trans = (struct btrfs_trans_handle *)current->journal_info; 4061 block_rsv = &root->fs_info->delalloc_block_rsv; 4062 space_info = block_rsv->space_info; 4063 4064 delalloc_bytes = percpu_counter_sum_positive( 4065 &root->fs_info->delalloc_bytes); 4066 if (delalloc_bytes == 0) { 4067 if (trans) 4068 return; 4069 if (wait_ordered) 4070 btrfs_wait_ordered_roots(root->fs_info, items); 4071 return; 4072 } 4073 4074 loops = 0; 4075 while (delalloc_bytes && loops < 3) { 4076 max_reclaim = min(delalloc_bytes, to_reclaim); 4077 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4078 btrfs_writeback_inodes_sb_nr(root, nr_pages, items); 4079 /* 4080 * We need to wait for the async pages to actually start before 4081 * we do anything. 
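 * We do that by sampling async_delalloc_pages, subtracting (at most)
 * the nr_pages we just queued for writeback, and sleeping on
 * async_submit_wait until the counter falls back to that level, i.e.
 * until at least our share of the writeback has been kicked off.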
4082 */ 4083 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); 4084 if (!max_reclaim) 4085 goto skip_async; 4086 4087 if (max_reclaim <= nr_pages) 4088 max_reclaim = 0; 4089 else 4090 max_reclaim -= nr_pages; 4091 4092 wait_event(root->fs_info->async_submit_wait, 4093 atomic_read(&root->fs_info->async_delalloc_pages) <= 4094 (int)max_reclaim); 4095 skip_async: 4096 if (!trans) 4097 flush = BTRFS_RESERVE_FLUSH_ALL; 4098 else 4099 flush = BTRFS_RESERVE_NO_FLUSH; 4100 spin_lock(&space_info->lock); 4101 if (can_overcommit(root, space_info, orig, flush)) { 4102 spin_unlock(&space_info->lock); 4103 break; 4104 } 4105 spin_unlock(&space_info->lock); 4106 4107 loops++; 4108 if (wait_ordered && !trans) { 4109 btrfs_wait_ordered_roots(root->fs_info, items); 4110 } else { 4111 time_left = schedule_timeout_killable(1); 4112 if (time_left) 4113 break; 4114 } 4115 delalloc_bytes = percpu_counter_sum_positive( 4116 &root->fs_info->delalloc_bytes); 4117 } 4118 } 4119 4120 /** 4121 * maybe_commit_transaction - possibly commit the transaction if its ok to 4122 * @root - the root we're allocating for 4123 * @bytes - the number of bytes we want to reserve 4124 * @force - force the commit 4125 * 4126 * This will check to make sure that committing the transaction will actually 4127 * get us somewhere and then commit the transaction if it does. Otherwise it 4128 * will return -ENOSPC. 4129 */ 4130 static int may_commit_transaction(struct btrfs_root *root, 4131 struct btrfs_space_info *space_info, 4132 u64 bytes, int force) 4133 { 4134 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 4135 struct btrfs_trans_handle *trans; 4136 4137 trans = (struct btrfs_trans_handle *)current->journal_info; 4138 if (trans) 4139 return -EAGAIN; 4140 4141 if (force) 4142 goto commit; 4143 4144 /* See if there is enough pinned space to make this reservation */ 4145 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4146 bytes) >= 0) 4147 goto commit; 4148 4149 /* 4150 * See if there is some space in the delayed insertion reservation for 4151 * this reservation. 
4152 */ 4153 if (space_info != delayed_rsv->space_info) 4154 return -ENOSPC; 4155 4156 spin_lock(&delayed_rsv->lock); 4157 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4158 bytes - delayed_rsv->size) >= 0) { 4159 spin_unlock(&delayed_rsv->lock); 4160 return -ENOSPC; 4161 } 4162 spin_unlock(&delayed_rsv->lock); 4163 4164 commit: 4165 trans = btrfs_join_transaction(root); 4166 if (IS_ERR(trans)) 4167 return -ENOSPC; 4168 4169 return btrfs_commit_transaction(trans, root); 4170 } 4171 4172 enum flush_state { 4173 FLUSH_DELAYED_ITEMS_NR = 1, 4174 FLUSH_DELAYED_ITEMS = 2, 4175 FLUSH_DELALLOC = 3, 4176 FLUSH_DELALLOC_WAIT = 4, 4177 ALLOC_CHUNK = 5, 4178 COMMIT_TRANS = 6, 4179 }; 4180 4181 static int flush_space(struct btrfs_root *root, 4182 struct btrfs_space_info *space_info, u64 num_bytes, 4183 u64 orig_bytes, int state) 4184 { 4185 struct btrfs_trans_handle *trans; 4186 int nr; 4187 int ret = 0; 4188 4189 switch (state) { 4190 case FLUSH_DELAYED_ITEMS_NR: 4191 case FLUSH_DELAYED_ITEMS: 4192 if (state == FLUSH_DELAYED_ITEMS_NR) 4193 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4194 else 4195 nr = -1; 4196 4197 trans = btrfs_join_transaction(root); 4198 if (IS_ERR(trans)) { 4199 ret = PTR_ERR(trans); 4200 break; 4201 } 4202 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4203 btrfs_end_transaction(trans, root); 4204 break; 4205 case FLUSH_DELALLOC: 4206 case FLUSH_DELALLOC_WAIT: 4207 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4208 state == FLUSH_DELALLOC_WAIT); 4209 break; 4210 case ALLOC_CHUNK: 4211 trans = btrfs_join_transaction(root); 4212 if (IS_ERR(trans)) { 4213 ret = PTR_ERR(trans); 4214 break; 4215 } 4216 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4217 btrfs_get_alloc_profile(root, 0), 4218 CHUNK_ALLOC_NO_FORCE); 4219 btrfs_end_transaction(trans, root); 4220 if (ret == -ENOSPC) 4221 ret = 0; 4222 break; 4223 case COMMIT_TRANS: 4224 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4225 break; 4226 default: 4227 ret = -ENOSPC; 4228 break; 4229 } 4230 4231 return ret; 4232 } 4233 4234 static inline u64 4235 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4236 struct btrfs_space_info *space_info) 4237 { 4238 u64 used; 4239 u64 expected; 4240 u64 to_reclaim; 4241 4242 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, 4243 16 * 1024 * 1024); 4244 spin_lock(&space_info->lock); 4245 if (can_overcommit(root, space_info, to_reclaim, 4246 BTRFS_RESERVE_FLUSH_ALL)) { 4247 to_reclaim = 0; 4248 goto out; 4249 } 4250 4251 used = space_info->bytes_used + space_info->bytes_reserved + 4252 space_info->bytes_pinned + space_info->bytes_readonly + 4253 space_info->bytes_may_use; 4254 if (can_overcommit(root, space_info, 1024 * 1024, 4255 BTRFS_RESERVE_FLUSH_ALL)) 4256 expected = div_factor_fine(space_info->total_bytes, 95); 4257 else 4258 expected = div_factor_fine(space_info->total_bytes, 90); 4259 4260 if (used > expected) 4261 to_reclaim = used - expected; 4262 else 4263 to_reclaim = 0; 4264 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4265 space_info->bytes_reserved); 4266 out: 4267 spin_unlock(&space_info->lock); 4268 4269 return to_reclaim; 4270 } 4271 4272 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4273 struct btrfs_fs_info *fs_info, u64 used) 4274 { 4275 return (used >= div_factor_fine(space_info->total_bytes, 98) && 4276 !btrfs_fs_closing(fs_info) && 4277 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4278 } 4279 4280 static int btrfs_need_do_async_reclaim(struct 
btrfs_space_info *space_info, 4281 struct btrfs_fs_info *fs_info, 4282 int flush_state) 4283 { 4284 u64 used; 4285 4286 spin_lock(&space_info->lock); 4287 /* 4288 * We run out of space and have not got any free space via flush_space, 4289 * so don't bother doing async reclaim. 4290 */ 4291 if (flush_state > COMMIT_TRANS && space_info->full) { 4292 spin_unlock(&space_info->lock); 4293 return 0; 4294 } 4295 4296 used = space_info->bytes_used + space_info->bytes_reserved + 4297 space_info->bytes_pinned + space_info->bytes_readonly + 4298 space_info->bytes_may_use; 4299 if (need_do_async_reclaim(space_info, fs_info, used)) { 4300 spin_unlock(&space_info->lock); 4301 return 1; 4302 } 4303 spin_unlock(&space_info->lock); 4304 4305 return 0; 4306 } 4307 4308 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4309 { 4310 struct btrfs_fs_info *fs_info; 4311 struct btrfs_space_info *space_info; 4312 u64 to_reclaim; 4313 int flush_state; 4314 4315 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4316 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4317 4318 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4319 space_info); 4320 if (!to_reclaim) 4321 return; 4322 4323 flush_state = FLUSH_DELAYED_ITEMS_NR; 4324 do { 4325 flush_space(fs_info->fs_root, space_info, to_reclaim, 4326 to_reclaim, flush_state); 4327 flush_state++; 4328 if (!btrfs_need_do_async_reclaim(space_info, fs_info, 4329 flush_state)) 4330 return; 4331 } while (flush_state <= COMMIT_TRANS); 4332 4333 if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) 4334 queue_work(system_unbound_wq, work); 4335 } 4336 4337 void btrfs_init_async_reclaim_work(struct work_struct *work) 4338 { 4339 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 4340 } 4341 4342 /** 4343 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4344 * @root - the root we're allocating for 4345 * @block_rsv - the block_rsv we're allocating for 4346 * @orig_bytes - the number of bytes we want 4347 * @flush - whether or not we can flush to make our reservation 4348 * 4349 * This will reserve orgi_bytes number of bytes from the space info associated 4350 * with the block_rsv. If there is not enough space it will make an attempt to 4351 * flush out space to make room. It will do this by flushing delalloc if 4352 * possible or committing the transaction. If flush is 0 then no attempts to 4353 * regain reservations will be made and this will fail if there is not enough 4354 * space already. 4355 */ 4356 static int reserve_metadata_bytes(struct btrfs_root *root, 4357 struct btrfs_block_rsv *block_rsv, 4358 u64 orig_bytes, 4359 enum btrfs_reserve_flush_enum flush) 4360 { 4361 struct btrfs_space_info *space_info = block_rsv->space_info; 4362 u64 used; 4363 u64 num_bytes = orig_bytes; 4364 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4365 int ret = 0; 4366 bool flushing = false; 4367 4368 again: 4369 ret = 0; 4370 spin_lock(&space_info->lock); 4371 /* 4372 * We only want to wait if somebody other than us is flushing and we 4373 * are actually allowed to flush all things. 4374 */ 4375 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4376 space_info->flush) { 4377 spin_unlock(&space_info->lock); 4378 /* 4379 * If we have a trans handle we can't wait because the flusher 4380 * may have to commit the transaction, which would mean we would 4381 * deadlock since we are waiting for the flusher to finish, but 4382 * hold the current transaction open. 
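 * So bail out with -EAGAIN below rather than sleeping on
 * space_info->wait while holding our transaction open.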
4383 */ 4384 if (current->journal_info) 4385 return -EAGAIN; 4386 ret = wait_event_killable(space_info->wait, !space_info->flush); 4387 /* Must have been killed, return */ 4388 if (ret) 4389 return -EINTR; 4390 4391 spin_lock(&space_info->lock); 4392 } 4393 4394 ret = -ENOSPC; 4395 used = space_info->bytes_used + space_info->bytes_reserved + 4396 space_info->bytes_pinned + space_info->bytes_readonly + 4397 space_info->bytes_may_use; 4398 4399 /* 4400 * The idea here is that we've not already over-reserved the block group 4401 * then we can go ahead and save our reservation first and then start 4402 * flushing if we need to. Otherwise if we've already overcommitted 4403 * lets start flushing stuff first and then come back and try to make 4404 * our reservation. 4405 */ 4406 if (used <= space_info->total_bytes) { 4407 if (used + orig_bytes <= space_info->total_bytes) { 4408 space_info->bytes_may_use += orig_bytes; 4409 trace_btrfs_space_reservation(root->fs_info, 4410 "space_info", space_info->flags, orig_bytes, 1); 4411 ret = 0; 4412 } else { 4413 /* 4414 * Ok set num_bytes to orig_bytes since we aren't 4415 * overocmmitted, this way we only try and reclaim what 4416 * we need. 4417 */ 4418 num_bytes = orig_bytes; 4419 } 4420 } else { 4421 /* 4422 * Ok we're over committed, set num_bytes to the overcommitted 4423 * amount plus the amount of bytes that we need for this 4424 * reservation. 4425 */ 4426 num_bytes = used - space_info->total_bytes + 4427 (orig_bytes * 2); 4428 } 4429 4430 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4431 space_info->bytes_may_use += orig_bytes; 4432 trace_btrfs_space_reservation(root->fs_info, "space_info", 4433 space_info->flags, orig_bytes, 4434 1); 4435 ret = 0; 4436 } 4437 4438 /* 4439 * Couldn't make our reservation, save our place so while we're trying 4440 * to reclaim space we can actually use it instead of somebody else 4441 * stealing it from us. 4442 * 4443 * We make the other tasks wait for the flush only when we can flush 4444 * all things. 4445 */ 4446 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4447 flushing = true; 4448 space_info->flush = 1; 4449 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 4450 used += orig_bytes; 4451 /* 4452 * We will do the space reservation dance during log replay, 4453 * which means we won't have fs_info->fs_root set, so don't do 4454 * the async reclaim as we will panic. 4455 */ 4456 if (!root->fs_info->log_root_recovering && 4457 need_do_async_reclaim(space_info, root->fs_info, used) && 4458 !work_busy(&root->fs_info->async_reclaim_work)) 4459 queue_work(system_unbound_wq, 4460 &root->fs_info->async_reclaim_work); 4461 } 4462 spin_unlock(&space_info->lock); 4463 4464 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4465 goto out; 4466 4467 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4468 flush_state); 4469 flush_state++; 4470 4471 /* 4472 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4473 * would happen. So skip delalloc flush. 
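 * The check below handles that by bumping flush_state straight from
 * the delalloc states to ALLOC_CHUNK, and the retry conditions further
 * down never let a FLUSH_LIMIT caller advance as far as COMMIT_TRANS.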
4474 */ 4475 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4476 (flush_state == FLUSH_DELALLOC || 4477 flush_state == FLUSH_DELALLOC_WAIT)) 4478 flush_state = ALLOC_CHUNK; 4479 4480 if (!ret) 4481 goto again; 4482 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4483 flush_state < COMMIT_TRANS) 4484 goto again; 4485 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4486 flush_state <= COMMIT_TRANS) 4487 goto again; 4488 4489 out: 4490 if (ret == -ENOSPC && 4491 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4492 struct btrfs_block_rsv *global_rsv = 4493 &root->fs_info->global_block_rsv; 4494 4495 if (block_rsv != global_rsv && 4496 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4497 ret = 0; 4498 } 4499 if (ret == -ENOSPC) 4500 trace_btrfs_space_reservation(root->fs_info, 4501 "space_info:enospc", 4502 space_info->flags, orig_bytes, 1); 4503 if (flushing) { 4504 spin_lock(&space_info->lock); 4505 space_info->flush = 0; 4506 wake_up_all(&space_info->wait); 4507 spin_unlock(&space_info->lock); 4508 } 4509 return ret; 4510 } 4511 4512 static struct btrfs_block_rsv *get_block_rsv( 4513 const struct btrfs_trans_handle *trans, 4514 const struct btrfs_root *root) 4515 { 4516 struct btrfs_block_rsv *block_rsv = NULL; 4517 4518 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4519 block_rsv = trans->block_rsv; 4520 4521 if (root == root->fs_info->csum_root && trans->adding_csums) 4522 block_rsv = trans->block_rsv; 4523 4524 if (root == root->fs_info->uuid_root) 4525 block_rsv = trans->block_rsv; 4526 4527 if (!block_rsv) 4528 block_rsv = root->block_rsv; 4529 4530 if (!block_rsv) 4531 block_rsv = &root->fs_info->empty_block_rsv; 4532 4533 return block_rsv; 4534 } 4535 4536 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4537 u64 num_bytes) 4538 { 4539 int ret = -ENOSPC; 4540 spin_lock(&block_rsv->lock); 4541 if (block_rsv->reserved >= num_bytes) { 4542 block_rsv->reserved -= num_bytes; 4543 if (block_rsv->reserved < block_rsv->size) 4544 block_rsv->full = 0; 4545 ret = 0; 4546 } 4547 spin_unlock(&block_rsv->lock); 4548 return ret; 4549 } 4550 4551 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4552 u64 num_bytes, int update_size) 4553 { 4554 spin_lock(&block_rsv->lock); 4555 block_rsv->reserved += num_bytes; 4556 if (update_size) 4557 block_rsv->size += num_bytes; 4558 else if (block_rsv->reserved >= block_rsv->size) 4559 block_rsv->full = 1; 4560 spin_unlock(&block_rsv->lock); 4561 } 4562 4563 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 4564 struct btrfs_block_rsv *dest, u64 num_bytes, 4565 int min_factor) 4566 { 4567 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4568 u64 min_bytes; 4569 4570 if (global_rsv->space_info != dest->space_info) 4571 return -ENOSPC; 4572 4573 spin_lock(&global_rsv->lock); 4574 min_bytes = div_factor(global_rsv->size, min_factor); 4575 if (global_rsv->reserved < min_bytes + num_bytes) { 4576 spin_unlock(&global_rsv->lock); 4577 return -ENOSPC; 4578 } 4579 global_rsv->reserved -= num_bytes; 4580 if (global_rsv->reserved < global_rsv->size) 4581 global_rsv->full = 0; 4582 spin_unlock(&global_rsv->lock); 4583 4584 block_rsv_add_bytes(dest, num_bytes, 1); 4585 return 0; 4586 } 4587 4588 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4589 struct btrfs_block_rsv *block_rsv, 4590 struct btrfs_block_rsv *dest, u64 num_bytes) 4591 { 4592 struct btrfs_space_info *space_info = block_rsv->space_info; 4593 4594 spin_lock(&block_rsv->lock); 4595 if (num_bytes == (u64)-1) 4596 num_bytes 
= block_rsv->size; 4597 block_rsv->size -= num_bytes; 4598 if (block_rsv->reserved >= block_rsv->size) { 4599 num_bytes = block_rsv->reserved - block_rsv->size; 4600 block_rsv->reserved = block_rsv->size; 4601 block_rsv->full = 1; 4602 } else { 4603 num_bytes = 0; 4604 } 4605 spin_unlock(&block_rsv->lock); 4606 4607 if (num_bytes > 0) { 4608 if (dest) { 4609 spin_lock(&dest->lock); 4610 if (!dest->full) { 4611 u64 bytes_to_add; 4612 4613 bytes_to_add = dest->size - dest->reserved; 4614 bytes_to_add = min(num_bytes, bytes_to_add); 4615 dest->reserved += bytes_to_add; 4616 if (dest->reserved >= dest->size) 4617 dest->full = 1; 4618 num_bytes -= bytes_to_add; 4619 } 4620 spin_unlock(&dest->lock); 4621 } 4622 if (num_bytes) { 4623 spin_lock(&space_info->lock); 4624 space_info->bytes_may_use -= num_bytes; 4625 trace_btrfs_space_reservation(fs_info, "space_info", 4626 space_info->flags, num_bytes, 0); 4627 spin_unlock(&space_info->lock); 4628 } 4629 } 4630 } 4631 4632 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4633 struct btrfs_block_rsv *dst, u64 num_bytes) 4634 { 4635 int ret; 4636 4637 ret = block_rsv_use_bytes(src, num_bytes); 4638 if (ret) 4639 return ret; 4640 4641 block_rsv_add_bytes(dst, num_bytes, 1); 4642 return 0; 4643 } 4644 4645 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4646 { 4647 memset(rsv, 0, sizeof(*rsv)); 4648 spin_lock_init(&rsv->lock); 4649 rsv->type = type; 4650 } 4651 4652 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4653 unsigned short type) 4654 { 4655 struct btrfs_block_rsv *block_rsv; 4656 struct btrfs_fs_info *fs_info = root->fs_info; 4657 4658 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4659 if (!block_rsv) 4660 return NULL; 4661 4662 btrfs_init_block_rsv(block_rsv, type); 4663 block_rsv->space_info = __find_space_info(fs_info, 4664 BTRFS_BLOCK_GROUP_METADATA); 4665 return block_rsv; 4666 } 4667 4668 void btrfs_free_block_rsv(struct btrfs_root *root, 4669 struct btrfs_block_rsv *rsv) 4670 { 4671 if (!rsv) 4672 return; 4673 btrfs_block_rsv_release(root, rsv, (u64)-1); 4674 kfree(rsv); 4675 } 4676 4677 int btrfs_block_rsv_add(struct btrfs_root *root, 4678 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4679 enum btrfs_reserve_flush_enum flush) 4680 { 4681 int ret; 4682 4683 if (num_bytes == 0) 4684 return 0; 4685 4686 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4687 if (!ret) { 4688 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4689 return 0; 4690 } 4691 4692 return ret; 4693 } 4694 4695 int btrfs_block_rsv_check(struct btrfs_root *root, 4696 struct btrfs_block_rsv *block_rsv, int min_factor) 4697 { 4698 u64 num_bytes = 0; 4699 int ret = -ENOSPC; 4700 4701 if (!block_rsv) 4702 return 0; 4703 4704 spin_lock(&block_rsv->lock); 4705 num_bytes = div_factor(block_rsv->size, min_factor); 4706 if (block_rsv->reserved >= num_bytes) 4707 ret = 0; 4708 spin_unlock(&block_rsv->lock); 4709 4710 return ret; 4711 } 4712 4713 int btrfs_block_rsv_refill(struct btrfs_root *root, 4714 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4715 enum btrfs_reserve_flush_enum flush) 4716 { 4717 u64 num_bytes = 0; 4718 int ret = -ENOSPC; 4719 4720 if (!block_rsv) 4721 return 0; 4722 4723 spin_lock(&block_rsv->lock); 4724 num_bytes = min_reserved; 4725 if (block_rsv->reserved >= num_bytes) 4726 ret = 0; 4727 else 4728 num_bytes -= block_rsv->reserved; 4729 spin_unlock(&block_rsv->lock); 4730 4731 if (!ret) 4732 return 0; 4733 4734 ret = reserve_metadata_bytes(root, block_rsv, 
num_bytes, flush); 4735 if (!ret) { 4736 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4737 return 0; 4738 } 4739 4740 return ret; 4741 } 4742 4743 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4744 struct btrfs_block_rsv *dst_rsv, 4745 u64 num_bytes) 4746 { 4747 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4748 } 4749 4750 void btrfs_block_rsv_release(struct btrfs_root *root, 4751 struct btrfs_block_rsv *block_rsv, 4752 u64 num_bytes) 4753 { 4754 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4755 if (global_rsv == block_rsv || 4756 block_rsv->space_info != global_rsv->space_info) 4757 global_rsv = NULL; 4758 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4759 num_bytes); 4760 } 4761 4762 /* 4763 * helper to calculate size of global block reservation. 4764 * the desired value is sum of space used by extent tree, 4765 * checksum tree and root tree 4766 */ 4767 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4768 { 4769 struct btrfs_space_info *sinfo; 4770 u64 num_bytes; 4771 u64 meta_used; 4772 u64 data_used; 4773 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4774 4775 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4776 spin_lock(&sinfo->lock); 4777 data_used = sinfo->bytes_used; 4778 spin_unlock(&sinfo->lock); 4779 4780 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4781 spin_lock(&sinfo->lock); 4782 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4783 data_used = 0; 4784 meta_used = sinfo->bytes_used; 4785 spin_unlock(&sinfo->lock); 4786 4787 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4788 csum_size * 2; 4789 num_bytes += div64_u64(data_used + meta_used, 50); 4790 4791 if (num_bytes * 3 > meta_used) 4792 num_bytes = div64_u64(meta_used, 3); 4793 4794 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); 4795 } 4796 4797 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4798 { 4799 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4800 struct btrfs_space_info *sinfo = block_rsv->space_info; 4801 u64 num_bytes; 4802 4803 num_bytes = calc_global_metadata_size(fs_info); 4804 4805 spin_lock(&sinfo->lock); 4806 spin_lock(&block_rsv->lock); 4807 4808 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4809 4810 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4811 sinfo->bytes_reserved + sinfo->bytes_readonly + 4812 sinfo->bytes_may_use; 4813 4814 if (sinfo->total_bytes > num_bytes) { 4815 num_bytes = sinfo->total_bytes - num_bytes; 4816 block_rsv->reserved += num_bytes; 4817 sinfo->bytes_may_use += num_bytes; 4818 trace_btrfs_space_reservation(fs_info, "space_info", 4819 sinfo->flags, num_bytes, 1); 4820 } 4821 4822 if (block_rsv->reserved >= block_rsv->size) { 4823 num_bytes = block_rsv->reserved - block_rsv->size; 4824 sinfo->bytes_may_use -= num_bytes; 4825 trace_btrfs_space_reservation(fs_info, "space_info", 4826 sinfo->flags, num_bytes, 0); 4827 block_rsv->reserved = block_rsv->size; 4828 block_rsv->full = 1; 4829 } 4830 4831 spin_unlock(&block_rsv->lock); 4832 spin_unlock(&sinfo->lock); 4833 } 4834 4835 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4836 { 4837 struct btrfs_space_info *space_info; 4838 4839 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4840 fs_info->chunk_block_rsv.space_info = space_info; 4841 4842 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4843 fs_info->global_block_rsv.space_info = space_info; 4844 
fs_info->delalloc_block_rsv.space_info = space_info; 4845 fs_info->trans_block_rsv.space_info = space_info; 4846 fs_info->empty_block_rsv.space_info = space_info; 4847 fs_info->delayed_block_rsv.space_info = space_info; 4848 4849 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4850 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4851 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4852 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4853 if (fs_info->quota_root) 4854 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 4855 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4856 4857 update_global_block_rsv(fs_info); 4858 } 4859 4860 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4861 { 4862 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4863 (u64)-1); 4864 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4865 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4866 WARN_ON(fs_info->trans_block_rsv.size > 0); 4867 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4868 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4869 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4870 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4871 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4872 } 4873 4874 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4875 struct btrfs_root *root) 4876 { 4877 if (!trans->block_rsv) 4878 return; 4879 4880 if (!trans->bytes_reserved) 4881 return; 4882 4883 trace_btrfs_space_reservation(root->fs_info, "transaction", 4884 trans->transid, trans->bytes_reserved, 0); 4885 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4886 trans->bytes_reserved = 0; 4887 } 4888 4889 /* Can only return 0 or -ENOSPC */ 4890 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4891 struct inode *inode) 4892 { 4893 struct btrfs_root *root = BTRFS_I(inode)->root; 4894 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4895 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4896 4897 /* 4898 * We need to hold space in order to delete our orphan item once we've 4899 * added it, so this takes the reservation so we can release it later 4900 * when we are truly done with the orphan item. 4901 */ 4902 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4903 trace_btrfs_space_reservation(root->fs_info, "orphan", 4904 btrfs_ino(inode), num_bytes, 1); 4905 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4906 } 4907 4908 void btrfs_orphan_release_metadata(struct inode *inode) 4909 { 4910 struct btrfs_root *root = BTRFS_I(inode)->root; 4911 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4912 trace_btrfs_space_reservation(root->fs_info, "orphan", 4913 btrfs_ino(inode), num_bytes, 0); 4914 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4915 } 4916 4917 /* 4918 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4919 * root: the root of the parent directory 4920 * rsv: block reservation 4921 * items: the number of items that we need do reservation 4922 * qgroup_reserved: used to return the reserved size in qgroup 4923 * 4924 * This function is used to reserve the space for snapshot/subvolume 4925 * creation and deletion. Those operations are different with the 4926 * common file/directory operations, they change two fs/file trees 4927 * and root tree, the number of items that the qgroup reserves is 4928 * different with the free space reservation. 
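 * (Below, the qgroup side reserves 3 * nodesize, one for the parent
 * inode and two for the dir entries, while the block_rsv side uses
 * btrfs_calc_trans_metadata_size(root, items).)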
So we can not use 4929 * the space reseravtion mechanism in start_transaction(). 4930 */ 4931 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4932 struct btrfs_block_rsv *rsv, 4933 int items, 4934 u64 *qgroup_reserved, 4935 bool use_global_rsv) 4936 { 4937 u64 num_bytes; 4938 int ret; 4939 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4940 4941 if (root->fs_info->quota_enabled) { 4942 /* One for parent inode, two for dir entries */ 4943 num_bytes = 3 * root->nodesize; 4944 ret = btrfs_qgroup_reserve(root, num_bytes); 4945 if (ret) 4946 return ret; 4947 } else { 4948 num_bytes = 0; 4949 } 4950 4951 *qgroup_reserved = num_bytes; 4952 4953 num_bytes = btrfs_calc_trans_metadata_size(root, items); 4954 rsv->space_info = __find_space_info(root->fs_info, 4955 BTRFS_BLOCK_GROUP_METADATA); 4956 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 4957 BTRFS_RESERVE_FLUSH_ALL); 4958 4959 if (ret == -ENOSPC && use_global_rsv) 4960 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); 4961 4962 if (ret) { 4963 if (*qgroup_reserved) 4964 btrfs_qgroup_free(root, *qgroup_reserved); 4965 } 4966 4967 return ret; 4968 } 4969 4970 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 4971 struct btrfs_block_rsv *rsv, 4972 u64 qgroup_reserved) 4973 { 4974 btrfs_block_rsv_release(root, rsv, (u64)-1); 4975 if (qgroup_reserved) 4976 btrfs_qgroup_free(root, qgroup_reserved); 4977 } 4978 4979 /** 4980 * drop_outstanding_extent - drop an outstanding extent 4981 * @inode: the inode we're dropping the extent for 4982 * @num_bytes: the number of bytes we're relaseing. 4983 * 4984 * This is called when we are freeing up an outstanding extent, either called 4985 * after an error or after an extent is written. This will return the number of 4986 * reserved extents that need to be freed. This must be called with 4987 * BTRFS_I(inode)->lock held. 4988 */ 4989 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) 4990 { 4991 unsigned drop_inode_space = 0; 4992 unsigned dropped_extents = 0; 4993 unsigned num_extents = 0; 4994 4995 num_extents = (unsigned)div64_u64(num_bytes + 4996 BTRFS_MAX_EXTENT_SIZE - 1, 4997 BTRFS_MAX_EXTENT_SIZE); 4998 ASSERT(num_extents); 4999 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); 5000 BTRFS_I(inode)->outstanding_extents -= num_extents; 5001 5002 if (BTRFS_I(inode)->outstanding_extents == 0 && 5003 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5004 &BTRFS_I(inode)->runtime_flags)) 5005 drop_inode_space = 1; 5006 5007 /* 5008 * If we have more or the same amount of outsanding extents than we have 5009 * reserved then we need to leave the reserved extents count alone. 5010 */ 5011 if (BTRFS_I(inode)->outstanding_extents >= 5012 BTRFS_I(inode)->reserved_extents) 5013 return drop_inode_space; 5014 5015 dropped_extents = BTRFS_I(inode)->reserved_extents - 5016 BTRFS_I(inode)->outstanding_extents; 5017 BTRFS_I(inode)->reserved_extents -= dropped_extents; 5018 return dropped_extents + drop_inode_space; 5019 } 5020 5021 /** 5022 * calc_csum_metadata_size - return the amount of metada space that must be 5023 * reserved/free'd for the given bytes. 5024 * @inode: the inode we're manipulating 5025 * @num_bytes: the number of bytes in question 5026 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5027 * 5028 * This adjusts the number of csum_bytes in the inode and then returns the 5029 * correct amount of metadata that must either be reserved or freed. 
We 5030 * calculate how many checksums we can fit into one leaf and then divide the 5031 * number of bytes that will need to be checksumed by this value to figure out 5032 * how many checksums will be required. If we are adding bytes then the number 5033 * may go up and we will return the number of additional bytes that must be 5034 * reserved. If it is going down we will return the number of bytes that must 5035 * be freed. 5036 * 5037 * This must be called with BTRFS_I(inode)->lock held. 5038 */ 5039 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5040 int reserve) 5041 { 5042 struct btrfs_root *root = BTRFS_I(inode)->root; 5043 u64 csum_size; 5044 int num_csums_per_leaf; 5045 int num_csums; 5046 int old_csums; 5047 5048 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5049 BTRFS_I(inode)->csum_bytes == 0) 5050 return 0; 5051 5052 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5053 if (reserve) 5054 BTRFS_I(inode)->csum_bytes += num_bytes; 5055 else 5056 BTRFS_I(inode)->csum_bytes -= num_bytes; 5057 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 5058 num_csums_per_leaf = (int)div64_u64(csum_size, 5059 sizeof(struct btrfs_csum_item) + 5060 sizeof(struct btrfs_disk_key)); 5061 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5062 num_csums = num_csums + num_csums_per_leaf - 1; 5063 num_csums = num_csums / num_csums_per_leaf; 5064 5065 old_csums = old_csums + num_csums_per_leaf - 1; 5066 old_csums = old_csums / num_csums_per_leaf; 5067 5068 /* No change, no need to reserve more */ 5069 if (old_csums == num_csums) 5070 return 0; 5071 5072 if (reserve) 5073 return btrfs_calc_trans_metadata_size(root, 5074 num_csums - old_csums); 5075 5076 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 5077 } 5078 5079 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5080 { 5081 struct btrfs_root *root = BTRFS_I(inode)->root; 5082 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 5083 u64 to_reserve = 0; 5084 u64 csum_bytes; 5085 unsigned nr_extents = 0; 5086 int extra_reserve = 0; 5087 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5088 int ret = 0; 5089 bool delalloc_lock = true; 5090 u64 to_free = 0; 5091 unsigned dropped; 5092 5093 /* If we are a free space inode we need to not flush since we will be in 5094 * the middle of a transaction commit. We also don't need the delalloc 5095 * mutex since we won't race with anybody. We need this mostly to make 5096 * lockdep shut its filthy mouth. 5097 */ 5098 if (btrfs_is_free_space_inode(inode)) { 5099 flush = BTRFS_RESERVE_NO_FLUSH; 5100 delalloc_lock = false; 5101 } 5102 5103 if (flush != BTRFS_RESERVE_NO_FLUSH && 5104 btrfs_transaction_in_commit(root->fs_info)) 5105 schedule_timeout(1); 5106 5107 if (delalloc_lock) 5108 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5109 5110 num_bytes = ALIGN(num_bytes, root->sectorsize); 5111 5112 spin_lock(&BTRFS_I(inode)->lock); 5113 BTRFS_I(inode)->outstanding_extents++; 5114 5115 if (BTRFS_I(inode)->outstanding_extents > 5116 BTRFS_I(inode)->reserved_extents) 5117 nr_extents = BTRFS_I(inode)->outstanding_extents - 5118 BTRFS_I(inode)->reserved_extents; 5119 5120 /* 5121 * Add an item to reserve for updating the inode when we complete the 5122 * delalloc io. 
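 * This extra item is reserved at most once per inode: the
 * BTRFS_INODE_DELALLOC_META_RESERVED runtime flag below records that
 * we hold it, and drop_outstanding_extent() gives it back once the
 * last outstanding extent goes away.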
5123 */ 5124 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5125 &BTRFS_I(inode)->runtime_flags)) { 5126 nr_extents++; 5127 extra_reserve = 1; 5128 } 5129 5130 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 5131 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5132 csum_bytes = BTRFS_I(inode)->csum_bytes; 5133 spin_unlock(&BTRFS_I(inode)->lock); 5134 5135 if (root->fs_info->quota_enabled) { 5136 ret = btrfs_qgroup_reserve(root, num_bytes + 5137 nr_extents * root->nodesize); 5138 if (ret) 5139 goto out_fail; 5140 } 5141 5142 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5143 if (unlikely(ret)) { 5144 if (root->fs_info->quota_enabled) 5145 btrfs_qgroup_free(root, num_bytes + 5146 nr_extents * root->nodesize); 5147 goto out_fail; 5148 } 5149 5150 spin_lock(&BTRFS_I(inode)->lock); 5151 if (extra_reserve) { 5152 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5153 &BTRFS_I(inode)->runtime_flags); 5154 nr_extents--; 5155 } 5156 BTRFS_I(inode)->reserved_extents += nr_extents; 5157 spin_unlock(&BTRFS_I(inode)->lock); 5158 5159 if (delalloc_lock) 5160 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5161 5162 if (to_reserve) 5163 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5164 btrfs_ino(inode), to_reserve, 1); 5165 block_rsv_add_bytes(block_rsv, to_reserve, 1); 5166 5167 return 0; 5168 5169 out_fail: 5170 spin_lock(&BTRFS_I(inode)->lock); 5171 dropped = drop_outstanding_extent(inode, num_bytes); 5172 /* 5173 * If the inodes csum_bytes is the same as the original 5174 * csum_bytes then we know we haven't raced with any free()ers 5175 * so we can just reduce our inodes csum bytes and carry on. 5176 */ 5177 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 5178 calc_csum_metadata_size(inode, num_bytes, 0); 5179 } else { 5180 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 5181 u64 bytes; 5182 5183 /* 5184 * This is tricky, but first we need to figure out how much we 5185 * free'd from any free-ers that occured during this 5186 * reservation, so we reset ->csum_bytes to the csum_bytes 5187 * before we dropped our lock, and then call the free for the 5188 * number of bytes that were freed while we were trying our 5189 * reservation. 5190 */ 5191 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 5192 BTRFS_I(inode)->csum_bytes = csum_bytes; 5193 to_free = calc_csum_metadata_size(inode, bytes, 0); 5194 5195 5196 /* 5197 * Now we need to see how much we would have freed had we not 5198 * been making this reservation and our ->csum_bytes were not 5199 * artificially inflated. 5200 */ 5201 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 5202 bytes = csum_bytes - orig_csum_bytes; 5203 bytes = calc_csum_metadata_size(inode, bytes, 0); 5204 5205 /* 5206 * Now reset ->csum_bytes to what it should be. If bytes is 5207 * more than to_free then we would have free'd more space had we 5208 * not had an artificially high ->csum_bytes, so we need to free 5209 * the remainder. If bytes is the same or less then we don't 5210 * need to do anything, the other free-ers did the correct 5211 * thing. 
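 * Put differently: to_free ends up being whatever the racing
 * free()ers under-released because our ->csum_bytes was still
 * artificially inflated by this failed reservation, and zero when
 * they already released enough.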
5212 */ 5213 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 5214 if (bytes > to_free) 5215 to_free = bytes - to_free; 5216 else 5217 to_free = 0; 5218 } 5219 spin_unlock(&BTRFS_I(inode)->lock); 5220 if (dropped) 5221 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5222 5223 if (to_free) { 5224 btrfs_block_rsv_release(root, block_rsv, to_free); 5225 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5226 btrfs_ino(inode), to_free, 0); 5227 } 5228 if (delalloc_lock) 5229 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5230 return ret; 5231 } 5232 5233 /** 5234 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5235 * @inode: the inode to release the reservation for 5236 * @num_bytes: the number of bytes we're releasing 5237 * 5238 * This will release the metadata reservation for an inode. This can be called 5239 * once we complete IO for a given set of bytes to release their metadata 5240 * reservations. 5241 */ 5242 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 5243 { 5244 struct btrfs_root *root = BTRFS_I(inode)->root; 5245 u64 to_free = 0; 5246 unsigned dropped; 5247 5248 num_bytes = ALIGN(num_bytes, root->sectorsize); 5249 spin_lock(&BTRFS_I(inode)->lock); 5250 dropped = drop_outstanding_extent(inode, num_bytes); 5251 5252 if (num_bytes) 5253 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5254 spin_unlock(&BTRFS_I(inode)->lock); 5255 if (dropped > 0) 5256 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5257 5258 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5259 btrfs_ino(inode), to_free, 0); 5260 if (root->fs_info->quota_enabled) { 5261 btrfs_qgroup_free(root, num_bytes + 5262 dropped * root->nodesize); 5263 } 5264 5265 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5266 to_free); 5267 } 5268 5269 /** 5270 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 5271 * @inode: inode we're writing to 5272 * @num_bytes: the number of bytes we want to allocate 5273 * 5274 * This will do the following things 5275 * 5276 * o reserve space in the data space info for num_bytes 5277 * o reserve space in the metadata space info based on number of outstanding 5278 * extents and how much csums will be needed 5279 * o add to the inodes ->delalloc_bytes 5280 * o add it to the fs_info's delalloc inodes list. 5281 * 5282 * This will return 0 for success and -ENOSPC if there is no space left. 5283 */ 5284 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 5285 { 5286 int ret; 5287 5288 ret = btrfs_check_data_free_space(inode, num_bytes); 5289 if (ret) 5290 return ret; 5291 5292 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 5293 if (ret) { 5294 btrfs_free_reserved_data_space(inode, num_bytes); 5295 return ret; 5296 } 5297 5298 return 0; 5299 } 5300 5301 /** 5302 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5303 * @inode: inode we're releasing space for 5304 * @num_bytes: the number of bytes we want to free up 5305 * 5306 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5307 * called in the case that we don't need the metadata AND data reservations 5308 * anymore. So if there is an error or we insert an inline extent. 5309 * 5310 * This function will release the metadata space that was not used and will 5311 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5312 * list if there are no delalloc bytes left. 
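 * This is simply the reverse of btrfs_delalloc_reserve_space() above:
 * the metadata reservation is released first, then the data
 * reservation.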
5313 */ 5314 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5315 { 5316 btrfs_delalloc_release_metadata(inode, num_bytes); 5317 btrfs_free_reserved_data_space(inode, num_bytes); 5318 } 5319 5320 static int update_block_group(struct btrfs_trans_handle *trans, 5321 struct btrfs_root *root, u64 bytenr, 5322 u64 num_bytes, int alloc) 5323 { 5324 struct btrfs_block_group_cache *cache = NULL; 5325 struct btrfs_fs_info *info = root->fs_info; 5326 u64 total = num_bytes; 5327 u64 old_val; 5328 u64 byte_in_group; 5329 int factor; 5330 5331 /* block accounting for super block */ 5332 spin_lock(&info->delalloc_root_lock); 5333 old_val = btrfs_super_bytes_used(info->super_copy); 5334 if (alloc) 5335 old_val += num_bytes; 5336 else 5337 old_val -= num_bytes; 5338 btrfs_set_super_bytes_used(info->super_copy, old_val); 5339 spin_unlock(&info->delalloc_root_lock); 5340 5341 while (total) { 5342 cache = btrfs_lookup_block_group(info, bytenr); 5343 if (!cache) 5344 return -ENOENT; 5345 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5346 BTRFS_BLOCK_GROUP_RAID1 | 5347 BTRFS_BLOCK_GROUP_RAID10)) 5348 factor = 2; 5349 else 5350 factor = 1; 5351 /* 5352 * If this block group has free space cache written out, we 5353 * need to make sure to load it if we are removing space. This 5354 * is because we need the unpinning stage to actually add the 5355 * space back to the block group, otherwise we will leak space. 5356 */ 5357 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5358 cache_block_group(cache, 1); 5359 5360 spin_lock(&trans->transaction->dirty_bgs_lock); 5361 if (list_empty(&cache->dirty_list)) { 5362 list_add_tail(&cache->dirty_list, 5363 &trans->transaction->dirty_bgs); 5364 btrfs_get_block_group(cache); 5365 } 5366 spin_unlock(&trans->transaction->dirty_bgs_lock); 5367 5368 byte_in_group = bytenr - cache->key.objectid; 5369 WARN_ON(byte_in_group > cache->key.offset); 5370 5371 spin_lock(&cache->space_info->lock); 5372 spin_lock(&cache->lock); 5373 5374 if (btrfs_test_opt(root, SPACE_CACHE) && 5375 cache->disk_cache_state < BTRFS_DC_CLEAR) 5376 cache->disk_cache_state = BTRFS_DC_CLEAR; 5377 5378 old_val = btrfs_block_group_used(&cache->item); 5379 num_bytes = min(total, cache->key.offset - byte_in_group); 5380 if (alloc) { 5381 old_val += num_bytes; 5382 btrfs_set_block_group_used(&cache->item, old_val); 5383 cache->reserved -= num_bytes; 5384 cache->space_info->bytes_reserved -= num_bytes; 5385 cache->space_info->bytes_used += num_bytes; 5386 cache->space_info->disk_used += num_bytes * factor; 5387 spin_unlock(&cache->lock); 5388 spin_unlock(&cache->space_info->lock); 5389 } else { 5390 old_val -= num_bytes; 5391 btrfs_set_block_group_used(&cache->item, old_val); 5392 cache->pinned += num_bytes; 5393 cache->space_info->bytes_pinned += num_bytes; 5394 cache->space_info->bytes_used -= num_bytes; 5395 cache->space_info->disk_used -= num_bytes * factor; 5396 spin_unlock(&cache->lock); 5397 spin_unlock(&cache->space_info->lock); 5398 5399 set_extent_dirty(info->pinned_extents, 5400 bytenr, bytenr + num_bytes - 1, 5401 GFP_NOFS | __GFP_NOFAIL); 5402 /* 5403 * No longer have used bytes in this block group, queue 5404 * it for deletion. 
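 * We take an extra reference and add the group to fs_info->unused_bgs
 * (only if it is not already on the list); the cleaner thread is
 * expected to pick it up from there and remove the now-empty block
 * group.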
5405 */ 5406 if (old_val == 0) { 5407 spin_lock(&info->unused_bgs_lock); 5408 if (list_empty(&cache->bg_list)) { 5409 btrfs_get_block_group(cache); 5410 list_add_tail(&cache->bg_list, 5411 &info->unused_bgs); 5412 } 5413 spin_unlock(&info->unused_bgs_lock); 5414 } 5415 } 5416 btrfs_put_block_group(cache); 5417 total -= num_bytes; 5418 bytenr += num_bytes; 5419 } 5420 return 0; 5421 } 5422 5423 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5424 { 5425 struct btrfs_block_group_cache *cache; 5426 u64 bytenr; 5427 5428 spin_lock(&root->fs_info->block_group_cache_lock); 5429 bytenr = root->fs_info->first_logical_byte; 5430 spin_unlock(&root->fs_info->block_group_cache_lock); 5431 5432 if (bytenr < (u64)-1) 5433 return bytenr; 5434 5435 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5436 if (!cache) 5437 return 0; 5438 5439 bytenr = cache->key.objectid; 5440 btrfs_put_block_group(cache); 5441 5442 return bytenr; 5443 } 5444 5445 static int pin_down_extent(struct btrfs_root *root, 5446 struct btrfs_block_group_cache *cache, 5447 u64 bytenr, u64 num_bytes, int reserved) 5448 { 5449 spin_lock(&cache->space_info->lock); 5450 spin_lock(&cache->lock); 5451 cache->pinned += num_bytes; 5452 cache->space_info->bytes_pinned += num_bytes; 5453 if (reserved) { 5454 cache->reserved -= num_bytes; 5455 cache->space_info->bytes_reserved -= num_bytes; 5456 } 5457 spin_unlock(&cache->lock); 5458 spin_unlock(&cache->space_info->lock); 5459 5460 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5461 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5462 if (reserved) 5463 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); 5464 return 0; 5465 } 5466 5467 /* 5468 * this function must be called within transaction 5469 */ 5470 int btrfs_pin_extent(struct btrfs_root *root, 5471 u64 bytenr, u64 num_bytes, int reserved) 5472 { 5473 struct btrfs_block_group_cache *cache; 5474 5475 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5476 BUG_ON(!cache); /* Logic error */ 5477 5478 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5479 5480 btrfs_put_block_group(cache); 5481 return 0; 5482 } 5483 5484 /* 5485 * this function must be called within transaction 5486 */ 5487 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5488 u64 bytenr, u64 num_bytes) 5489 { 5490 struct btrfs_block_group_cache *cache; 5491 int ret; 5492 5493 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5494 if (!cache) 5495 return -EINVAL; 5496 5497 /* 5498 * pull in the free space cache (if any) so that our pin 5499 * removes the free space from the cache. We have load_only set 5500 * to one because the slow code to read in the free extents does check 5501 * the pinned extents. 
5502 */ 5503 cache_block_group(cache, 1); 5504 5505 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5506 5507 /* remove us from the free space cache (if we're there at all) */ 5508 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5509 btrfs_put_block_group(cache); 5510 return ret; 5511 } 5512 5513 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 5514 { 5515 int ret; 5516 struct btrfs_block_group_cache *block_group; 5517 struct btrfs_caching_control *caching_ctl; 5518 5519 block_group = btrfs_lookup_block_group(root->fs_info, start); 5520 if (!block_group) 5521 return -EINVAL; 5522 5523 cache_block_group(block_group, 0); 5524 caching_ctl = get_caching_control(block_group); 5525 5526 if (!caching_ctl) { 5527 /* Logic error */ 5528 BUG_ON(!block_group_cache_done(block_group)); 5529 ret = btrfs_remove_free_space(block_group, start, num_bytes); 5530 } else { 5531 mutex_lock(&caching_ctl->mutex); 5532 5533 if (start >= caching_ctl->progress) { 5534 ret = add_excluded_extent(root, start, num_bytes); 5535 } else if (start + num_bytes <= caching_ctl->progress) { 5536 ret = btrfs_remove_free_space(block_group, 5537 start, num_bytes); 5538 } else { 5539 num_bytes = caching_ctl->progress - start; 5540 ret = btrfs_remove_free_space(block_group, 5541 start, num_bytes); 5542 if (ret) 5543 goto out_lock; 5544 5545 num_bytes = (start + num_bytes) - 5546 caching_ctl->progress; 5547 start = caching_ctl->progress; 5548 ret = add_excluded_extent(root, start, num_bytes); 5549 } 5550 out_lock: 5551 mutex_unlock(&caching_ctl->mutex); 5552 put_caching_control(caching_ctl); 5553 } 5554 btrfs_put_block_group(block_group); 5555 return ret; 5556 } 5557 5558 int btrfs_exclude_logged_extents(struct btrfs_root *log, 5559 struct extent_buffer *eb) 5560 { 5561 struct btrfs_file_extent_item *item; 5562 struct btrfs_key key; 5563 int found_type; 5564 int i; 5565 5566 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 5567 return 0; 5568 5569 for (i = 0; i < btrfs_header_nritems(eb); i++) { 5570 btrfs_item_key_to_cpu(eb, &key, i); 5571 if (key.type != BTRFS_EXTENT_DATA_KEY) 5572 continue; 5573 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 5574 found_type = btrfs_file_extent_type(eb, item); 5575 if (found_type == BTRFS_FILE_EXTENT_INLINE) 5576 continue; 5577 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 5578 continue; 5579 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 5580 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 5581 __exclude_logged_extent(log, key.objectid, key.offset); 5582 } 5583 5584 return 0; 5585 } 5586 5587 /** 5588 * btrfs_update_reserved_bytes - update the block_group and space info counters 5589 * @cache: The cache we are manipulating 5590 * @num_bytes: The number of bytes in question 5591 * @reserve: One of the reservation enums 5592 * @delalloc: The blocks are allocated for the delalloc write 5593 * 5594 * This is called by the allocator when it reserves space, or by somebody who is 5595 * freeing space that was never actually used on disk. For example if you 5596 * reserve some space for a new leaf in transaction A and before transaction A 5597 * commits you free that leaf, you call this with reserve set to 0 in order to 5598 * clear the reservation. 5599 * 5600 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5601 * ENOSPC accounting. For data we handle the reservation through clearing the 5602 * delalloc bits in the io_tree. 
We have to do this since we could end up 5603 * allocating less disk space for the amount of data we have reserved in the 5604 * case of compression. 5605 * 5606 * If this is a reservation and the block group has become read only we cannot 5607 * make the reservation and return -EAGAIN, otherwise this function always 5608 * succeeds. 5609 */ 5610 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5611 u64 num_bytes, int reserve, int delalloc) 5612 { 5613 struct btrfs_space_info *space_info = cache->space_info; 5614 int ret = 0; 5615 5616 spin_lock(&space_info->lock); 5617 spin_lock(&cache->lock); 5618 if (reserve != RESERVE_FREE) { 5619 if (cache->ro) { 5620 ret = -EAGAIN; 5621 } else { 5622 cache->reserved += num_bytes; 5623 space_info->bytes_reserved += num_bytes; 5624 if (reserve == RESERVE_ALLOC) { 5625 trace_btrfs_space_reservation(cache->fs_info, 5626 "space_info", space_info->flags, 5627 num_bytes, 0); 5628 space_info->bytes_may_use -= num_bytes; 5629 } 5630 5631 if (delalloc) 5632 cache->delalloc_bytes += num_bytes; 5633 } 5634 } else { 5635 if (cache->ro) 5636 space_info->bytes_readonly += num_bytes; 5637 cache->reserved -= num_bytes; 5638 space_info->bytes_reserved -= num_bytes; 5639 5640 if (delalloc) 5641 cache->delalloc_bytes -= num_bytes; 5642 } 5643 spin_unlock(&cache->lock); 5644 spin_unlock(&space_info->lock); 5645 return ret; 5646 } 5647 5648 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5649 struct btrfs_root *root) 5650 { 5651 struct btrfs_fs_info *fs_info = root->fs_info; 5652 struct btrfs_caching_control *next; 5653 struct btrfs_caching_control *caching_ctl; 5654 struct btrfs_block_group_cache *cache; 5655 5656 down_write(&fs_info->commit_root_sem); 5657 5658 list_for_each_entry_safe(caching_ctl, next, 5659 &fs_info->caching_block_groups, list) { 5660 cache = caching_ctl->block_group; 5661 if (block_group_cache_done(cache)) { 5662 cache->last_byte_to_unpin = (u64)-1; 5663 list_del_init(&caching_ctl->list); 5664 put_caching_control(caching_ctl); 5665 } else { 5666 cache->last_byte_to_unpin = caching_ctl->progress; 5667 } 5668 } 5669 5670 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5671 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5672 else 5673 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5674 5675 up_write(&fs_info->commit_root_sem); 5676 5677 update_global_block_rsv(fs_info); 5678 } 5679 5680 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, 5681 const bool return_free_space) 5682 { 5683 struct btrfs_fs_info *fs_info = root->fs_info; 5684 struct btrfs_block_group_cache *cache = NULL; 5685 struct btrfs_space_info *space_info; 5686 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5687 u64 len; 5688 bool readonly; 5689 5690 while (start <= end) { 5691 readonly = false; 5692 if (!cache || 5693 start >= cache->key.objectid + cache->key.offset) { 5694 if (cache) 5695 btrfs_put_block_group(cache); 5696 cache = btrfs_lookup_block_group(fs_info, start); 5697 BUG_ON(!cache); /* Logic error */ 5698 } 5699 5700 len = cache->key.objectid + cache->key.offset - start; 5701 len = min(len, end + 1 - start); 5702 5703 if (start < cache->last_byte_to_unpin) { 5704 len = min(len, cache->last_byte_to_unpin - start); 5705 if (return_free_space) 5706 btrfs_add_free_space(cache, start, len); 5707 } 5708 5709 start += len; 5710 space_info = cache->space_info; 5711 5712 spin_lock(&space_info->lock); 5713 spin_lock(&cache->lock); 5714 cache->pinned -= len; 5715 
space_info->bytes_pinned -= len; 5716 percpu_counter_add(&space_info->total_bytes_pinned, -len); 5717 if (cache->ro) { 5718 space_info->bytes_readonly += len; 5719 readonly = true; 5720 } 5721 spin_unlock(&cache->lock); 5722 if (!readonly && global_rsv->space_info == space_info) { 5723 spin_lock(&global_rsv->lock); 5724 if (!global_rsv->full) { 5725 len = min(len, global_rsv->size - 5726 global_rsv->reserved); 5727 global_rsv->reserved += len; 5728 space_info->bytes_may_use += len; 5729 if (global_rsv->reserved >= global_rsv->size) 5730 global_rsv->full = 1; 5731 } 5732 spin_unlock(&global_rsv->lock); 5733 } 5734 spin_unlock(&space_info->lock); 5735 } 5736 5737 if (cache) 5738 btrfs_put_block_group(cache); 5739 return 0; 5740 } 5741 5742 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5743 struct btrfs_root *root) 5744 { 5745 struct btrfs_fs_info *fs_info = root->fs_info; 5746 struct extent_io_tree *unpin; 5747 u64 start; 5748 u64 end; 5749 int ret; 5750 5751 if (trans->aborted) 5752 return 0; 5753 5754 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5755 unpin = &fs_info->freed_extents[1]; 5756 else 5757 unpin = &fs_info->freed_extents[0]; 5758 5759 while (1) { 5760 mutex_lock(&fs_info->unused_bg_unpin_mutex); 5761 ret = find_first_extent_bit(unpin, 0, &start, &end, 5762 EXTENT_DIRTY, NULL); 5763 if (ret) { 5764 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 5765 break; 5766 } 5767 5768 if (btrfs_test_opt(root, DISCARD)) 5769 ret = btrfs_discard_extent(root, start, 5770 end + 1 - start, NULL); 5771 5772 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5773 unpin_extent_range(root, start, end, true); 5774 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 5775 cond_resched(); 5776 } 5777 5778 return 0; 5779 } 5780 5781 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 5782 u64 owner, u64 root_objectid) 5783 { 5784 struct btrfs_space_info *space_info; 5785 u64 flags; 5786 5787 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5788 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 5789 flags = BTRFS_BLOCK_GROUP_SYSTEM; 5790 else 5791 flags = BTRFS_BLOCK_GROUP_METADATA; 5792 } else { 5793 flags = BTRFS_BLOCK_GROUP_DATA; 5794 } 5795 5796 space_info = __find_space_info(fs_info, flags); 5797 BUG_ON(!space_info); /* Logic bug */ 5798 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 5799 } 5800 5801 5802 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5803 struct btrfs_root *root, 5804 u64 bytenr, u64 num_bytes, u64 parent, 5805 u64 root_objectid, u64 owner_objectid, 5806 u64 owner_offset, int refs_to_drop, 5807 struct btrfs_delayed_extent_op *extent_op, 5808 int no_quota) 5809 { 5810 struct btrfs_key key; 5811 struct btrfs_path *path; 5812 struct btrfs_fs_info *info = root->fs_info; 5813 struct btrfs_root *extent_root = info->extent_root; 5814 struct extent_buffer *leaf; 5815 struct btrfs_extent_item *ei; 5816 struct btrfs_extent_inline_ref *iref; 5817 int ret; 5818 int is_data; 5819 int extent_slot = 0; 5820 int found_extent = 0; 5821 int num_to_del = 1; 5822 u32 item_size; 5823 u64 refs; 5824 int last_ref = 0; 5825 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; 5826 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5827 SKINNY_METADATA); 5828 5829 if (!info->quota_enabled || !is_fstree(root_objectid)) 5830 no_quota = 1; 5831 5832 path = btrfs_alloc_path(); 5833 if (!path) 5834 return -ENOMEM; 5835 5836 path->reada = 1; 5837 path->leave_spinning = 1; 5838 5839 is_data = owner_objectid >= 
BTRFS_FIRST_FREE_OBJECTID; 5840 BUG_ON(!is_data && refs_to_drop != 1); 5841 5842 if (is_data) 5843 skinny_metadata = 0; 5844 5845 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5846 bytenr, num_bytes, parent, 5847 root_objectid, owner_objectid, 5848 owner_offset); 5849 if (ret == 0) { 5850 extent_slot = path->slots[0]; 5851 while (extent_slot >= 0) { 5852 btrfs_item_key_to_cpu(path->nodes[0], &key, 5853 extent_slot); 5854 if (key.objectid != bytenr) 5855 break; 5856 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5857 key.offset == num_bytes) { 5858 found_extent = 1; 5859 break; 5860 } 5861 if (key.type == BTRFS_METADATA_ITEM_KEY && 5862 key.offset == owner_objectid) { 5863 found_extent = 1; 5864 break; 5865 } 5866 if (path->slots[0] - extent_slot > 5) 5867 break; 5868 extent_slot--; 5869 } 5870 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5871 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5872 if (found_extent && item_size < sizeof(*ei)) 5873 found_extent = 0; 5874 #endif 5875 if (!found_extent) { 5876 BUG_ON(iref); 5877 ret = remove_extent_backref(trans, extent_root, path, 5878 NULL, refs_to_drop, 5879 is_data, &last_ref); 5880 if (ret) { 5881 btrfs_abort_transaction(trans, extent_root, ret); 5882 goto out; 5883 } 5884 btrfs_release_path(path); 5885 path->leave_spinning = 1; 5886 5887 key.objectid = bytenr; 5888 key.type = BTRFS_EXTENT_ITEM_KEY; 5889 key.offset = num_bytes; 5890 5891 if (!is_data && skinny_metadata) { 5892 key.type = BTRFS_METADATA_ITEM_KEY; 5893 key.offset = owner_objectid; 5894 } 5895 5896 ret = btrfs_search_slot(trans, extent_root, 5897 &key, path, -1, 1); 5898 if (ret > 0 && skinny_metadata && path->slots[0]) { 5899 /* 5900 * Couldn't find our skinny metadata item, 5901 * see if we have ye olde extent item. 5902 */ 5903 path->slots[0]--; 5904 btrfs_item_key_to_cpu(path->nodes[0], &key, 5905 path->slots[0]); 5906 if (key.objectid == bytenr && 5907 key.type == BTRFS_EXTENT_ITEM_KEY && 5908 key.offset == num_bytes) 5909 ret = 0; 5910 } 5911 5912 if (ret > 0 && skinny_metadata) { 5913 skinny_metadata = false; 5914 key.objectid = bytenr; 5915 key.type = BTRFS_EXTENT_ITEM_KEY; 5916 key.offset = num_bytes; 5917 btrfs_release_path(path); 5918 ret = btrfs_search_slot(trans, extent_root, 5919 &key, path, -1, 1); 5920 } 5921 5922 if (ret) { 5923 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5924 ret, bytenr); 5925 if (ret > 0) 5926 btrfs_print_leaf(extent_root, 5927 path->nodes[0]); 5928 } 5929 if (ret < 0) { 5930 btrfs_abort_transaction(trans, extent_root, ret); 5931 goto out; 5932 } 5933 extent_slot = path->slots[0]; 5934 } 5935 } else if (WARN_ON(ret == -ENOENT)) { 5936 btrfs_print_leaf(extent_root, path->nodes[0]); 5937 btrfs_err(info, 5938 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5939 bytenr, parent, root_objectid, owner_objectid, 5940 owner_offset); 5941 btrfs_abort_transaction(trans, extent_root, ret); 5942 goto out; 5943 } else { 5944 btrfs_abort_transaction(trans, extent_root, ret); 5945 goto out; 5946 } 5947 5948 leaf = path->nodes[0]; 5949 item_size = btrfs_item_size_nr(leaf, extent_slot); 5950 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5951 if (item_size < sizeof(*ei)) { 5952 BUG_ON(found_extent || extent_slot != path->slots[0]); 5953 ret = convert_extent_item_v0(trans, extent_root, path, 5954 owner_objectid, 0); 5955 if (ret < 0) { 5956 btrfs_abort_transaction(trans, extent_root, ret); 5957 goto out; 5958 } 5959 5960 btrfs_release_path(path); 5961 path->leave_spinning = 1; 5962 5963 key.objectid = 
bytenr; 5964 key.type = BTRFS_EXTENT_ITEM_KEY; 5965 key.offset = num_bytes; 5966 5967 ret = btrfs_search_slot(trans, extent_root, &key, path, 5968 -1, 1); 5969 if (ret) { 5970 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5971 ret, bytenr); 5972 btrfs_print_leaf(extent_root, path->nodes[0]); 5973 } 5974 if (ret < 0) { 5975 btrfs_abort_transaction(trans, extent_root, ret); 5976 goto out; 5977 } 5978 5979 extent_slot = path->slots[0]; 5980 leaf = path->nodes[0]; 5981 item_size = btrfs_item_size_nr(leaf, extent_slot); 5982 } 5983 #endif 5984 BUG_ON(item_size < sizeof(*ei)); 5985 ei = btrfs_item_ptr(leaf, extent_slot, 5986 struct btrfs_extent_item); 5987 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 5988 key.type == BTRFS_EXTENT_ITEM_KEY) { 5989 struct btrfs_tree_block_info *bi; 5990 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 5991 bi = (struct btrfs_tree_block_info *)(ei + 1); 5992 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 5993 } 5994 5995 refs = btrfs_extent_refs(leaf, ei); 5996 if (refs < refs_to_drop) { 5997 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 5998 "for bytenr %Lu", refs_to_drop, refs, bytenr); 5999 ret = -EINVAL; 6000 btrfs_abort_transaction(trans, extent_root, ret); 6001 goto out; 6002 } 6003 refs -= refs_to_drop; 6004 6005 if (refs > 0) { 6006 type = BTRFS_QGROUP_OPER_SUB_SHARED; 6007 if (extent_op) 6008 __run_delayed_extent_op(extent_op, leaf, ei); 6009 /* 6010 * In the case of inline back ref, reference count will 6011 * be updated by remove_extent_backref 6012 */ 6013 if (iref) { 6014 BUG_ON(!found_extent); 6015 } else { 6016 btrfs_set_extent_refs(leaf, ei, refs); 6017 btrfs_mark_buffer_dirty(leaf); 6018 } 6019 if (found_extent) { 6020 ret = remove_extent_backref(trans, extent_root, path, 6021 iref, refs_to_drop, 6022 is_data, &last_ref); 6023 if (ret) { 6024 btrfs_abort_transaction(trans, extent_root, ret); 6025 goto out; 6026 } 6027 } 6028 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 6029 root_objectid); 6030 } else { 6031 if (found_extent) { 6032 BUG_ON(is_data && refs_to_drop != 6033 extent_data_ref_count(root, path, iref)); 6034 if (iref) { 6035 BUG_ON(path->slots[0] != extent_slot); 6036 } else { 6037 BUG_ON(path->slots[0] != extent_slot + 1); 6038 path->slots[0] = extent_slot; 6039 num_to_del = 2; 6040 } 6041 } 6042 6043 last_ref = 1; 6044 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6045 num_to_del); 6046 if (ret) { 6047 btrfs_abort_transaction(trans, extent_root, ret); 6048 goto out; 6049 } 6050 btrfs_release_path(path); 6051 6052 if (is_data) { 6053 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 6054 if (ret) { 6055 btrfs_abort_transaction(trans, extent_root, ret); 6056 goto out; 6057 } 6058 } 6059 6060 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 6061 if (ret) { 6062 btrfs_abort_transaction(trans, extent_root, ret); 6063 goto out; 6064 } 6065 } 6066 btrfs_release_path(path); 6067 6068 /* Deal with the quota accounting */ 6069 if (!ret && last_ref && !no_quota) { 6070 int mod_seq = 0; 6071 6072 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && 6073 type == BTRFS_QGROUP_OPER_SUB_SHARED) 6074 mod_seq = 1; 6075 6076 ret = btrfs_qgroup_record_ref(trans, info, root_objectid, 6077 bytenr, num_bytes, type, 6078 mod_seq); 6079 } 6080 out: 6081 btrfs_free_path(path); 6082 return ret; 6083 } 6084 6085 /* 6086 * when we free an block, it is possible (and likely) that we free the last 6087 * delayed ref for that extent as well. 
This searches the delayed ref tree for 6088 * a given extent, and if there are no other delayed refs to be processed, it 6089 * removes it from the tree. 6090 */ 6091 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 6092 struct btrfs_root *root, u64 bytenr) 6093 { 6094 struct btrfs_delayed_ref_head *head; 6095 struct btrfs_delayed_ref_root *delayed_refs; 6096 int ret = 0; 6097 6098 delayed_refs = &trans->transaction->delayed_refs; 6099 spin_lock(&delayed_refs->lock); 6100 head = btrfs_find_delayed_ref_head(trans, bytenr); 6101 if (!head) 6102 goto out_delayed_unlock; 6103 6104 spin_lock(&head->lock); 6105 if (rb_first(&head->ref_root)) 6106 goto out; 6107 6108 if (head->extent_op) { 6109 if (!head->must_insert_reserved) 6110 goto out; 6111 btrfs_free_delayed_extent_op(head->extent_op); 6112 head->extent_op = NULL; 6113 } 6114 6115 /* 6116 * waiting for the lock here would deadlock. If someone else has it 6117 * locked they are already in the process of dropping it anyway 6118 */ 6119 if (!mutex_trylock(&head->mutex)) 6120 goto out; 6121 6122 /* 6123 * at this point we have a head with no other entries. Go 6124 * ahead and process it. 6125 */ 6126 head->node.in_tree = 0; 6127 rb_erase(&head->href_node, &delayed_refs->href_root); 6128 6129 atomic_dec(&delayed_refs->num_entries); 6130 6131 /* 6132 * we don't take a ref on the node because we're removing it from the 6133 * tree, so we just steal the ref the tree was holding. 6134 */ 6135 delayed_refs->num_heads--; 6136 if (head->processing == 0) 6137 delayed_refs->num_heads_ready--; 6138 head->processing = 0; 6139 spin_unlock(&head->lock); 6140 spin_unlock(&delayed_refs->lock); 6141 6142 BUG_ON(head->extent_op); 6143 if (head->must_insert_reserved) 6144 ret = 1; 6145 6146 mutex_unlock(&head->mutex); 6147 btrfs_put_delayed_ref(&head->node); 6148 return ret; 6149 out: 6150 spin_unlock(&head->lock); 6151 6152 out_delayed_unlock: 6153 spin_unlock(&delayed_refs->lock); 6154 return 0; 6155 } 6156 6157 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 6158 struct btrfs_root *root, 6159 struct extent_buffer *buf, 6160 u64 parent, int last_ref) 6161 { 6162 int pin = 1; 6163 int ret; 6164 6165 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6166 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6167 buf->start, buf->len, 6168 parent, root->root_key.objectid, 6169 btrfs_header_level(buf), 6170 BTRFS_DROP_DELAYED_REF, NULL, 0); 6171 BUG_ON(ret); /* -ENOMEM */ 6172 } 6173 6174 if (!last_ref) 6175 return; 6176 6177 if (btrfs_header_generation(buf) == trans->transid) { 6178 struct btrfs_block_group_cache *cache; 6179 6180 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6181 ret = check_ref_cleanup(trans, root, buf->start); 6182 if (!ret) 6183 goto out; 6184 } 6185 6186 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 6187 6188 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6189 pin_down_extent(root, cache, buf->start, buf->len, 1); 6190 btrfs_put_block_group(cache); 6191 goto out; 6192 } 6193 6194 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6195 6196 btrfs_add_free_space(cache, buf->start, buf->len); 6197 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 6198 btrfs_put_block_group(cache); 6199 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6200 pin = 0; 6201 } 6202 out: 6203 if (pin) 6204 add_pinned_bytes(root->fs_info, buf->len, 6205 btrfs_header_level(buf), 6206 root->root_key.objectid); 6207 6208 /* 6209 * Deleting the buffer, clear the 
corrupt flag since it doesn't matter 6210 * anymore. 6211 */ 6212 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6213 } 6214 6215 /* Can return -ENOMEM */ 6216 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6217 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6218 u64 owner, u64 offset, int no_quota) 6219 { 6220 int ret; 6221 struct btrfs_fs_info *fs_info = root->fs_info; 6222 6223 if (btrfs_test_is_dummy_root(root)) 6224 return 0; 6225 6226 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6227 6228 /* 6229 * tree log blocks never actually go into the extent allocation 6230 * tree, just update pinning info and exit early. 6231 */ 6232 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 6233 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 6234 /* unlocks the pinned mutex */ 6235 btrfs_pin_extent(root, bytenr, num_bytes, 1); 6236 ret = 0; 6237 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6238 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6239 num_bytes, 6240 parent, root_objectid, (int)owner, 6241 BTRFS_DROP_DELAYED_REF, NULL, no_quota); 6242 } else { 6243 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6244 num_bytes, 6245 parent, root_objectid, owner, 6246 offset, BTRFS_DROP_DELAYED_REF, 6247 NULL, no_quota); 6248 } 6249 return ret; 6250 } 6251 6252 /* 6253 * when we wait for progress in the block group caching, its because 6254 * our allocation attempt failed at least once. So, we must sleep 6255 * and let some progress happen before we try again. 6256 * 6257 * This function will sleep at least once waiting for new free space to 6258 * show up, and then it will check the block group free space numbers 6259 * for our min num_bytes. Another option is to have it go ahead 6260 * and look in the rbtree for a free extent of a given size, but this 6261 * is a good start. 6262 * 6263 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 6264 * any of the information in this block group. 6265 */ 6266 static noinline void 6267 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 6268 u64 num_bytes) 6269 { 6270 struct btrfs_caching_control *caching_ctl; 6271 6272 caching_ctl = get_caching_control(cache); 6273 if (!caching_ctl) 6274 return; 6275 6276 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 6277 (cache->free_space_ctl->free_space >= num_bytes)); 6278 6279 put_caching_control(caching_ctl); 6280 } 6281 6282 static noinline int 6283 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 6284 { 6285 struct btrfs_caching_control *caching_ctl; 6286 int ret = 0; 6287 6288 caching_ctl = get_caching_control(cache); 6289 if (!caching_ctl) 6290 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 6291 6292 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 6293 if (cache->cached == BTRFS_CACHE_ERROR) 6294 ret = -EIO; 6295 put_caching_control(caching_ctl); 6296 return ret; 6297 } 6298 6299 int __get_raid_index(u64 flags) 6300 { 6301 if (flags & BTRFS_BLOCK_GROUP_RAID10) 6302 return BTRFS_RAID_RAID10; 6303 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 6304 return BTRFS_RAID_RAID1; 6305 else if (flags & BTRFS_BLOCK_GROUP_DUP) 6306 return BTRFS_RAID_DUP; 6307 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 6308 return BTRFS_RAID_RAID0; 6309 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 6310 return BTRFS_RAID_RAID5; 6311 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 6312 return BTRFS_RAID_RAID6; 6313 6314 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 6315 } 6316 6317 int get_block_group_index(struct btrfs_block_group_cache *cache) 6318 { 6319 return __get_raid_index(cache->flags); 6320 } 6321 6322 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 6323 [BTRFS_RAID_RAID10] = "raid10", 6324 [BTRFS_RAID_RAID1] = "raid1", 6325 [BTRFS_RAID_DUP] = "dup", 6326 [BTRFS_RAID_RAID0] = "raid0", 6327 [BTRFS_RAID_SINGLE] = "single", 6328 [BTRFS_RAID_RAID5] = "raid5", 6329 [BTRFS_RAID_RAID6] = "raid6", 6330 }; 6331 6332 static const char *get_raid_name(enum btrfs_raid_types type) 6333 { 6334 if (type >= BTRFS_NR_RAID_TYPES) 6335 return NULL; 6336 6337 return btrfs_raid_type_names[type]; 6338 } 6339 6340 enum btrfs_loop_type { 6341 LOOP_CACHING_NOWAIT = 0, 6342 LOOP_CACHING_WAIT = 1, 6343 LOOP_ALLOC_CHUNK = 2, 6344 LOOP_NO_EMPTY_SIZE = 3, 6345 }; 6346 6347 static inline void 6348 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 6349 int delalloc) 6350 { 6351 if (delalloc) 6352 down_read(&cache->data_rwsem); 6353 } 6354 6355 static inline void 6356 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 6357 int delalloc) 6358 { 6359 btrfs_get_block_group(cache); 6360 if (delalloc) 6361 down_read(&cache->data_rwsem); 6362 } 6363 6364 static struct btrfs_block_group_cache * 6365 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 6366 struct btrfs_free_cluster *cluster, 6367 int delalloc) 6368 { 6369 struct btrfs_block_group_cache *used_bg; 6370 bool locked = false; 6371 again: 6372 spin_lock(&cluster->refill_lock); 6373 if (locked) { 6374 if (used_bg == cluster->block_group) 6375 return used_bg; 6376 6377 up_read(&used_bg->data_rwsem); 6378 btrfs_put_block_group(used_bg); 6379 } 6380 6381 used_bg = cluster->block_group; 6382 if (!used_bg) 6383 return NULL; 6384 6385 if (used_bg == block_group) 6386 return used_bg; 6387 6388 btrfs_get_block_group(used_bg); 6389 6390 if (!delalloc) 6391 return used_bg; 6392 6393 if (down_read_trylock(&used_bg->data_rwsem)) 6394 return used_bg; 6395 6396 spin_unlock(&cluster->refill_lock); 6397 down_read(&used_bg->data_rwsem); 6398 locked = true; 6399 goto again; 6400 } 6401 6402 static inline void 6403 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 6404 int delalloc) 6405 { 6406 if (delalloc) 6407 up_read(&cache->data_rwsem); 6408 btrfs_put_block_group(cache); 6409 } 6410 6411 /* 6412 * walks the btree of allocated extents and find a hole of a given size. 6413 * The key ins is changed to record the hole: 6414 * ins->objectid == start position 6415 * ins->flags = BTRFS_EXTENT_ITEM_KEY 6416 * ins->offset == the size of the hole. 6417 * Any available blocks before search_start are skipped. 
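 *
 * The search starts from hint_byte (or the current cluster's window, if
 * one is active) and walks the block groups of the matching raid index.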
6418 * 6419 * If there is no suitable free space, we will record the max size of 6420 * the free space extent currently. 6421 */ 6422 static noinline int find_free_extent(struct btrfs_root *orig_root, 6423 u64 num_bytes, u64 empty_size, 6424 u64 hint_byte, struct btrfs_key *ins, 6425 u64 flags, int delalloc) 6426 { 6427 int ret = 0; 6428 struct btrfs_root *root = orig_root->fs_info->extent_root; 6429 struct btrfs_free_cluster *last_ptr = NULL; 6430 struct btrfs_block_group_cache *block_group = NULL; 6431 u64 search_start = 0; 6432 u64 max_extent_size = 0; 6433 int empty_cluster = 2 * 1024 * 1024; 6434 struct btrfs_space_info *space_info; 6435 int loop = 0; 6436 int index = __get_raid_index(flags); 6437 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 6438 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 6439 bool failed_cluster_refill = false; 6440 bool failed_alloc = false; 6441 bool use_cluster = true; 6442 bool have_caching_bg = false; 6443 6444 WARN_ON(num_bytes < root->sectorsize); 6445 ins->type = BTRFS_EXTENT_ITEM_KEY; 6446 ins->objectid = 0; 6447 ins->offset = 0; 6448 6449 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 6450 6451 space_info = __find_space_info(root->fs_info, flags); 6452 if (!space_info) { 6453 btrfs_err(root->fs_info, "No space info for %llu", flags); 6454 return -ENOSPC; 6455 } 6456 6457 /* 6458 * If the space info is for both data and metadata it means we have a 6459 * small filesystem and we can't use the clustering stuff. 6460 */ 6461 if (btrfs_mixed_space_info(space_info)) 6462 use_cluster = false; 6463 6464 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 6465 last_ptr = &root->fs_info->meta_alloc_cluster; 6466 if (!btrfs_test_opt(root, SSD)) 6467 empty_cluster = 64 * 1024; 6468 } 6469 6470 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 6471 btrfs_test_opt(root, SSD)) { 6472 last_ptr = &root->fs_info->data_alloc_cluster; 6473 } 6474 6475 if (last_ptr) { 6476 spin_lock(&last_ptr->lock); 6477 if (last_ptr->block_group) 6478 hint_byte = last_ptr->window_start; 6479 spin_unlock(&last_ptr->lock); 6480 } 6481 6482 search_start = max(search_start, first_logical_byte(root, 0)); 6483 search_start = max(search_start, hint_byte); 6484 6485 if (!last_ptr) 6486 empty_cluster = 0; 6487 6488 if (search_start == hint_byte) { 6489 block_group = btrfs_lookup_block_group(root->fs_info, 6490 search_start); 6491 /* 6492 * we don't want to use the block group if it doesn't match our 6493 * allocation bits, or if its not cached. 6494 * 6495 * However if we are re-searching with an ideal block group 6496 * picked out then we don't care that the block group is cached. 
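		 * If the hinted block group is being removed or has gone
		 * read-only we fall through to the normal search loop below.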
6497 */ 6498 if (block_group && block_group_bits(block_group, flags) && 6499 block_group->cached != BTRFS_CACHE_NO) { 6500 down_read(&space_info->groups_sem); 6501 if (list_empty(&block_group->list) || 6502 block_group->ro) { 6503 /* 6504 * someone is removing this block group, 6505 * we can't jump into the have_block_group 6506 * target because our list pointers are not 6507 * valid 6508 */ 6509 btrfs_put_block_group(block_group); 6510 up_read(&space_info->groups_sem); 6511 } else { 6512 index = get_block_group_index(block_group); 6513 btrfs_lock_block_group(block_group, delalloc); 6514 goto have_block_group; 6515 } 6516 } else if (block_group) { 6517 btrfs_put_block_group(block_group); 6518 } 6519 } 6520 search: 6521 have_caching_bg = false; 6522 down_read(&space_info->groups_sem); 6523 list_for_each_entry(block_group, &space_info->block_groups[index], 6524 list) { 6525 u64 offset; 6526 int cached; 6527 6528 btrfs_grab_block_group(block_group, delalloc); 6529 search_start = block_group->key.objectid; 6530 6531 /* 6532 * this can happen if we end up cycling through all the 6533 * raid types, but we want to make sure we only allocate 6534 * for the proper type. 6535 */ 6536 if (!block_group_bits(block_group, flags)) { 6537 u64 extra = BTRFS_BLOCK_GROUP_DUP | 6538 BTRFS_BLOCK_GROUP_RAID1 | 6539 BTRFS_BLOCK_GROUP_RAID5 | 6540 BTRFS_BLOCK_GROUP_RAID6 | 6541 BTRFS_BLOCK_GROUP_RAID10; 6542 6543 /* 6544 * if they asked for extra copies and this block group 6545 * doesn't provide them, bail. This does allow us to 6546 * fill raid0 from raid1. 6547 */ 6548 if ((flags & extra) && !(block_group->flags & extra)) 6549 goto loop; 6550 } 6551 6552 have_block_group: 6553 cached = block_group_cache_done(block_group); 6554 if (unlikely(!cached)) { 6555 ret = cache_block_group(block_group, 0); 6556 BUG_ON(ret < 0); 6557 ret = 0; 6558 } 6559 6560 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 6561 goto loop; 6562 if (unlikely(block_group->ro)) 6563 goto loop; 6564 6565 /* 6566 * Ok we want to try and use the cluster allocator, so 6567 * lets look there 6568 */ 6569 if (last_ptr) { 6570 struct btrfs_block_group_cache *used_block_group; 6571 unsigned long aligned_cluster; 6572 /* 6573 * the refill lock keeps out other 6574 * people trying to start a new cluster 6575 */ 6576 used_block_group = btrfs_lock_cluster(block_group, 6577 last_ptr, 6578 delalloc); 6579 if (!used_block_group) 6580 goto refill_cluster; 6581 6582 if (used_block_group != block_group && 6583 (used_block_group->ro || 6584 !block_group_bits(used_block_group, flags))) 6585 goto release_cluster; 6586 6587 offset = btrfs_alloc_from_cluster(used_block_group, 6588 last_ptr, 6589 num_bytes, 6590 used_block_group->key.objectid, 6591 &max_extent_size); 6592 if (offset) { 6593 /* we have a block, we're done */ 6594 spin_unlock(&last_ptr->refill_lock); 6595 trace_btrfs_reserve_extent_cluster(root, 6596 used_block_group, 6597 search_start, num_bytes); 6598 if (used_block_group != block_group) { 6599 btrfs_release_block_group(block_group, 6600 delalloc); 6601 block_group = used_block_group; 6602 } 6603 goto checks; 6604 } 6605 6606 WARN_ON(last_ptr->block_group != used_block_group); 6607 release_cluster: 6608 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6609 * set up a new clusters, so lets just skip it 6610 * and let the allocator find whatever block 6611 * it can find. 
If we reach this point, we 6612 * will have tried the cluster allocator 6613 * plenty of times and not have found 6614 * anything, so we are likely way too 6615 * fragmented for the clustering stuff to find 6616 * anything. 6617 * 6618 * However, if the cluster is taken from the 6619 * current block group, release the cluster 6620 * first, so that we stand a better chance of 6621 * succeeding in the unclustered 6622 * allocation. */ 6623 if (loop >= LOOP_NO_EMPTY_SIZE && 6624 used_block_group != block_group) { 6625 spin_unlock(&last_ptr->refill_lock); 6626 btrfs_release_block_group(used_block_group, 6627 delalloc); 6628 goto unclustered_alloc; 6629 } 6630 6631 /* 6632 * this cluster didn't work out, free it and 6633 * start over 6634 */ 6635 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6636 6637 if (used_block_group != block_group) 6638 btrfs_release_block_group(used_block_group, 6639 delalloc); 6640 refill_cluster: 6641 if (loop >= LOOP_NO_EMPTY_SIZE) { 6642 spin_unlock(&last_ptr->refill_lock); 6643 goto unclustered_alloc; 6644 } 6645 6646 aligned_cluster = max_t(unsigned long, 6647 empty_cluster + empty_size, 6648 block_group->full_stripe_len); 6649 6650 /* allocate a cluster in this block group */ 6651 ret = btrfs_find_space_cluster(root, block_group, 6652 last_ptr, search_start, 6653 num_bytes, 6654 aligned_cluster); 6655 if (ret == 0) { 6656 /* 6657 * now pull our allocation out of this 6658 * cluster 6659 */ 6660 offset = btrfs_alloc_from_cluster(block_group, 6661 last_ptr, 6662 num_bytes, 6663 search_start, 6664 &max_extent_size); 6665 if (offset) { 6666 /* we found one, proceed */ 6667 spin_unlock(&last_ptr->refill_lock); 6668 trace_btrfs_reserve_extent_cluster(root, 6669 block_group, search_start, 6670 num_bytes); 6671 goto checks; 6672 } 6673 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6674 && !failed_cluster_refill) { 6675 spin_unlock(&last_ptr->refill_lock); 6676 6677 failed_cluster_refill = true; 6678 wait_block_group_cache_progress(block_group, 6679 num_bytes + empty_cluster + empty_size); 6680 goto have_block_group; 6681 } 6682 6683 /* 6684 * at this point we either didn't find a cluster 6685 * or we weren't able to allocate a block from our 6686 * cluster. Free the cluster we've been trying 6687 * to use, and go to the next block group 6688 */ 6689 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6690 spin_unlock(&last_ptr->refill_lock); 6691 goto loop; 6692 } 6693 6694 unclustered_alloc: 6695 spin_lock(&block_group->free_space_ctl->tree_lock); 6696 if (cached && 6697 block_group->free_space_ctl->free_space < 6698 num_bytes + empty_cluster + empty_size) { 6699 if (block_group->free_space_ctl->free_space > 6700 max_extent_size) 6701 max_extent_size = 6702 block_group->free_space_ctl->free_space; 6703 spin_unlock(&block_group->free_space_ctl->tree_lock); 6704 goto loop; 6705 } 6706 spin_unlock(&block_group->free_space_ctl->tree_lock); 6707 6708 offset = btrfs_find_space_for_alloc(block_group, search_start, 6709 num_bytes, empty_size, 6710 &max_extent_size); 6711 /* 6712 * If we didn't find a chunk, and we haven't failed on this 6713 * block group before, and this block group is in the middle of 6714 * caching and we are ok with waiting, then go ahead and wait 6715 * for progress to be made, and set failed_alloc to true. 6716 * 6717 * If failed_alloc is true then we've already waited on this 6718 * block group once and should move on to the next block group. 
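		 * (The wait path jumps back to have_block_group, so cached is
		 * re-evaluated once the caching thread has made progress.)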
6719 */ 6720 if (!offset && !failed_alloc && !cached && 6721 loop > LOOP_CACHING_NOWAIT) { 6722 wait_block_group_cache_progress(block_group, 6723 num_bytes + empty_size); 6724 failed_alloc = true; 6725 goto have_block_group; 6726 } else if (!offset) { 6727 if (!cached) 6728 have_caching_bg = true; 6729 goto loop; 6730 } 6731 checks: 6732 search_start = ALIGN(offset, root->stripesize); 6733 6734 /* move on to the next group */ 6735 if (search_start + num_bytes > 6736 block_group->key.objectid + block_group->key.offset) { 6737 btrfs_add_free_space(block_group, offset, num_bytes); 6738 goto loop; 6739 } 6740 6741 if (offset < search_start) 6742 btrfs_add_free_space(block_group, offset, 6743 search_start - offset); 6744 BUG_ON(offset > search_start); 6745 6746 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 6747 alloc_type, delalloc); 6748 if (ret == -EAGAIN) { 6749 btrfs_add_free_space(block_group, offset, num_bytes); 6750 goto loop; 6751 } 6752 6753 /* we are all good, lets return */ 6754 ins->objectid = search_start; 6755 ins->offset = num_bytes; 6756 6757 trace_btrfs_reserve_extent(orig_root, block_group, 6758 search_start, num_bytes); 6759 btrfs_release_block_group(block_group, delalloc); 6760 break; 6761 loop: 6762 failed_cluster_refill = false; 6763 failed_alloc = false; 6764 BUG_ON(index != get_block_group_index(block_group)); 6765 btrfs_release_block_group(block_group, delalloc); 6766 } 6767 up_read(&space_info->groups_sem); 6768 6769 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6770 goto search; 6771 6772 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6773 goto search; 6774 6775 /* 6776 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6777 * caching kthreads as we move along 6778 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6779 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6780 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6781 * again 6782 */ 6783 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6784 index = 0; 6785 loop++; 6786 if (loop == LOOP_ALLOC_CHUNK) { 6787 struct btrfs_trans_handle *trans; 6788 int exist = 0; 6789 6790 trans = current->journal_info; 6791 if (trans) 6792 exist = 1; 6793 else 6794 trans = btrfs_join_transaction(root); 6795 6796 if (IS_ERR(trans)) { 6797 ret = PTR_ERR(trans); 6798 goto out; 6799 } 6800 6801 ret = do_chunk_alloc(trans, root, flags, 6802 CHUNK_ALLOC_FORCE); 6803 /* 6804 * Do not bail out on ENOSPC since we 6805 * can do more things. 6806 */ 6807 if (ret < 0 && ret != -ENOSPC) 6808 btrfs_abort_transaction(trans, 6809 root, ret); 6810 else 6811 ret = 0; 6812 if (!exist) 6813 btrfs_end_transaction(trans, root); 6814 if (ret) 6815 goto out; 6816 } 6817 6818 if (loop == LOOP_NO_EMPTY_SIZE) { 6819 empty_size = 0; 6820 empty_cluster = 0; 6821 } 6822 6823 goto search; 6824 } else if (!ins->objectid) { 6825 ret = -ENOSPC; 6826 } else if (ins->objectid) { 6827 ret = 0; 6828 } 6829 out: 6830 if (ret == -ENOSPC) 6831 ins->offset = max_extent_size; 6832 return ret; 6833 } 6834 6835 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6836 int dump_block_groups) 6837 { 6838 struct btrfs_block_group_cache *cache; 6839 int index = 0; 6840 6841 spin_lock(&info->lock); 6842 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", 6843 info->flags, 6844 info->total_bytes - info->bytes_used - info->bytes_pinned - 6845 info->bytes_reserved - info->bytes_readonly, 6846 (info->full) ? 
"" : "not "); 6847 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " 6848 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6849 info->total_bytes, info->bytes_used, info->bytes_pinned, 6850 info->bytes_reserved, info->bytes_may_use, 6851 info->bytes_readonly); 6852 spin_unlock(&info->lock); 6853 6854 if (!dump_block_groups) 6855 return; 6856 6857 down_read(&info->groups_sem); 6858 again: 6859 list_for_each_entry(cache, &info->block_groups[index], list) { 6860 spin_lock(&cache->lock); 6861 printk(KERN_INFO "BTRFS: " 6862 "block group %llu has %llu bytes, " 6863 "%llu used %llu pinned %llu reserved %s\n", 6864 cache->key.objectid, cache->key.offset, 6865 btrfs_block_group_used(&cache->item), cache->pinned, 6866 cache->reserved, cache->ro ? "[readonly]" : ""); 6867 btrfs_dump_free_space(cache, bytes); 6868 spin_unlock(&cache->lock); 6869 } 6870 if (++index < BTRFS_NR_RAID_TYPES) 6871 goto again; 6872 up_read(&info->groups_sem); 6873 } 6874 6875 int btrfs_reserve_extent(struct btrfs_root *root, 6876 u64 num_bytes, u64 min_alloc_size, 6877 u64 empty_size, u64 hint_byte, 6878 struct btrfs_key *ins, int is_data, int delalloc) 6879 { 6880 bool final_tried = false; 6881 u64 flags; 6882 int ret; 6883 6884 flags = btrfs_get_alloc_profile(root, is_data); 6885 again: 6886 WARN_ON(num_bytes < root->sectorsize); 6887 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 6888 flags, delalloc); 6889 6890 if (ret == -ENOSPC) { 6891 if (!final_tried && ins->offset) { 6892 num_bytes = min(num_bytes >> 1, ins->offset); 6893 num_bytes = round_down(num_bytes, root->sectorsize); 6894 num_bytes = max(num_bytes, min_alloc_size); 6895 if (num_bytes == min_alloc_size) 6896 final_tried = true; 6897 goto again; 6898 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6899 struct btrfs_space_info *sinfo; 6900 6901 sinfo = __find_space_info(root->fs_info, flags); 6902 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6903 flags, num_bytes); 6904 if (sinfo) 6905 dump_space_info(sinfo, num_bytes, 1); 6906 } 6907 } 6908 6909 return ret; 6910 } 6911 6912 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6913 u64 start, u64 len, 6914 int pin, int delalloc) 6915 { 6916 struct btrfs_block_group_cache *cache; 6917 int ret = 0; 6918 6919 cache = btrfs_lookup_block_group(root->fs_info, start); 6920 if (!cache) { 6921 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6922 start); 6923 return -ENOSPC; 6924 } 6925 6926 if (btrfs_test_opt(root, DISCARD)) 6927 ret = btrfs_discard_extent(root, start, len, NULL); 6928 6929 if (pin) 6930 pin_down_extent(root, cache, start, len, 1); 6931 else { 6932 btrfs_add_free_space(cache, start, len); 6933 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 6934 } 6935 btrfs_put_block_group(cache); 6936 6937 trace_btrfs_reserved_extent_free(root, start, len); 6938 6939 return ret; 6940 } 6941 6942 int btrfs_free_reserved_extent(struct btrfs_root *root, 6943 u64 start, u64 len, int delalloc) 6944 { 6945 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); 6946 } 6947 6948 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6949 u64 start, u64 len) 6950 { 6951 return __btrfs_free_reserved_extent(root, start, len, 1, 0); 6952 } 6953 6954 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6955 struct btrfs_root *root, 6956 u64 parent, u64 root_objectid, 6957 u64 flags, u64 owner, u64 offset, 6958 struct btrfs_key *ins, int ref_mod) 6959 { 6960 int ret; 6961 struct 
btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	/* Always set parent to 0 here since it's exclusive anyway.
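	 * A freshly allocated data extent has a single owner at this point,
	 * so the qgroup reference below is recorded as
	 * BTRFS_QGROUP_OPER_ADD_EXCL.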
*/ 7016 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7017 ins->objectid, ins->offset, 7018 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7019 if (ret) 7020 return ret; 7021 7022 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 7023 if (ret) { /* -ENOENT, logic error */ 7024 btrfs_err(fs_info, "update block group failed for %llu %llu", 7025 ins->objectid, ins->offset); 7026 BUG(); 7027 } 7028 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 7029 return ret; 7030 } 7031 7032 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 7033 struct btrfs_root *root, 7034 u64 parent, u64 root_objectid, 7035 u64 flags, struct btrfs_disk_key *key, 7036 int level, struct btrfs_key *ins, 7037 int no_quota) 7038 { 7039 int ret; 7040 struct btrfs_fs_info *fs_info = root->fs_info; 7041 struct btrfs_extent_item *extent_item; 7042 struct btrfs_tree_block_info *block_info; 7043 struct btrfs_extent_inline_ref *iref; 7044 struct btrfs_path *path; 7045 struct extent_buffer *leaf; 7046 u32 size = sizeof(*extent_item) + sizeof(*iref); 7047 u64 num_bytes = ins->offset; 7048 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7049 SKINNY_METADATA); 7050 7051 if (!skinny_metadata) 7052 size += sizeof(*block_info); 7053 7054 path = btrfs_alloc_path(); 7055 if (!path) { 7056 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7057 root->nodesize); 7058 return -ENOMEM; 7059 } 7060 7061 path->leave_spinning = 1; 7062 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7063 ins, size); 7064 if (ret) { 7065 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7066 root->nodesize); 7067 btrfs_free_path(path); 7068 return ret; 7069 } 7070 7071 leaf = path->nodes[0]; 7072 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7073 struct btrfs_extent_item); 7074 btrfs_set_extent_refs(leaf, extent_item, 1); 7075 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7076 btrfs_set_extent_flags(leaf, extent_item, 7077 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 7078 7079 if (skinny_metadata) { 7080 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7081 num_bytes = root->nodesize; 7082 } else { 7083 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7084 btrfs_set_tree_block_key(leaf, block_info, key); 7085 btrfs_set_tree_block_level(leaf, block_info, level); 7086 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 7087 } 7088 7089 if (parent > 0) { 7090 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 7091 btrfs_set_extent_inline_ref_type(leaf, iref, 7092 BTRFS_SHARED_BLOCK_REF_KEY); 7093 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7094 } else { 7095 btrfs_set_extent_inline_ref_type(leaf, iref, 7096 BTRFS_TREE_BLOCK_REF_KEY); 7097 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 7098 } 7099 7100 btrfs_mark_buffer_dirty(leaf); 7101 btrfs_free_path(path); 7102 7103 if (!no_quota) { 7104 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7105 ins->objectid, num_bytes, 7106 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7107 if (ret) 7108 return ret; 7109 } 7110 7111 ret = update_block_group(trans, root, ins->objectid, root->nodesize, 7112 1); 7113 if (ret) { /* -ENOENT, logic error */ 7114 btrfs_err(fs_info, "update block group failed for %llu %llu", 7115 ins->objectid, ins->offset); 7116 BUG(); 7117 } 7118 7119 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); 7120 return ret; 7121 } 7122 7123 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7124 struct 
btrfs_root *root,
				     u64 root_objectid, u64 owner,
				     u64 offset, struct btrfs_key *ins)
{
	int ret;

	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);

	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
					 ins->offset, 0,
					 root_objectid, owner, offset,
					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
	return ret;
}

/*
 * this is used by the tree logging recovery code. It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	int ret;
	struct btrfs_block_group_cache *block_group;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
					  RESERVE_ALLOC_NO_ACCOUNT, 0);
	BUG_ON(ret); /* logic error */
	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
					 0, owner, offset, ins, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level)
{
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(root, bytenr);
	if (!buf)
		return ERR_PTR(-ENOMEM);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
	btrfs_tree_lock(buf);
	clean_tree_block(trans, root, buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);

	btrfs_set_lock_blocking(buf);
	btrfs_set_buffer_uptodate(buf);

	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use a different
		 * EXTENT bit to differentiate dirty pages.
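		 * log_index 0 marks its pages with EXTENT_DIRTY, log_index 1
		 * uses EXTENT_NEW, matching the set_extent_dirty() and
		 * set_extent_new() calls below.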
7198 */ 7199 if (buf->log_index == 0) 7200 set_extent_dirty(&root->dirty_log_pages, buf->start, 7201 buf->start + buf->len - 1, GFP_NOFS); 7202 else 7203 set_extent_new(&root->dirty_log_pages, buf->start, 7204 buf->start + buf->len - 1, GFP_NOFS); 7205 } else { 7206 buf->log_index = -1; 7207 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7208 buf->start + buf->len - 1, GFP_NOFS); 7209 } 7210 trans->blocks_used++; 7211 /* this returns a buffer locked for blocking */ 7212 return buf; 7213 } 7214 7215 static struct btrfs_block_rsv * 7216 use_block_rsv(struct btrfs_trans_handle *trans, 7217 struct btrfs_root *root, u32 blocksize) 7218 { 7219 struct btrfs_block_rsv *block_rsv; 7220 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 7221 int ret; 7222 bool global_updated = false; 7223 7224 block_rsv = get_block_rsv(trans, root); 7225 7226 if (unlikely(block_rsv->size == 0)) 7227 goto try_reserve; 7228 again: 7229 ret = block_rsv_use_bytes(block_rsv, blocksize); 7230 if (!ret) 7231 return block_rsv; 7232 7233 if (block_rsv->failfast) 7234 return ERR_PTR(ret); 7235 7236 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 7237 global_updated = true; 7238 update_global_block_rsv(root->fs_info); 7239 goto again; 7240 } 7241 7242 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 7243 static DEFINE_RATELIMIT_STATE(_rs, 7244 DEFAULT_RATELIMIT_INTERVAL * 10, 7245 /*DEFAULT_RATELIMIT_BURST*/ 1); 7246 if (__ratelimit(&_rs)) 7247 WARN(1, KERN_DEBUG 7248 "BTRFS: block rsv returned %d\n", ret); 7249 } 7250 try_reserve: 7251 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 7252 BTRFS_RESERVE_NO_FLUSH); 7253 if (!ret) 7254 return block_rsv; 7255 /* 7256 * If we couldn't reserve metadata bytes try and use some from 7257 * the global reserve if its space type is the same as the global 7258 * reservation. 7259 */ 7260 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 7261 block_rsv->space_info == global_rsv->space_info) { 7262 ret = block_rsv_use_bytes(global_rsv, blocksize); 7263 if (!ret) 7264 return global_rsv; 7265 } 7266 return ERR_PTR(ret); 7267 } 7268 7269 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 7270 struct btrfs_block_rsv *block_rsv, u32 blocksize) 7271 { 7272 block_rsv_add_bytes(block_rsv, blocksize, 0); 7273 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 7274 } 7275 7276 /* 7277 * finds a free extent and does all the dirty work required for allocation 7278 * returns the key for the extent through ins, and a tree buffer for 7279 * the first block of the extent through buf. 7280 * 7281 * returns the tree buffer or NULL. 
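 * (In practice the error paths here return an ERR_PTR rather than NULL,
 * so callers check the result with IS_ERR().)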
7282 */ 7283 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 7284 struct btrfs_root *root, 7285 u64 parent, u64 root_objectid, 7286 struct btrfs_disk_key *key, int level, 7287 u64 hint, u64 empty_size) 7288 { 7289 struct btrfs_key ins; 7290 struct btrfs_block_rsv *block_rsv; 7291 struct extent_buffer *buf; 7292 u64 flags = 0; 7293 int ret; 7294 u32 blocksize = root->nodesize; 7295 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7296 SKINNY_METADATA); 7297 7298 if (btrfs_test_is_dummy_root(root)) { 7299 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7300 level); 7301 if (!IS_ERR(buf)) 7302 root->alloc_bytenr += blocksize; 7303 return buf; 7304 } 7305 7306 block_rsv = use_block_rsv(trans, root, blocksize); 7307 if (IS_ERR(block_rsv)) 7308 return ERR_CAST(block_rsv); 7309 7310 ret = btrfs_reserve_extent(root, blocksize, blocksize, 7311 empty_size, hint, &ins, 0, 0); 7312 if (ret) { 7313 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 7314 return ERR_PTR(ret); 7315 } 7316 7317 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); 7318 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7319 7320 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7321 if (parent == 0) 7322 parent = ins.objectid; 7323 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 7324 } else 7325 BUG_ON(parent > 0); 7326 7327 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 7328 struct btrfs_delayed_extent_op *extent_op; 7329 extent_op = btrfs_alloc_delayed_extent_op(); 7330 BUG_ON(!extent_op); /* -ENOMEM */ 7331 if (key) 7332 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 7333 else 7334 memset(&extent_op->key, 0, sizeof(extent_op->key)); 7335 extent_op->flags_to_set = flags; 7336 if (skinny_metadata) 7337 extent_op->update_key = 0; 7338 else 7339 extent_op->update_key = 1; 7340 extent_op->update_flags = 1; 7341 extent_op->is_data = 0; 7342 extent_op->level = level; 7343 7344 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 7345 ins.objectid, 7346 ins.offset, parent, root_objectid, 7347 level, BTRFS_ADD_DELAYED_EXTENT, 7348 extent_op, 0); 7349 BUG_ON(ret); /* -ENOMEM */ 7350 } 7351 return buf; 7352 } 7353 7354 struct walk_control { 7355 u64 refs[BTRFS_MAX_LEVEL]; 7356 u64 flags[BTRFS_MAX_LEVEL]; 7357 struct btrfs_key update_progress; 7358 int stage; 7359 int level; 7360 int shared_level; 7361 int update_ref; 7362 int keep_locks; 7363 int reada_slot; 7364 int reada_count; 7365 int for_reloc; 7366 }; 7367 7368 #define DROP_REFERENCE 1 7369 #define UPDATE_BACKREF 2 7370 7371 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 7372 struct btrfs_root *root, 7373 struct walk_control *wc, 7374 struct btrfs_path *path) 7375 { 7376 u64 bytenr; 7377 u64 generation; 7378 u64 refs; 7379 u64 flags; 7380 u32 nritems; 7381 u32 blocksize; 7382 struct btrfs_key key; 7383 struct extent_buffer *eb; 7384 int ret; 7385 int slot; 7386 int nread = 0; 7387 7388 if (path->slots[wc->level] < wc->reada_slot) { 7389 wc->reada_count = wc->reada_count * 2 / 3; 7390 wc->reada_count = max(wc->reada_count, 2); 7391 } else { 7392 wc->reada_count = wc->reada_count * 3 / 2; 7393 wc->reada_count = min_t(int, wc->reada_count, 7394 BTRFS_NODEPTRS_PER_BLOCK(root)); 7395 } 7396 7397 eb = path->nodes[wc->level]; 7398 nritems = btrfs_header_nritems(eb); 7399 blocksize = root->nodesize; 7400 7401 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7402 if (nread >= wc->reada_count) 7403 break; 7404 7405 cond_resched(); 7406 bytenr = btrfs_node_blockptr(eb, slot); 7407 generation = 
btrfs_node_ptr_generation(eb, slot); 7408 7409 if (slot == path->slots[wc->level]) 7410 goto reada; 7411 7412 if (wc->stage == UPDATE_BACKREF && 7413 generation <= root->root_key.offset) 7414 continue; 7415 7416 /* We don't lock the tree block, it's OK to be racy here */ 7417 ret = btrfs_lookup_extent_info(trans, root, bytenr, 7418 wc->level - 1, 1, &refs, 7419 &flags); 7420 /* We don't care about errors in readahead. */ 7421 if (ret < 0) 7422 continue; 7423 BUG_ON(refs == 0); 7424 7425 if (wc->stage == DROP_REFERENCE) { 7426 if (refs == 1) 7427 goto reada; 7428 7429 if (wc->level == 1 && 7430 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7431 continue; 7432 if (!wc->update_ref || 7433 generation <= root->root_key.offset) 7434 continue; 7435 btrfs_node_key_to_cpu(eb, &key, slot); 7436 ret = btrfs_comp_cpu_keys(&key, 7437 &wc->update_progress); 7438 if (ret < 0) 7439 continue; 7440 } else { 7441 if (wc->level == 1 && 7442 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7443 continue; 7444 } 7445 reada: 7446 readahead_tree_block(root, bytenr); 7447 nread++; 7448 } 7449 wc->reada_slot = slot; 7450 } 7451 7452 static int account_leaf_items(struct btrfs_trans_handle *trans, 7453 struct btrfs_root *root, 7454 struct extent_buffer *eb) 7455 { 7456 int nr = btrfs_header_nritems(eb); 7457 int i, extent_type, ret; 7458 struct btrfs_key key; 7459 struct btrfs_file_extent_item *fi; 7460 u64 bytenr, num_bytes; 7461 7462 for (i = 0; i < nr; i++) { 7463 btrfs_item_key_to_cpu(eb, &key, i); 7464 7465 if (key.type != BTRFS_EXTENT_DATA_KEY) 7466 continue; 7467 7468 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 7469 /* filter out non qgroup-accountable extents */ 7470 extent_type = btrfs_file_extent_type(eb, fi); 7471 7472 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 7473 continue; 7474 7475 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 7476 if (!bytenr) 7477 continue; 7478 7479 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 7480 7481 ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7482 root->objectid, 7483 bytenr, num_bytes, 7484 BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); 7485 if (ret) 7486 return ret; 7487 } 7488 return 0; 7489 } 7490 7491 /* 7492 * Walk up the tree from the bottom, freeing leaves and any interior 7493 * nodes which have had all slots visited. If a node (leaf or 7494 * interior) is freed, the node above it will have it's slot 7495 * incremented. The root node will never be freed. 7496 * 7497 * At the end of this function, we should have a path which has all 7498 * slots incremented to the next position for a search. If we need to 7499 * read a new node it will be NULL and the node above it will have the 7500 * correct slot selected for a later read. 7501 * 7502 * If we increment the root nodes slot counter past the number of 7503 * elements, 1 is returned to signal completion of the search. 7504 */ 7505 static int adjust_slots_upwards(struct btrfs_root *root, 7506 struct btrfs_path *path, int root_level) 7507 { 7508 int level = 0; 7509 int nr, slot; 7510 struct extent_buffer *eb; 7511 7512 if (root_level == 0) 7513 return 1; 7514 7515 while (level <= root_level) { 7516 eb = path->nodes[level]; 7517 nr = btrfs_header_nritems(eb); 7518 path->slots[level]++; 7519 slot = path->slots[level]; 7520 if (slot >= nr || level == 0) { 7521 /* 7522 * Don't free the root - we will detect this 7523 * condition after our loop and return a 7524 * positive value for caller to stop walking the tree. 
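			 * (After the loop, path->slots[root_level] is compared
			 * against the number of items in the root node and 1
			 * is returned once it runs past the end.)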
7525 */ 7526 if (level != root_level) { 7527 btrfs_tree_unlock_rw(eb, path->locks[level]); 7528 path->locks[level] = 0; 7529 7530 free_extent_buffer(eb); 7531 path->nodes[level] = NULL; 7532 path->slots[level] = 0; 7533 } 7534 } else { 7535 /* 7536 * We have a valid slot to walk back down 7537 * from. Stop here so caller can process these 7538 * new nodes. 7539 */ 7540 break; 7541 } 7542 7543 level++; 7544 } 7545 7546 eb = path->nodes[root_level]; 7547 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 7548 return 1; 7549 7550 return 0; 7551 } 7552 7553 /* 7554 * root_eb is the subtree root and is locked before this function is called. 7555 */ 7556 static int account_shared_subtree(struct btrfs_trans_handle *trans, 7557 struct btrfs_root *root, 7558 struct extent_buffer *root_eb, 7559 u64 root_gen, 7560 int root_level) 7561 { 7562 int ret = 0; 7563 int level; 7564 struct extent_buffer *eb = root_eb; 7565 struct btrfs_path *path = NULL; 7566 7567 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); 7568 BUG_ON(root_eb == NULL); 7569 7570 if (!root->fs_info->quota_enabled) 7571 return 0; 7572 7573 if (!extent_buffer_uptodate(root_eb)) { 7574 ret = btrfs_read_buffer(root_eb, root_gen); 7575 if (ret) 7576 goto out; 7577 } 7578 7579 if (root_level == 0) { 7580 ret = account_leaf_items(trans, root, root_eb); 7581 goto out; 7582 } 7583 7584 path = btrfs_alloc_path(); 7585 if (!path) 7586 return -ENOMEM; 7587 7588 /* 7589 * Walk down the tree. Missing extent blocks are filled in as 7590 * we go. Metadata is accounted every time we read a new 7591 * extent block. 7592 * 7593 * When we reach a leaf, we account for file extent items in it, 7594 * walk back up the tree (adjusting slot pointers as we go) 7595 * and restart the search process. 7596 */ 7597 extent_buffer_get(root_eb); /* For path */ 7598 path->nodes[root_level] = root_eb; 7599 path->slots[root_level] = 0; 7600 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 7601 walk_down: 7602 level = root_level; 7603 while (level >= 0) { 7604 if (path->nodes[level] == NULL) { 7605 int parent_slot; 7606 u64 child_gen; 7607 u64 child_bytenr; 7608 7609 /* We need to get child blockptr/gen from 7610 * parent before we can read it. */ 7611 eb = path->nodes[level + 1]; 7612 parent_slot = path->slots[level + 1]; 7613 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 7614 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7615 7616 eb = read_tree_block(root, child_bytenr, child_gen); 7617 if (!eb || !extent_buffer_uptodate(eb)) { 7618 ret = -EIO; 7619 goto out; 7620 } 7621 7622 path->nodes[level] = eb; 7623 path->slots[level] = 0; 7624 7625 btrfs_tree_read_lock(eb); 7626 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 7627 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 7628 7629 ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7630 root->objectid, 7631 child_bytenr, 7632 root->nodesize, 7633 BTRFS_QGROUP_OPER_SUB_SUBTREE, 7634 0); 7635 if (ret) 7636 goto out; 7637 7638 } 7639 7640 if (level == 0) { 7641 ret = account_leaf_items(trans, root, path->nodes[level]); 7642 if (ret) 7643 goto out; 7644 7645 /* Nonzero return here means we completed our search */ 7646 ret = adjust_slots_upwards(root, path, root_level); 7647 if (ret) 7648 break; 7649 7650 /* Restart search with new slots */ 7651 goto walk_down; 7652 } 7653 7654 level--; 7655 } 7656 7657 ret = 0; 7658 out: 7659 btrfs_free_path(path); 7660 7661 return ret; 7662 } 7663 7664 /* 7665 * helper to process tree block while walking down the tree. 
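 *
 * when wc->stage == DROP_REFERENCE and the block is still shared
 * (refs > 1), the walk down stops here and 1 is returned.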
7666 * 7667 * when wc->stage == UPDATE_BACKREF, this function updates 7668 * back refs for pointers in the block. 7669 * 7670 * NOTE: return value 1 means we should stop walking down. 7671 */ 7672 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 7673 struct btrfs_root *root, 7674 struct btrfs_path *path, 7675 struct walk_control *wc, int lookup_info) 7676 { 7677 int level = wc->level; 7678 struct extent_buffer *eb = path->nodes[level]; 7679 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7680 int ret; 7681 7682 if (wc->stage == UPDATE_BACKREF && 7683 btrfs_header_owner(eb) != root->root_key.objectid) 7684 return 1; 7685 7686 /* 7687 * when reference count of tree block is 1, it won't increase 7688 * again. once full backref flag is set, we never clear it. 7689 */ 7690 if (lookup_info && 7691 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 7692 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 7693 BUG_ON(!path->locks[level]); 7694 ret = btrfs_lookup_extent_info(trans, root, 7695 eb->start, level, 1, 7696 &wc->refs[level], 7697 &wc->flags[level]); 7698 BUG_ON(ret == -ENOMEM); 7699 if (ret) 7700 return ret; 7701 BUG_ON(wc->refs[level] == 0); 7702 } 7703 7704 if (wc->stage == DROP_REFERENCE) { 7705 if (wc->refs[level] > 1) 7706 return 1; 7707 7708 if (path->locks[level] && !wc->keep_locks) { 7709 btrfs_tree_unlock_rw(eb, path->locks[level]); 7710 path->locks[level] = 0; 7711 } 7712 return 0; 7713 } 7714 7715 /* wc->stage == UPDATE_BACKREF */ 7716 if (!(wc->flags[level] & flag)) { 7717 BUG_ON(!path->locks[level]); 7718 ret = btrfs_inc_ref(trans, root, eb, 1); 7719 BUG_ON(ret); /* -ENOMEM */ 7720 ret = btrfs_dec_ref(trans, root, eb, 0); 7721 BUG_ON(ret); /* -ENOMEM */ 7722 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7723 eb->len, flag, 7724 btrfs_header_level(eb), 0); 7725 BUG_ON(ret); /* -ENOMEM */ 7726 wc->flags[level] |= flag; 7727 } 7728 7729 /* 7730 * the block is shared by multiple trees, so it's not good to 7731 * keep the tree lock 7732 */ 7733 if (path->locks[level] && level > 0) { 7734 btrfs_tree_unlock_rw(eb, path->locks[level]); 7735 path->locks[level] = 0; 7736 } 7737 return 0; 7738 } 7739 7740 /* 7741 * helper to process tree block pointer. 7742 * 7743 * when wc->stage == DROP_REFERENCE, this function checks 7744 * reference count of the block pointed to. if the block 7745 * is shared and we need update back refs for the subtree 7746 * rooted at the block, this function changes wc->stage to 7747 * UPDATE_BACKREF. if the block is shared and there is no 7748 * need to update back, this function drops the reference 7749 * to the block. 7750 * 7751 * NOTE: return value 1 means we should stop walking down. 
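 *
 * rough illustration of the stage switch: while in DROP_REFERENCE, if
 * wc->update_ref is set and the child block at level - 1 still has
 * refs > 1 with a generation newer than the snapshot, this function
 * switches wc->stage to UPDATE_BACKREF and sets wc->shared_level to
 * level - 1; walk_up_proc() flips the stage back to DROP_REFERENCE
 * once that shared level has been processed.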
7752 */ 7753 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 7754 struct btrfs_root *root, 7755 struct btrfs_path *path, 7756 struct walk_control *wc, int *lookup_info) 7757 { 7758 u64 bytenr; 7759 u64 generation; 7760 u64 parent; 7761 u32 blocksize; 7762 struct btrfs_key key; 7763 struct extent_buffer *next; 7764 int level = wc->level; 7765 int reada = 0; 7766 int ret = 0; 7767 bool need_account = false; 7768 7769 generation = btrfs_node_ptr_generation(path->nodes[level], 7770 path->slots[level]); 7771 /* 7772 * if the lower level block was created before the snapshot 7773 * was created, we know there is no need to update back refs 7774 * for the subtree 7775 */ 7776 if (wc->stage == UPDATE_BACKREF && 7777 generation <= root->root_key.offset) { 7778 *lookup_info = 1; 7779 return 1; 7780 } 7781 7782 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7783 blocksize = root->nodesize; 7784 7785 next = btrfs_find_tree_block(root, bytenr); 7786 if (!next) { 7787 next = btrfs_find_create_tree_block(root, bytenr); 7788 if (!next) 7789 return -ENOMEM; 7790 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7791 level - 1); 7792 reada = 1; 7793 } 7794 btrfs_tree_lock(next); 7795 btrfs_set_lock_blocking(next); 7796 7797 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7798 &wc->refs[level - 1], 7799 &wc->flags[level - 1]); 7800 if (ret < 0) { 7801 btrfs_tree_unlock(next); 7802 return ret; 7803 } 7804 7805 if (unlikely(wc->refs[level - 1] == 0)) { 7806 btrfs_err(root->fs_info, "Missing references."); 7807 BUG(); 7808 } 7809 *lookup_info = 0; 7810 7811 if (wc->stage == DROP_REFERENCE) { 7812 if (wc->refs[level - 1] > 1) { 7813 need_account = true; 7814 if (level == 1 && 7815 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7816 goto skip; 7817 7818 if (!wc->update_ref || 7819 generation <= root->root_key.offset) 7820 goto skip; 7821 7822 btrfs_node_key_to_cpu(path->nodes[level], &key, 7823 path->slots[level]); 7824 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7825 if (ret < 0) 7826 goto skip; 7827 7828 wc->stage = UPDATE_BACKREF; 7829 wc->shared_level = level - 1; 7830 } 7831 } else { 7832 if (level == 1 && 7833 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7834 goto skip; 7835 } 7836 7837 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7838 btrfs_tree_unlock(next); 7839 free_extent_buffer(next); 7840 next = NULL; 7841 *lookup_info = 1; 7842 } 7843 7844 if (!next) { 7845 if (reada && level == 1) 7846 reada_walk_down(trans, root, wc, path); 7847 next = read_tree_block(root, bytenr, generation); 7848 if (!next || !extent_buffer_uptodate(next)) { 7849 free_extent_buffer(next); 7850 return -EIO; 7851 } 7852 btrfs_tree_lock(next); 7853 btrfs_set_lock_blocking(next); 7854 } 7855 7856 level--; 7857 BUG_ON(level != btrfs_header_level(next)); 7858 path->nodes[level] = next; 7859 path->slots[level] = 0; 7860 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7861 wc->level = level; 7862 if (wc->level == 1) 7863 wc->reada_slot = 0; 7864 return 0; 7865 skip: 7866 wc->refs[level - 1] = 0; 7867 wc->flags[level - 1] = 0; 7868 if (wc->stage == DROP_REFERENCE) { 7869 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7870 parent = path->nodes[level]->start; 7871 } else { 7872 BUG_ON(root->root_key.objectid != 7873 btrfs_header_owner(path->nodes[level])); 7874 parent = 0; 7875 } 7876 7877 if (need_account) { 7878 ret = account_shared_subtree(trans, root, next, 7879 generation, level - 1); 7880 if (ret) { 7881 printk_ratelimited(KERN_ERR 
"BTRFS: %s Error " 7882 "%d accounting shared subtree. Quota " 7883 "is out of sync, rescan required.\n", 7884 root->fs_info->sb->s_id, ret); 7885 } 7886 } 7887 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7888 root->root_key.objectid, level - 1, 0, 0); 7889 BUG_ON(ret); /* -ENOMEM */ 7890 } 7891 btrfs_tree_unlock(next); 7892 free_extent_buffer(next); 7893 *lookup_info = 1; 7894 return 1; 7895 } 7896 7897 /* 7898 * helper to process tree block while walking up the tree. 7899 * 7900 * when wc->stage == DROP_REFERENCE, this function drops 7901 * reference count on the block. 7902 * 7903 * when wc->stage == UPDATE_BACKREF, this function changes 7904 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7905 * to UPDATE_BACKREF previously while processing the block. 7906 * 7907 * NOTE: return value 1 means we should stop walking up. 7908 */ 7909 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7910 struct btrfs_root *root, 7911 struct btrfs_path *path, 7912 struct walk_control *wc) 7913 { 7914 int ret; 7915 int level = wc->level; 7916 struct extent_buffer *eb = path->nodes[level]; 7917 u64 parent = 0; 7918 7919 if (wc->stage == UPDATE_BACKREF) { 7920 BUG_ON(wc->shared_level < level); 7921 if (level < wc->shared_level) 7922 goto out; 7923 7924 ret = find_next_key(path, level + 1, &wc->update_progress); 7925 if (ret > 0) 7926 wc->update_ref = 0; 7927 7928 wc->stage = DROP_REFERENCE; 7929 wc->shared_level = -1; 7930 path->slots[level] = 0; 7931 7932 /* 7933 * check reference count again if the block isn't locked. 7934 * we should start walking down the tree again if reference 7935 * count is one. 7936 */ 7937 if (!path->locks[level]) { 7938 BUG_ON(level == 0); 7939 btrfs_tree_lock(eb); 7940 btrfs_set_lock_blocking(eb); 7941 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7942 7943 ret = btrfs_lookup_extent_info(trans, root, 7944 eb->start, level, 1, 7945 &wc->refs[level], 7946 &wc->flags[level]); 7947 if (ret < 0) { 7948 btrfs_tree_unlock_rw(eb, path->locks[level]); 7949 path->locks[level] = 0; 7950 return ret; 7951 } 7952 BUG_ON(wc->refs[level] == 0); 7953 if (wc->refs[level] == 1) { 7954 btrfs_tree_unlock_rw(eb, path->locks[level]); 7955 path->locks[level] = 0; 7956 return 1; 7957 } 7958 } 7959 } 7960 7961 /* wc->stage == DROP_REFERENCE */ 7962 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7963 7964 if (wc->refs[level] == 1) { 7965 if (level == 0) { 7966 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7967 ret = btrfs_dec_ref(trans, root, eb, 1); 7968 else 7969 ret = btrfs_dec_ref(trans, root, eb, 0); 7970 BUG_ON(ret); /* -ENOMEM */ 7971 ret = account_leaf_items(trans, root, eb); 7972 if (ret) { 7973 printk_ratelimited(KERN_ERR "BTRFS: %s Error " 7974 "%d accounting leaf items. 
Quota " 7975 "is out of sync, rescan required.\n", 7976 root->fs_info->sb->s_id, ret); 7977 } 7978 } 7979 /* make block locked assertion in clean_tree_block happy */ 7980 if (!path->locks[level] && 7981 btrfs_header_generation(eb) == trans->transid) { 7982 btrfs_tree_lock(eb); 7983 btrfs_set_lock_blocking(eb); 7984 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7985 } 7986 clean_tree_block(trans, root, eb); 7987 } 7988 7989 if (eb == root->node) { 7990 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7991 parent = eb->start; 7992 else 7993 BUG_ON(root->root_key.objectid != 7994 btrfs_header_owner(eb)); 7995 } else { 7996 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7997 parent = path->nodes[level + 1]->start; 7998 else 7999 BUG_ON(root->root_key.objectid != 8000 btrfs_header_owner(path->nodes[level + 1])); 8001 } 8002 8003 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 8004 out: 8005 wc->refs[level] = 0; 8006 wc->flags[level] = 0; 8007 return 0; 8008 } 8009 8010 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 8011 struct btrfs_root *root, 8012 struct btrfs_path *path, 8013 struct walk_control *wc) 8014 { 8015 int level = wc->level; 8016 int lookup_info = 1; 8017 int ret; 8018 8019 while (level >= 0) { 8020 ret = walk_down_proc(trans, root, path, wc, lookup_info); 8021 if (ret > 0) 8022 break; 8023 8024 if (level == 0) 8025 break; 8026 8027 if (path->slots[level] >= 8028 btrfs_header_nritems(path->nodes[level])) 8029 break; 8030 8031 ret = do_walk_down(trans, root, path, wc, &lookup_info); 8032 if (ret > 0) { 8033 path->slots[level]++; 8034 continue; 8035 } else if (ret < 0) 8036 return ret; 8037 level = wc->level; 8038 } 8039 return 0; 8040 } 8041 8042 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 8043 struct btrfs_root *root, 8044 struct btrfs_path *path, 8045 struct walk_control *wc, int max_level) 8046 { 8047 int level = wc->level; 8048 int ret; 8049 8050 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 8051 while (level < max_level && path->nodes[level]) { 8052 wc->level = level; 8053 if (path->slots[level] + 1 < 8054 btrfs_header_nritems(path->nodes[level])) { 8055 path->slots[level]++; 8056 return 0; 8057 } else { 8058 ret = walk_up_proc(trans, root, path, wc); 8059 if (ret > 0) 8060 return 0; 8061 8062 if (path->locks[level]) { 8063 btrfs_tree_unlock_rw(path->nodes[level], 8064 path->locks[level]); 8065 path->locks[level] = 0; 8066 } 8067 free_extent_buffer(path->nodes[level]); 8068 path->nodes[level] = NULL; 8069 level++; 8070 } 8071 } 8072 return 1; 8073 } 8074 8075 /* 8076 * drop a subvolume tree. 8077 * 8078 * this function traverses the tree freeing any blocks that only 8079 * referenced by the tree. 8080 * 8081 * when a shared tree block is found. this function decreases its 8082 * reference count by one. if update_ref is true, this function 8083 * also make sure backrefs for the shared block and all lower level 8084 * blocks are properly updated. 
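 *
 * the walk is restartable: drop_progress and drop_level in the root
 * item record how far a previous transaction got, so after a crash or
 * an -EAGAIN exit the next call resumes from that key instead of
 * rescanning the whole tree.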
8085 * 8086 * If called with for_reloc == 0, may exit early with -EAGAIN 8087 */ 8088 int btrfs_drop_snapshot(struct btrfs_root *root, 8089 struct btrfs_block_rsv *block_rsv, int update_ref, 8090 int for_reloc) 8091 { 8092 struct btrfs_path *path; 8093 struct btrfs_trans_handle *trans; 8094 struct btrfs_root *tree_root = root->fs_info->tree_root; 8095 struct btrfs_root_item *root_item = &root->root_item; 8096 struct walk_control *wc; 8097 struct btrfs_key key; 8098 int err = 0; 8099 int ret; 8100 int level; 8101 bool root_dropped = false; 8102 8103 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); 8104 8105 path = btrfs_alloc_path(); 8106 if (!path) { 8107 err = -ENOMEM; 8108 goto out; 8109 } 8110 8111 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8112 if (!wc) { 8113 btrfs_free_path(path); 8114 err = -ENOMEM; 8115 goto out; 8116 } 8117 8118 trans = btrfs_start_transaction(tree_root, 0); 8119 if (IS_ERR(trans)) { 8120 err = PTR_ERR(trans); 8121 goto out_free; 8122 } 8123 8124 if (block_rsv) 8125 trans->block_rsv = block_rsv; 8126 8127 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 8128 level = btrfs_header_level(root->node); 8129 path->nodes[level] = btrfs_lock_root_node(root); 8130 btrfs_set_lock_blocking(path->nodes[level]); 8131 path->slots[level] = 0; 8132 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8133 memset(&wc->update_progress, 0, 8134 sizeof(wc->update_progress)); 8135 } else { 8136 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 8137 memcpy(&wc->update_progress, &key, 8138 sizeof(wc->update_progress)); 8139 8140 level = root_item->drop_level; 8141 BUG_ON(level == 0); 8142 path->lowest_level = level; 8143 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 8144 path->lowest_level = 0; 8145 if (ret < 0) { 8146 err = ret; 8147 goto out_end_trans; 8148 } 8149 WARN_ON(ret > 0); 8150 8151 /* 8152 * unlock our path, this is safe because only this 8153 * function is allowed to delete this snapshot 8154 */ 8155 btrfs_unlock_up_safe(path, 0); 8156 8157 level = btrfs_header_level(root->node); 8158 while (1) { 8159 btrfs_tree_lock(path->nodes[level]); 8160 btrfs_set_lock_blocking(path->nodes[level]); 8161 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8162 8163 ret = btrfs_lookup_extent_info(trans, root, 8164 path->nodes[level]->start, 8165 level, 1, &wc->refs[level], 8166 &wc->flags[level]); 8167 if (ret < 0) { 8168 err = ret; 8169 goto out_end_trans; 8170 } 8171 BUG_ON(wc->refs[level] == 0); 8172 8173 if (level == root_item->drop_level) 8174 break; 8175 8176 btrfs_tree_unlock(path->nodes[level]); 8177 path->locks[level] = 0; 8178 WARN_ON(wc->refs[level] != 1); 8179 level--; 8180 } 8181 } 8182 8183 wc->level = level; 8184 wc->shared_level = -1; 8185 wc->stage = DROP_REFERENCE; 8186 wc->update_ref = update_ref; 8187 wc->keep_locks = 0; 8188 wc->for_reloc = for_reloc; 8189 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8190 8191 while (1) { 8192 8193 ret = walk_down_tree(trans, root, path, wc); 8194 if (ret < 0) { 8195 err = ret; 8196 break; 8197 } 8198 8199 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 8200 if (ret < 0) { 8201 err = ret; 8202 break; 8203 } 8204 8205 if (ret > 0) { 8206 BUG_ON(wc->stage != DROP_REFERENCE); 8207 break; 8208 } 8209 8210 if (wc->stage == DROP_REFERENCE) { 8211 level = wc->level; 8212 btrfs_node_key(path->nodes[level], 8213 &root_item->drop_progress, 8214 path->slots[level]); 8215 root_item->drop_level = level; 8216 } 8217 8218 BUG_ON(wc->level == 0); 8219 if (btrfs_should_end_transaction(trans, tree_root) 
|| 8220 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 8221 ret = btrfs_update_root(trans, tree_root, 8222 &root->root_key, 8223 root_item); 8224 if (ret) { 8225 btrfs_abort_transaction(trans, tree_root, ret); 8226 err = ret; 8227 goto out_end_trans; 8228 } 8229 8230 /* 8231 * Qgroup update accounting is run from 8232 * delayed ref handling. This usually works 8233 * out because delayed refs are normally the 8234 * only way qgroup updates are added. However, 8235 * we may have added updates during our tree 8236 * walk so run qgroups here to make sure we 8237 * don't lose any updates. 8238 */ 8239 ret = btrfs_delayed_qgroup_accounting(trans, 8240 root->fs_info); 8241 if (ret) 8242 printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8243 "running qgroup updates " 8244 "during snapshot delete. " 8245 "Quota is out of sync, " 8246 "rescan required.\n", ret); 8247 8248 btrfs_end_transaction_throttle(trans, tree_root); 8249 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8250 pr_debug("BTRFS: drop snapshot early exit\n"); 8251 err = -EAGAIN; 8252 goto out_free; 8253 } 8254 8255 trans = btrfs_start_transaction(tree_root, 0); 8256 if (IS_ERR(trans)) { 8257 err = PTR_ERR(trans); 8258 goto out_free; 8259 } 8260 if (block_rsv) 8261 trans->block_rsv = block_rsv; 8262 } 8263 } 8264 btrfs_release_path(path); 8265 if (err) 8266 goto out_end_trans; 8267 8268 ret = btrfs_del_root(trans, tree_root, &root->root_key); 8269 if (ret) { 8270 btrfs_abort_transaction(trans, tree_root, ret); 8271 goto out_end_trans; 8272 } 8273 8274 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 8275 ret = btrfs_find_root(tree_root, &root->root_key, path, 8276 NULL, NULL); 8277 if (ret < 0) { 8278 btrfs_abort_transaction(trans, tree_root, ret); 8279 err = ret; 8280 goto out_end_trans; 8281 } else if (ret > 0) { 8282 /* if we fail to delete the orphan item this time 8283 * around, it'll get picked up the next time. 8284 * 8285 * The most common failure here is just -ENOENT. 8286 */ 8287 btrfs_del_orphan_item(trans, tree_root, 8288 root->root_key.objectid); 8289 } 8290 } 8291 8292 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 8293 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 8294 } else { 8295 free_extent_buffer(root->node); 8296 free_extent_buffer(root->commit_root); 8297 btrfs_put_fs_root(root); 8298 } 8299 root_dropped = true; 8300 out_end_trans: 8301 ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); 8302 if (ret) 8303 printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8304 "running qgroup updates " 8305 "during snapshot delete. " 8306 "Quota is out of sync, " 8307 "rescan required.\n", ret); 8308 8309 btrfs_end_transaction_throttle(trans, tree_root); 8310 out_free: 8311 kfree(wc); 8312 btrfs_free_path(path); 8313 out: 8314 /* 8315 * So if we need to stop dropping the snapshot for whatever reason we 8316 * need to make sure to add it back to the dead root list so that we 8317 * keep trying to do the work later. This also cleans up roots if we 8318 * don't have it in the radix (like when we recover after a power fail 8319 * or unmount) so we don't leak memory. 8320 */ 8321 if (!for_reloc && root_dropped == false) 8322 btrfs_add_dead_root(root); 8323 if (err && err != -EAGAIN) 8324 btrfs_std_error(root->fs_info, err); 8325 return err; 8326 } 8327 8328 /* 8329 * drop subtree rooted at tree block 'node'. 
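 *
 * this reuses the same walk_control machinery as btrfs_drop_snapshot(),
 * but the parent is pre-seeded at its final slot and walk_up_tree() is
 * capped at parent_level, so only the subtree under 'node' is processed.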
8330 * 8331 * NOTE: this function will unlock and release tree block 'node' 8332 * only used by relocation code 8333 */ 8334 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 8335 struct btrfs_root *root, 8336 struct extent_buffer *node, 8337 struct extent_buffer *parent) 8338 { 8339 struct btrfs_path *path; 8340 struct walk_control *wc; 8341 int level; 8342 int parent_level; 8343 int ret = 0; 8344 int wret; 8345 8346 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 8347 8348 path = btrfs_alloc_path(); 8349 if (!path) 8350 return -ENOMEM; 8351 8352 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8353 if (!wc) { 8354 btrfs_free_path(path); 8355 return -ENOMEM; 8356 } 8357 8358 btrfs_assert_tree_locked(parent); 8359 parent_level = btrfs_header_level(parent); 8360 extent_buffer_get(parent); 8361 path->nodes[parent_level] = parent; 8362 path->slots[parent_level] = btrfs_header_nritems(parent); 8363 8364 btrfs_assert_tree_locked(node); 8365 level = btrfs_header_level(node); 8366 path->nodes[level] = node; 8367 path->slots[level] = 0; 8368 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8369 8370 wc->refs[parent_level] = 1; 8371 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8372 wc->level = level; 8373 wc->shared_level = -1; 8374 wc->stage = DROP_REFERENCE; 8375 wc->update_ref = 0; 8376 wc->keep_locks = 1; 8377 wc->for_reloc = 1; 8378 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8379 8380 while (1) { 8381 wret = walk_down_tree(trans, root, path, wc); 8382 if (wret < 0) { 8383 ret = wret; 8384 break; 8385 } 8386 8387 wret = walk_up_tree(trans, root, path, wc, parent_level); 8388 if (wret < 0) 8389 ret = wret; 8390 if (wret != 0) 8391 break; 8392 } 8393 8394 kfree(wc); 8395 btrfs_free_path(path); 8396 return ret; 8397 } 8398 8399 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 8400 { 8401 u64 num_devices; 8402 u64 stripped; 8403 8404 /* 8405 * if restripe for this chunk_type is on pick target profile and 8406 * return, otherwise do the usual balance 8407 */ 8408 stripped = get_restripe_target(root->fs_info, flags); 8409 if (stripped) 8410 return extended_to_chunk(stripped); 8411 8412 num_devices = root->fs_info->fs_devices->rw_devices; 8413 8414 stripped = BTRFS_BLOCK_GROUP_RAID0 | 8415 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 8416 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 8417 8418 if (num_devices == 1) { 8419 stripped |= BTRFS_BLOCK_GROUP_DUP; 8420 stripped = flags & ~stripped; 8421 8422 /* turn raid0 into single device chunks */ 8423 if (flags & BTRFS_BLOCK_GROUP_RAID0) 8424 return stripped; 8425 8426 /* turn mirroring into duplication */ 8427 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 8428 BTRFS_BLOCK_GROUP_RAID10)) 8429 return stripped | BTRFS_BLOCK_GROUP_DUP; 8430 } else { 8431 /* they already had raid on here, just return */ 8432 if (flags & stripped) 8433 return flags; 8434 8435 stripped |= BTRFS_BLOCK_GROUP_DUP; 8436 stripped = flags & ~stripped; 8437 8438 /* switch duplicated blocks with raid1 */ 8439 if (flags & BTRFS_BLOCK_GROUP_DUP) 8440 return stripped | BTRFS_BLOCK_GROUP_RAID1; 8441 8442 /* this is drive concat, leave it alone */ 8443 } 8444 8445 return flags; 8446 } 8447 8448 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 8449 { 8450 struct btrfs_space_info *sinfo = cache->space_info; 8451 u64 num_bytes; 8452 u64 min_allocable_bytes; 8453 int ret = -ENOSPC; 8454 8455 8456 /* 8457 * We need some metadata space and system metadata space for 8458 * allocating chunks in some corner cases 
until we force to set 8459 * it to be readonly. 8460 */ 8461 if ((sinfo->flags & 8462 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 8463 !force) 8464 min_allocable_bytes = 1 * 1024 * 1024; 8465 else 8466 min_allocable_bytes = 0; 8467 8468 spin_lock(&sinfo->lock); 8469 spin_lock(&cache->lock); 8470 8471 if (cache->ro) { 8472 ret = 0; 8473 goto out; 8474 } 8475 8476 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8477 cache->bytes_super - btrfs_block_group_used(&cache->item); 8478 8479 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 8480 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 8481 min_allocable_bytes <= sinfo->total_bytes) { 8482 sinfo->bytes_readonly += num_bytes; 8483 cache->ro = 1; 8484 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 8485 ret = 0; 8486 } 8487 out: 8488 spin_unlock(&cache->lock); 8489 spin_unlock(&sinfo->lock); 8490 return ret; 8491 } 8492 8493 int btrfs_set_block_group_ro(struct btrfs_root *root, 8494 struct btrfs_block_group_cache *cache) 8495 8496 { 8497 struct btrfs_trans_handle *trans; 8498 u64 alloc_flags; 8499 int ret; 8500 8501 BUG_ON(cache->ro); 8502 8503 trans = btrfs_join_transaction(root); 8504 if (IS_ERR(trans)) 8505 return PTR_ERR(trans); 8506 8507 ret = set_block_group_ro(cache, 0); 8508 if (!ret) 8509 goto out; 8510 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8511 ret = do_chunk_alloc(trans, root, alloc_flags, 8512 CHUNK_ALLOC_FORCE); 8513 if (ret < 0) 8514 goto out; 8515 ret = set_block_group_ro(cache, 0); 8516 out: 8517 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 8518 alloc_flags = update_block_group_flags(root, cache->flags); 8519 check_system_chunk(trans, root, alloc_flags); 8520 } 8521 8522 btrfs_end_transaction(trans, root); 8523 return ret; 8524 } 8525 8526 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 8527 struct btrfs_root *root, u64 type) 8528 { 8529 u64 alloc_flags = get_alloc_profile(root, type); 8530 return do_chunk_alloc(trans, root, alloc_flags, 8531 CHUNK_ALLOC_FORCE); 8532 } 8533 8534 /* 8535 * helper to account the unused space of all the readonly block group in the 8536 * space_info. takes mirrors into account. 
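 *
 * rough example: a read-only RAID1 block group of 1GiB with 256MiB used
 * contributes (1GiB - 256MiB) * 2 = 1.5GiB of unused raw space, while
 * the same numbers in a single or RAID0 group contribute only 768MiB
 * (factor of 1 below).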
8537 */ 8538 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 8539 { 8540 struct btrfs_block_group_cache *block_group; 8541 u64 free_bytes = 0; 8542 int factor; 8543 8544 /* It's df, we don't care if it's racey */ 8545 if (list_empty(&sinfo->ro_bgs)) 8546 return 0; 8547 8548 spin_lock(&sinfo->lock); 8549 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 8550 spin_lock(&block_group->lock); 8551 8552 if (!block_group->ro) { 8553 spin_unlock(&block_group->lock); 8554 continue; 8555 } 8556 8557 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 8558 BTRFS_BLOCK_GROUP_RAID10 | 8559 BTRFS_BLOCK_GROUP_DUP)) 8560 factor = 2; 8561 else 8562 factor = 1; 8563 8564 free_bytes += (block_group->key.offset - 8565 btrfs_block_group_used(&block_group->item)) * 8566 factor; 8567 8568 spin_unlock(&block_group->lock); 8569 } 8570 spin_unlock(&sinfo->lock); 8571 8572 return free_bytes; 8573 } 8574 8575 void btrfs_set_block_group_rw(struct btrfs_root *root, 8576 struct btrfs_block_group_cache *cache) 8577 { 8578 struct btrfs_space_info *sinfo = cache->space_info; 8579 u64 num_bytes; 8580 8581 BUG_ON(!cache->ro); 8582 8583 spin_lock(&sinfo->lock); 8584 spin_lock(&cache->lock); 8585 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8586 cache->bytes_super - btrfs_block_group_used(&cache->item); 8587 sinfo->bytes_readonly -= num_bytes; 8588 cache->ro = 0; 8589 list_del_init(&cache->ro_list); 8590 spin_unlock(&cache->lock); 8591 spin_unlock(&sinfo->lock); 8592 } 8593 8594 /* 8595 * checks to see if its even possible to relocate this block group. 8596 * 8597 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 8598 * ok to go ahead and try. 8599 */ 8600 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 8601 { 8602 struct btrfs_block_group_cache *block_group; 8603 struct btrfs_space_info *space_info; 8604 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 8605 struct btrfs_device *device; 8606 struct btrfs_trans_handle *trans; 8607 u64 min_free; 8608 u64 dev_min = 1; 8609 u64 dev_nr = 0; 8610 u64 target; 8611 int index; 8612 int full = 0; 8613 int ret = 0; 8614 8615 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 8616 8617 /* odd, couldn't find the block group, leave it alone */ 8618 if (!block_group) 8619 return -1; 8620 8621 min_free = btrfs_block_group_used(&block_group->item); 8622 8623 /* no bytes used, we're good */ 8624 if (!min_free) 8625 goto out; 8626 8627 space_info = block_group->space_info; 8628 spin_lock(&space_info->lock); 8629 8630 full = space_info->full; 8631 8632 /* 8633 * if this is the last block group we have in this space, we can't 8634 * relocate it unless we're able to allocate a new chunk below. 8635 * 8636 * Otherwise, we need to make sure we have room in the space to handle 8637 * all of the extents from this block group. If we can, we're good 8638 */ 8639 if ((space_info->total_bytes != block_group->key.offset) && 8640 (space_info->bytes_used + space_info->bytes_reserved + 8641 space_info->bytes_pinned + space_info->bytes_readonly + 8642 min_free < space_info->total_bytes)) { 8643 spin_unlock(&space_info->lock); 8644 goto out; 8645 } 8646 spin_unlock(&space_info->lock); 8647 8648 /* 8649 * ok we don't have enough space, but maybe we have free space on our 8650 * devices to allocate new chunks for relocation, so loop through our 8651 * alloc devices and guess if we have enough space. 
if this block 8652 * group is going to be restriped, run checks against the target 8653 * profile instead of the current one. 8654 */ 8655 ret = -1; 8656 8657 /* 8658 * index: 8659 * 0: raid10 8660 * 1: raid1 8661 * 2: dup 8662 * 3: raid0 8663 * 4: single 8664 */ 8665 target = get_restripe_target(root->fs_info, block_group->flags); 8666 if (target) { 8667 index = __get_raid_index(extended_to_chunk(target)); 8668 } else { 8669 /* 8670 * this is just a balance, so if we were marked as full 8671 * we know there is no space for a new chunk 8672 */ 8673 if (full) 8674 goto out; 8675 8676 index = get_block_group_index(block_group); 8677 } 8678 8679 if (index == BTRFS_RAID_RAID10) { 8680 dev_min = 4; 8681 /* Divide by 2 */ 8682 min_free >>= 1; 8683 } else if (index == BTRFS_RAID_RAID1) { 8684 dev_min = 2; 8685 } else if (index == BTRFS_RAID_DUP) { 8686 /* Multiply by 2 */ 8687 min_free <<= 1; 8688 } else if (index == BTRFS_RAID_RAID0) { 8689 dev_min = fs_devices->rw_devices; 8690 do_div(min_free, dev_min); 8691 } 8692 8693 /* We need to do this so that we can look at pending chunks */ 8694 trans = btrfs_join_transaction(root); 8695 if (IS_ERR(trans)) { 8696 ret = PTR_ERR(trans); 8697 goto out; 8698 } 8699 8700 mutex_lock(&root->fs_info->chunk_mutex); 8701 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8702 u64 dev_offset; 8703 8704 /* 8705 * check to make sure we can actually find a chunk with enough 8706 * space to fit our block group in. 8707 */ 8708 if (device->total_bytes > device->bytes_used + min_free && 8709 !device->is_tgtdev_for_dev_replace) { 8710 ret = find_free_dev_extent(trans, device, min_free, 8711 &dev_offset, NULL); 8712 if (!ret) 8713 dev_nr++; 8714 8715 if (dev_nr >= dev_min) 8716 break; 8717 8718 ret = -1; 8719 } 8720 } 8721 mutex_unlock(&root->fs_info->chunk_mutex); 8722 btrfs_end_transaction(trans, root); 8723 out: 8724 btrfs_put_block_group(block_group); 8725 return ret; 8726 } 8727 8728 static int find_first_block_group(struct btrfs_root *root, 8729 struct btrfs_path *path, struct btrfs_key *key) 8730 { 8731 int ret = 0; 8732 struct btrfs_key found_key; 8733 struct extent_buffer *leaf; 8734 int slot; 8735 8736 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 8737 if (ret < 0) 8738 goto out; 8739 8740 while (1) { 8741 slot = path->slots[0]; 8742 leaf = path->nodes[0]; 8743 if (slot >= btrfs_header_nritems(leaf)) { 8744 ret = btrfs_next_leaf(root, path); 8745 if (ret == 0) 8746 continue; 8747 if (ret < 0) 8748 goto out; 8749 break; 8750 } 8751 btrfs_item_key_to_cpu(leaf, &found_key, slot); 8752 8753 if (found_key.objectid >= key->objectid && 8754 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 8755 ret = 0; 8756 goto out; 8757 } 8758 path->slots[0]++; 8759 } 8760 out: 8761 return ret; 8762 } 8763 8764 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 8765 { 8766 struct btrfs_block_group_cache *block_group; 8767 u64 last = 0; 8768 8769 while (1) { 8770 struct inode *inode; 8771 8772 block_group = btrfs_lookup_first_block_group(info, last); 8773 while (block_group) { 8774 spin_lock(&block_group->lock); 8775 if (block_group->iref) 8776 break; 8777 spin_unlock(&block_group->lock); 8778 block_group = next_block_group(info->tree_root, 8779 block_group); 8780 } 8781 if (!block_group) { 8782 if (last == 0) 8783 break; 8784 last = 0; 8785 continue; 8786 } 8787 8788 inode = block_group->inode; 8789 block_group->iref = 0; 8790 block_group->inode = NULL; 8791 spin_unlock(&block_group->lock); 8792 iput(inode); 8793 last = block_group->key.objectid 
+ block_group->key.offset; 8794 btrfs_put_block_group(block_group); 8795 } 8796 } 8797 8798 int btrfs_free_block_groups(struct btrfs_fs_info *info) 8799 { 8800 struct btrfs_block_group_cache *block_group; 8801 struct btrfs_space_info *space_info; 8802 struct btrfs_caching_control *caching_ctl; 8803 struct rb_node *n; 8804 8805 down_write(&info->commit_root_sem); 8806 while (!list_empty(&info->caching_block_groups)) { 8807 caching_ctl = list_entry(info->caching_block_groups.next, 8808 struct btrfs_caching_control, list); 8809 list_del(&caching_ctl->list); 8810 put_caching_control(caching_ctl); 8811 } 8812 up_write(&info->commit_root_sem); 8813 8814 spin_lock(&info->unused_bgs_lock); 8815 while (!list_empty(&info->unused_bgs)) { 8816 block_group = list_first_entry(&info->unused_bgs, 8817 struct btrfs_block_group_cache, 8818 bg_list); 8819 list_del_init(&block_group->bg_list); 8820 btrfs_put_block_group(block_group); 8821 } 8822 spin_unlock(&info->unused_bgs_lock); 8823 8824 spin_lock(&info->block_group_cache_lock); 8825 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8826 block_group = rb_entry(n, struct btrfs_block_group_cache, 8827 cache_node); 8828 rb_erase(&block_group->cache_node, 8829 &info->block_group_cache_tree); 8830 RB_CLEAR_NODE(&block_group->cache_node); 8831 spin_unlock(&info->block_group_cache_lock); 8832 8833 down_write(&block_group->space_info->groups_sem); 8834 list_del(&block_group->list); 8835 up_write(&block_group->space_info->groups_sem); 8836 8837 if (block_group->cached == BTRFS_CACHE_STARTED) 8838 wait_block_group_cache_done(block_group); 8839 8840 /* 8841 * We haven't cached this block group, which means we could 8842 * possibly have excluded extents on this block group. 8843 */ 8844 if (block_group->cached == BTRFS_CACHE_NO || 8845 block_group->cached == BTRFS_CACHE_ERROR) 8846 free_excluded_extents(info->extent_root, block_group); 8847 8848 btrfs_remove_free_space_cache(block_group); 8849 btrfs_put_block_group(block_group); 8850 8851 spin_lock(&info->block_group_cache_lock); 8852 } 8853 spin_unlock(&info->block_group_cache_lock); 8854 8855 /* now that all the block groups are freed, go through and 8856 * free all the space_info structs. This is only called during 8857 * the final stages of unmount, and so we know nobody is 8858 * using them. We call synchronize_rcu() once before we start, 8859 * just to be on the safe side. 
8860 */ 8861 synchronize_rcu(); 8862 8863 release_global_block_rsv(info); 8864 8865 while (!list_empty(&info->space_info)) { 8866 int i; 8867 8868 space_info = list_entry(info->space_info.next, 8869 struct btrfs_space_info, 8870 list); 8871 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { 8872 if (WARN_ON(space_info->bytes_pinned > 0 || 8873 space_info->bytes_reserved > 0 || 8874 space_info->bytes_may_use > 0)) { 8875 dump_space_info(space_info, 0, 0); 8876 } 8877 } 8878 list_del(&space_info->list); 8879 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 8880 struct kobject *kobj; 8881 kobj = space_info->block_group_kobjs[i]; 8882 space_info->block_group_kobjs[i] = NULL; 8883 if (kobj) { 8884 kobject_del(kobj); 8885 kobject_put(kobj); 8886 } 8887 } 8888 kobject_del(&space_info->kobj); 8889 kobject_put(&space_info->kobj); 8890 } 8891 return 0; 8892 } 8893 8894 static void __link_block_group(struct btrfs_space_info *space_info, 8895 struct btrfs_block_group_cache *cache) 8896 { 8897 int index = get_block_group_index(cache); 8898 bool first = false; 8899 8900 down_write(&space_info->groups_sem); 8901 if (list_empty(&space_info->block_groups[index])) 8902 first = true; 8903 list_add_tail(&cache->list, &space_info->block_groups[index]); 8904 up_write(&space_info->groups_sem); 8905 8906 if (first) { 8907 struct raid_kobject *rkobj; 8908 int ret; 8909 8910 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 8911 if (!rkobj) 8912 goto out_err; 8913 rkobj->raid_type = index; 8914 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 8915 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 8916 "%s", get_raid_name(index)); 8917 if (ret) { 8918 kobject_put(&rkobj->kobj); 8919 goto out_err; 8920 } 8921 space_info->block_group_kobjs[index] = &rkobj->kobj; 8922 } 8923 8924 return; 8925 out_err: 8926 pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n"); 8927 } 8928 8929 static struct btrfs_block_group_cache * 8930 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) 8931 { 8932 struct btrfs_block_group_cache *cache; 8933 8934 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8935 if (!cache) 8936 return NULL; 8937 8938 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 8939 GFP_NOFS); 8940 if (!cache->free_space_ctl) { 8941 kfree(cache); 8942 return NULL; 8943 } 8944 8945 cache->key.objectid = start; 8946 cache->key.offset = size; 8947 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8948 8949 cache->sectorsize = root->sectorsize; 8950 cache->fs_info = root->fs_info; 8951 cache->full_stripe_len = btrfs_full_stripe_len(root, 8952 &root->fs_info->mapping_tree, 8953 start); 8954 atomic_set(&cache->count, 1); 8955 spin_lock_init(&cache->lock); 8956 init_rwsem(&cache->data_rwsem); 8957 INIT_LIST_HEAD(&cache->list); 8958 INIT_LIST_HEAD(&cache->cluster_list); 8959 INIT_LIST_HEAD(&cache->bg_list); 8960 INIT_LIST_HEAD(&cache->ro_list); 8961 INIT_LIST_HEAD(&cache->dirty_list); 8962 btrfs_init_free_space_ctl(cache); 8963 atomic_set(&cache->trimming, 0); 8964 8965 return cache; 8966 } 8967 8968 int btrfs_read_block_groups(struct btrfs_root *root) 8969 { 8970 struct btrfs_path *path; 8971 int ret; 8972 struct btrfs_block_group_cache *cache; 8973 struct btrfs_fs_info *info = root->fs_info; 8974 struct btrfs_space_info *space_info; 8975 struct btrfs_key key; 8976 struct btrfs_key found_key; 8977 struct extent_buffer *leaf; 8978 int need_clear = 0; 8979 u64 cache_gen; 8980 8981 root = info->extent_root; 8982 key.objectid = 0; 8983 key.offset = 0; 8984 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8985 path = btrfs_alloc_path(); 8986 if (!path) 8987 return -ENOMEM; 8988 path->reada = 1; 8989 8990 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 8991 if (btrfs_test_opt(root, SPACE_CACHE) && 8992 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 8993 need_clear = 1; 8994 if (btrfs_test_opt(root, CLEAR_CACHE)) 8995 need_clear = 1; 8996 8997 while (1) { 8998 ret = find_first_block_group(root, path, &key); 8999 if (ret > 0) 9000 break; 9001 if (ret != 0) 9002 goto error; 9003 9004 leaf = path->nodes[0]; 9005 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 9006 9007 cache = btrfs_create_block_group_cache(root, found_key.objectid, 9008 found_key.offset); 9009 if (!cache) { 9010 ret = -ENOMEM; 9011 goto error; 9012 } 9013 9014 if (need_clear) { 9015 /* 9016 * When we mount with old space cache, we need to 9017 * set BTRFS_DC_CLEAR and set dirty flag. 9018 * 9019 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 9020 * truncate the old free space cache inode and 9021 * setup a new one. 9022 * b) Setting 'dirty flag' makes sure that we flush 9023 * the new space cache info onto disk. 9024 */ 9025 if (btrfs_test_opt(root, SPACE_CACHE)) 9026 cache->disk_cache_state = BTRFS_DC_CLEAR; 9027 } 9028 9029 read_extent_buffer(leaf, &cache->item, 9030 btrfs_item_ptr_offset(leaf, path->slots[0]), 9031 sizeof(cache->item)); 9032 cache->flags = btrfs_block_group_flags(&cache->item); 9033 9034 key.objectid = found_key.objectid + found_key.offset; 9035 btrfs_release_path(path); 9036 9037 /* 9038 * We need to exclude the super stripes now so that the space 9039 * info has super bytes accounted for, otherwise we'll think 9040 * we have more space than we actually do. 
9041 */ 9042 ret = exclude_super_stripes(root, cache); 9043 if (ret) { 9044 /* 9045 * We may have excluded something, so call this just in 9046 * case. 9047 */ 9048 free_excluded_extents(root, cache); 9049 btrfs_put_block_group(cache); 9050 goto error; 9051 } 9052 9053 /* 9054 * check for two cases, either we are full, and therefore 9055 * don't need to bother with the caching work since we won't 9056 * find any space, or we are empty, and we can just add all 9057 * the space in and be done with it. This saves us _alot_ of 9058 * time, particularly in the full case. 9059 */ 9060 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 9061 cache->last_byte_to_unpin = (u64)-1; 9062 cache->cached = BTRFS_CACHE_FINISHED; 9063 free_excluded_extents(root, cache); 9064 } else if (btrfs_block_group_used(&cache->item) == 0) { 9065 cache->last_byte_to_unpin = (u64)-1; 9066 cache->cached = BTRFS_CACHE_FINISHED; 9067 add_new_free_space(cache, root->fs_info, 9068 found_key.objectid, 9069 found_key.objectid + 9070 found_key.offset); 9071 free_excluded_extents(root, cache); 9072 } 9073 9074 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9075 if (ret) { 9076 btrfs_remove_free_space_cache(cache); 9077 btrfs_put_block_group(cache); 9078 goto error; 9079 } 9080 9081 ret = update_space_info(info, cache->flags, found_key.offset, 9082 btrfs_block_group_used(&cache->item), 9083 &space_info); 9084 if (ret) { 9085 btrfs_remove_free_space_cache(cache); 9086 spin_lock(&info->block_group_cache_lock); 9087 rb_erase(&cache->cache_node, 9088 &info->block_group_cache_tree); 9089 RB_CLEAR_NODE(&cache->cache_node); 9090 spin_unlock(&info->block_group_cache_lock); 9091 btrfs_put_block_group(cache); 9092 goto error; 9093 } 9094 9095 cache->space_info = space_info; 9096 spin_lock(&cache->space_info->lock); 9097 cache->space_info->bytes_readonly += cache->bytes_super; 9098 spin_unlock(&cache->space_info->lock); 9099 9100 __link_block_group(space_info, cache); 9101 9102 set_avail_alloc_bits(root->fs_info, cache->flags); 9103 if (btrfs_chunk_readonly(root, cache->key.objectid)) { 9104 set_block_group_ro(cache, 1); 9105 } else if (btrfs_block_group_used(&cache->item) == 0) { 9106 spin_lock(&info->unused_bgs_lock); 9107 /* Should always be true but just in case. */ 9108 if (list_empty(&cache->bg_list)) { 9109 btrfs_get_block_group(cache); 9110 list_add_tail(&cache->bg_list, 9111 &info->unused_bgs); 9112 } 9113 spin_unlock(&info->unused_bgs_lock); 9114 } 9115 } 9116 9117 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 9118 if (!(get_alloc_profile(root, space_info->flags) & 9119 (BTRFS_BLOCK_GROUP_RAID10 | 9120 BTRFS_BLOCK_GROUP_RAID1 | 9121 BTRFS_BLOCK_GROUP_RAID5 | 9122 BTRFS_BLOCK_GROUP_RAID6 | 9123 BTRFS_BLOCK_GROUP_DUP))) 9124 continue; 9125 /* 9126 * avoid allocating from un-mirrored block group if there are 9127 * mirrored block groups. 
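 * concretely, the RAID0 and SINGLE block groups of any space_info that
 * also carries a mirrored, parity or dup profile are flipped read-only
 * below so that new allocations land in the redundant block groups.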
9128 */ 9129 list_for_each_entry(cache, 9130 &space_info->block_groups[BTRFS_RAID_RAID0], 9131 list) 9132 set_block_group_ro(cache, 1); 9133 list_for_each_entry(cache, 9134 &space_info->block_groups[BTRFS_RAID_SINGLE], 9135 list) 9136 set_block_group_ro(cache, 1); 9137 } 9138 9139 init_global_block_rsv(info); 9140 ret = 0; 9141 error: 9142 btrfs_free_path(path); 9143 return ret; 9144 } 9145 9146 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 9147 struct btrfs_root *root) 9148 { 9149 struct btrfs_block_group_cache *block_group, *tmp; 9150 struct btrfs_root *extent_root = root->fs_info->extent_root; 9151 struct btrfs_block_group_item item; 9152 struct btrfs_key key; 9153 int ret = 0; 9154 9155 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 9156 if (ret) 9157 goto next; 9158 9159 spin_lock(&block_group->lock); 9160 memcpy(&item, &block_group->item, sizeof(item)); 9161 memcpy(&key, &block_group->key, sizeof(key)); 9162 spin_unlock(&block_group->lock); 9163 9164 ret = btrfs_insert_item(trans, extent_root, &key, &item, 9165 sizeof(item)); 9166 if (ret) 9167 btrfs_abort_transaction(trans, extent_root, ret); 9168 ret = btrfs_finish_chunk_alloc(trans, extent_root, 9169 key.objectid, key.offset); 9170 if (ret) 9171 btrfs_abort_transaction(trans, extent_root, ret); 9172 next: 9173 list_del_init(&block_group->bg_list); 9174 } 9175 } 9176 9177 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 9178 struct btrfs_root *root, u64 bytes_used, 9179 u64 type, u64 chunk_objectid, u64 chunk_offset, 9180 u64 size) 9181 { 9182 int ret; 9183 struct btrfs_root *extent_root; 9184 struct btrfs_block_group_cache *cache; 9185 9186 extent_root = root->fs_info->extent_root; 9187 9188 btrfs_set_log_full_commit(root->fs_info, trans); 9189 9190 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 9191 if (!cache) 9192 return -ENOMEM; 9193 9194 btrfs_set_block_group_used(&cache->item, bytes_used); 9195 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 9196 btrfs_set_block_group_flags(&cache->item, type); 9197 9198 cache->flags = type; 9199 cache->last_byte_to_unpin = (u64)-1; 9200 cache->cached = BTRFS_CACHE_FINISHED; 9201 ret = exclude_super_stripes(root, cache); 9202 if (ret) { 9203 /* 9204 * We may have excluded something, so call this just in 9205 * case. 
9206 */ 9207 free_excluded_extents(root, cache); 9208 btrfs_put_block_group(cache); 9209 return ret; 9210 } 9211 9212 add_new_free_space(cache, root->fs_info, chunk_offset, 9213 chunk_offset + size); 9214 9215 free_excluded_extents(root, cache); 9216 9217 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9218 if (ret) { 9219 btrfs_remove_free_space_cache(cache); 9220 btrfs_put_block_group(cache); 9221 return ret; 9222 } 9223 9224 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 9225 &cache->space_info); 9226 if (ret) { 9227 btrfs_remove_free_space_cache(cache); 9228 spin_lock(&root->fs_info->block_group_cache_lock); 9229 rb_erase(&cache->cache_node, 9230 &root->fs_info->block_group_cache_tree); 9231 RB_CLEAR_NODE(&cache->cache_node); 9232 spin_unlock(&root->fs_info->block_group_cache_lock); 9233 btrfs_put_block_group(cache); 9234 return ret; 9235 } 9236 update_global_block_rsv(root->fs_info); 9237 9238 spin_lock(&cache->space_info->lock); 9239 cache->space_info->bytes_readonly += cache->bytes_super; 9240 spin_unlock(&cache->space_info->lock); 9241 9242 __link_block_group(cache->space_info, cache); 9243 9244 list_add_tail(&cache->bg_list, &trans->new_bgs); 9245 9246 set_avail_alloc_bits(extent_root->fs_info, type); 9247 9248 return 0; 9249 } 9250 9251 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 9252 { 9253 u64 extra_flags = chunk_to_extended(flags) & 9254 BTRFS_EXTENDED_PROFILE_MASK; 9255 9256 write_seqlock(&fs_info->profiles_lock); 9257 if (flags & BTRFS_BLOCK_GROUP_DATA) 9258 fs_info->avail_data_alloc_bits &= ~extra_flags; 9259 if (flags & BTRFS_BLOCK_GROUP_METADATA) 9260 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 9261 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 9262 fs_info->avail_system_alloc_bits &= ~extra_flags; 9263 write_sequnlock(&fs_info->profiles_lock); 9264 } 9265 9266 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 9267 struct btrfs_root *root, u64 group_start, 9268 struct extent_map *em) 9269 { 9270 struct btrfs_path *path; 9271 struct btrfs_block_group_cache *block_group; 9272 struct btrfs_free_cluster *cluster; 9273 struct btrfs_root *tree_root = root->fs_info->tree_root; 9274 struct btrfs_key key; 9275 struct inode *inode; 9276 struct kobject *kobj = NULL; 9277 int ret; 9278 int index; 9279 int factor; 9280 struct btrfs_caching_control *caching_ctl = NULL; 9281 bool remove_em; 9282 9283 root = root->fs_info->extent_root; 9284 9285 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 9286 BUG_ON(!block_group); 9287 BUG_ON(!block_group->ro); 9288 9289 /* 9290 * Free the reserved super bytes from this block group before 9291 * remove it. 
9292 */ 9293 free_excluded_extents(root, block_group); 9294 9295 memcpy(&key, &block_group->key, sizeof(key)); 9296 index = get_block_group_index(block_group); 9297 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 9298 BTRFS_BLOCK_GROUP_RAID1 | 9299 BTRFS_BLOCK_GROUP_RAID10)) 9300 factor = 2; 9301 else 9302 factor = 1; 9303 9304 /* make sure this block group isn't part of an allocation cluster */ 9305 cluster = &root->fs_info->data_alloc_cluster; 9306 spin_lock(&cluster->refill_lock); 9307 btrfs_return_cluster_to_free_space(block_group, cluster); 9308 spin_unlock(&cluster->refill_lock); 9309 9310 /* 9311 * make sure this block group isn't part of a metadata 9312 * allocation cluster 9313 */ 9314 cluster = &root->fs_info->meta_alloc_cluster; 9315 spin_lock(&cluster->refill_lock); 9316 btrfs_return_cluster_to_free_space(block_group, cluster); 9317 spin_unlock(&cluster->refill_lock); 9318 9319 path = btrfs_alloc_path(); 9320 if (!path) { 9321 ret = -ENOMEM; 9322 goto out; 9323 } 9324 9325 inode = lookup_free_space_inode(tree_root, block_group, path); 9326 if (!IS_ERR(inode)) { 9327 ret = btrfs_orphan_add(trans, inode); 9328 if (ret) { 9329 btrfs_add_delayed_iput(inode); 9330 goto out; 9331 } 9332 clear_nlink(inode); 9333 /* One for the block groups ref */ 9334 spin_lock(&block_group->lock); 9335 if (block_group->iref) { 9336 block_group->iref = 0; 9337 block_group->inode = NULL; 9338 spin_unlock(&block_group->lock); 9339 iput(inode); 9340 } else { 9341 spin_unlock(&block_group->lock); 9342 } 9343 /* One for our lookup ref */ 9344 btrfs_add_delayed_iput(inode); 9345 } 9346 9347 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 9348 key.offset = block_group->key.objectid; 9349 key.type = 0; 9350 9351 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 9352 if (ret < 0) 9353 goto out; 9354 if (ret > 0) 9355 btrfs_release_path(path); 9356 if (ret == 0) { 9357 ret = btrfs_del_item(trans, tree_root, path); 9358 if (ret) 9359 goto out; 9360 btrfs_release_path(path); 9361 } 9362 9363 spin_lock(&root->fs_info->block_group_cache_lock); 9364 rb_erase(&block_group->cache_node, 9365 &root->fs_info->block_group_cache_tree); 9366 RB_CLEAR_NODE(&block_group->cache_node); 9367 9368 if (root->fs_info->first_logical_byte == block_group->key.objectid) 9369 root->fs_info->first_logical_byte = (u64)-1; 9370 spin_unlock(&root->fs_info->block_group_cache_lock); 9371 9372 down_write(&block_group->space_info->groups_sem); 9373 /* 9374 * we must use list_del_init so people can check to see if they 9375 * are still on the list after taking the semaphore 9376 */ 9377 list_del_init(&block_group->list); 9378 if (list_empty(&block_group->space_info->block_groups[index])) { 9379 kobj = block_group->space_info->block_group_kobjs[index]; 9380 block_group->space_info->block_group_kobjs[index] = NULL; 9381 clear_avail_alloc_bits(root->fs_info, block_group->flags); 9382 } 9383 up_write(&block_group->space_info->groups_sem); 9384 if (kobj) { 9385 kobject_del(kobj); 9386 kobject_put(kobj); 9387 } 9388 9389 if (block_group->has_caching_ctl) 9390 caching_ctl = get_caching_control(block_group); 9391 if (block_group->cached == BTRFS_CACHE_STARTED) 9392 wait_block_group_cache_done(block_group); 9393 if (block_group->has_caching_ctl) { 9394 down_write(&root->fs_info->commit_root_sem); 9395 if (!caching_ctl) { 9396 struct btrfs_caching_control *ctl; 9397 9398 list_for_each_entry(ctl, 9399 &root->fs_info->caching_block_groups, list) 9400 if (ctl->block_group == block_group) { 9401 caching_ctl = ctl; 9402 atomic_inc(&caching_ctl->count); 
9403 break; 9404 } 9405 } 9406 if (caching_ctl) 9407 list_del_init(&caching_ctl->list); 9408 up_write(&root->fs_info->commit_root_sem); 9409 if (caching_ctl) { 9410 /* Once for the caching bgs list and once for us. */ 9411 put_caching_control(caching_ctl); 9412 put_caching_control(caching_ctl); 9413 } 9414 } 9415 9416 spin_lock(&trans->transaction->dirty_bgs_lock); 9417 if (!list_empty(&block_group->dirty_list)) { 9418 list_del_init(&block_group->dirty_list); 9419 btrfs_put_block_group(block_group); 9420 } 9421 spin_unlock(&trans->transaction->dirty_bgs_lock); 9422 9423 btrfs_remove_free_space_cache(block_group); 9424 9425 spin_lock(&block_group->space_info->lock); 9426 list_del_init(&block_group->ro_list); 9427 block_group->space_info->total_bytes -= block_group->key.offset; 9428 block_group->space_info->bytes_readonly -= block_group->key.offset; 9429 block_group->space_info->disk_total -= block_group->key.offset * factor; 9430 spin_unlock(&block_group->space_info->lock); 9431 9432 memcpy(&key, &block_group->key, sizeof(key)); 9433 9434 lock_chunks(root); 9435 if (!list_empty(&em->list)) { 9436 /* We're in the transaction->pending_chunks list. */ 9437 free_extent_map(em); 9438 } 9439 spin_lock(&block_group->lock); 9440 block_group->removed = 1; 9441 /* 9442 * At this point trimming can't start on this block group, because we 9443 * removed the block group from the tree fs_info->block_group_cache_tree 9444 * so no one can't find it anymore and even if someone already got this 9445 * block group before we removed it from the rbtree, they have already 9446 * incremented block_group->trimming - if they didn't, they won't find 9447 * any free space entries because we already removed them all when we 9448 * called btrfs_remove_free_space_cache(). 9449 * 9450 * And we must not remove the extent map from the fs_info->mapping_tree 9451 * to prevent the same logical address range and physical device space 9452 * ranges from being reused for a new block group. This is because our 9453 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 9454 * completely transactionless, so while it is trimming a range the 9455 * currently running transaction might finish and a new one start, 9456 * allowing for new block groups to be created that can reuse the same 9457 * physical device locations unless we take this special care. 9458 */ 9459 remove_em = (atomic_read(&block_group->trimming) == 0); 9460 /* 9461 * Make sure a trimmer task always sees the em in the pinned_chunks list 9462 * if it sees block_group->removed == 1 (needs to lock block_group->lock 9463 * before checking block_group->removed). 9464 */ 9465 if (!remove_em) { 9466 /* 9467 * Our em might be in trans->transaction->pending_chunks which 9468 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 9469 * and so is the fs_info->pinned_chunks list. 9470 * 9471 * So at this point we must be holding the chunk_mutex to avoid 9472 * any races with chunk allocation (more specifically at 9473 * volumes.c:contains_pending_extent()), to ensure it always 9474 * sees the em, either in the pending_chunks list or in the 9475 * pinned_chunks list. 
9476 */ 9477 list_move_tail(&em->list, &root->fs_info->pinned_chunks); 9478 } 9479 spin_unlock(&block_group->lock); 9480 9481 if (remove_em) { 9482 struct extent_map_tree *em_tree; 9483 9484 em_tree = &root->fs_info->mapping_tree.map_tree; 9485 write_lock(&em_tree->lock); 9486 /* 9487 * The em might be in the pending_chunks list, so make sure the 9488 * chunk mutex is locked, since remove_extent_mapping() will 9489 * delete us from that list. 9490 */ 9491 remove_extent_mapping(em_tree, em); 9492 write_unlock(&em_tree->lock); 9493 /* once for the tree */ 9494 free_extent_map(em); 9495 } 9496 9497 unlock_chunks(root); 9498 9499 btrfs_put_block_group(block_group); 9500 btrfs_put_block_group(block_group); 9501 9502 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 9503 if (ret > 0) 9504 ret = -EIO; 9505 if (ret < 0) 9506 goto out; 9507 9508 ret = btrfs_del_item(trans, root, path); 9509 out: 9510 btrfs_free_path(path); 9511 return ret; 9512 } 9513 9514 /* 9515 * Process the unused_bgs list and remove any that don't have any allocated 9516 * space inside of them. 9517 */ 9518 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 9519 { 9520 struct btrfs_block_group_cache *block_group; 9521 struct btrfs_space_info *space_info; 9522 struct btrfs_root *root = fs_info->extent_root; 9523 struct btrfs_trans_handle *trans; 9524 int ret = 0; 9525 9526 if (!fs_info->open) 9527 return; 9528 9529 spin_lock(&fs_info->unused_bgs_lock); 9530 while (!list_empty(&fs_info->unused_bgs)) { 9531 u64 start, end; 9532 9533 block_group = list_first_entry(&fs_info->unused_bgs, 9534 struct btrfs_block_group_cache, 9535 bg_list); 9536 space_info = block_group->space_info; 9537 list_del_init(&block_group->bg_list); 9538 if (ret || btrfs_mixed_space_info(space_info)) { 9539 btrfs_put_block_group(block_group); 9540 continue; 9541 } 9542 spin_unlock(&fs_info->unused_bgs_lock); 9543 9544 /* Don't want to race with allocators so take the groups_sem */ 9545 down_write(&space_info->groups_sem); 9546 spin_lock(&block_group->lock); 9547 if (block_group->reserved || 9548 btrfs_block_group_used(&block_group->item) || 9549 block_group->ro) { 9550 /* 9551 * We want to bail if we made new allocations or have 9552 * outstanding allocations in this block group. We do 9553 * the ro check in case balance is currently acting on 9554 * this block group. 9555 */ 9556 spin_unlock(&block_group->lock); 9557 up_write(&space_info->groups_sem); 9558 goto next; 9559 } 9560 spin_unlock(&block_group->lock); 9561 9562 /* We don't want to force the issue, only flip if it's ok. */ 9563 ret = set_block_group_ro(block_group, 0); 9564 up_write(&space_info->groups_sem); 9565 if (ret < 0) { 9566 ret = 0; 9567 goto next; 9568 } 9569 9570 /* 9571 * Want to do this before we do anything else so we can recover 9572 * properly if we fail to join the transaction. 9573 */ 9574 /* 1 for btrfs_orphan_reserve_metadata() */ 9575 trans = btrfs_start_transaction(root, 1); 9576 if (IS_ERR(trans)) { 9577 btrfs_set_block_group_rw(root, block_group); 9578 ret = PTR_ERR(trans); 9579 goto next; 9580 } 9581 9582 /* 9583 * We could have pending pinned extents for this block group, 9584 * just delete them, we don't care about them anymore. 9585 */ 9586 start = block_group->key.objectid; 9587 end = start + block_group->key.offset - 1; 9588 /* 9589 * Hold the unused_bg_unpin_mutex lock to avoid racing with 9590 * btrfs_finish_extent_commit(). 
		/*
		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
		 * btrfs_finish_extent_commit(). If we are at transaction N,
		 * another task might be running finish_extent_commit() for the
		 * previous transaction N - 1, and have seen a range belonging
		 * to the block group in freed_extents[] before we were able to
		 * clear the whole block group range from freed_extents[]. This
		 * means that task can look up the block group after we
		 * unpinned it from freed_extents[] and removed it, leading to
		 * a BUG_ON() at btrfs_unpin_extent_range().
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
					EXTENT_DIRTY, GFP_NOFS);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_set_block_group_rw(root, block_group);
			goto end_trans;
		}
		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
					EXTENT_DIRTY, GFP_NOFS);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_set_block_group_rw(root, block_group);
			goto end_trans;
		}
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);

		/* Reset pinned so btrfs_put_block_group() doesn't complain. */
		block_group->pinned = 0;

		/*
		 * btrfs_remove_chunk() will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, root,
					 block_group->key.objectid);
end_trans:
		btrfs_end_transaction(trans, root);
next:
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

/*
 * Create the basic space_info entries: one for system chunks, plus either a
 * single mixed metadata+data entry or separate metadata and data entries,
 * depending on the MIXED_GROUPS incompat flag in the super block.
 */
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return 1;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	}
out:
	return ret;
}

int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
	return unpin_extent_range(root, start, end, false);
}
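
/*
 * Illustrative sketch only, not used by the code in this file: the usual way
 * btrfs_trim_fs() below is reached is through the generic FITRIM ioctl, which
 * the btrfs ioctl code (btrfs_ioctl_fitrim(), mentioned in the comments
 * above) turns into a struct fstrim_range for us. A minimal user space caller
 * could look roughly like this; the mount point path is made up for the
 * example:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>		FITRIM, struct fstrim_range
 *
 *	struct fstrim_range range;
 *	int fd = open("/mnt/btrfs", O_RDONLY);
 *
 *	memset(&range, 0, sizeof(range));
 *	range.len = (__u64)-1;		trim as much as possible
 *	range.minlen = 0;
 *	if (fd >= 0 && ioctl(fd, FITRIM, &range) == 0)
 *		printf("trimmed %llu bytes\n",
 *		       (unsigned long long)range.len);
 *
 * On return, range.len carries the total number of bytes trimmed, which is
 * what the "range->len = trimmed" assignment at the end of btrfs_trim_fs()
 * reports back.
 */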
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret = 0;

	/*
	 * Try to trim all of the FS space. Our first block group may start at
	 * a non-zero offset, so don't require an exact block group match when
	 * the range spans the whole filesystem.
	 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
				ret = wait_block_group_cache_done(cache);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	range->len = trimmed;
	return ret;
}

/*
 * btrfs_{start,end}_write_no_snapshoting() are similar to
 * mnt_{want,drop}_write(). They are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted
 * and then flushing that data to disk only after the snapshot was created,
 * and to prevent operations while a snapshot is being taken that would make
 * the snapshot inconsistent (writes followed by expanding truncates, for
 * example).
 */
void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	/*
	 * Make sure the counter is updated before we wake up waiters.
	 */
	smp_mb();
	if (waitqueue_active(&root->subv_writers->wait))
		wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshoted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure the counter is updated before we check for snapshot
	 * creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshoted)) {
		btrfs_end_write_no_snapshoting(root);
		return 0;
	}
	return 1;
}
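
/*
 * Illustrative sketch only: the intended pairing of the two helpers above,
 * seen from a writer's point of view. The do_nocow_write() name is made up
 * for this example; the real callers live in the btrfs write paths outside
 * of this file.
 *
 *	if (!btrfs_start_write_no_snapshoting(root)) {
 *		a snapshot is about to be taken - fall back to the COW path
 *	} else {
 *		do_nocow_write();
 *		btrfs_end_write_no_snapshoting(root);
 *	}
 *
 * A return value of 0 from btrfs_start_write_no_snapshoting() means a
 * snapshot of the subvolume is pending and the nocow write must not go
 * ahead; a return value of 1 means the write may proceed, and the caller
 * must then call btrfs_end_write_no_snapshoting() so that any task waiting
 * on subv_writers->wait (the snapshot creation side) can make progress.
 */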