1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/pagemap.h> 20 #include <linux/writeback.h> 21 #include <linux/blkdev.h> 22 #include <linux/sort.h> 23 #include <linux/rcupdate.h> 24 #include <linux/kthread.h> 25 #include <linux/slab.h> 26 #include <linux/ratelimit.h> 27 #include <linux/percpu_counter.h> 28 #include "hash.h" 29 #include "tree-log.h" 30 #include "disk-io.h" 31 #include "print-tree.h" 32 #include "volumes.h" 33 #include "raid56.h" 34 #include "locking.h" 35 #include "free-space-cache.h" 36 #include "math.h" 37 #include "sysfs.h" 38 #include "qgroup.h" 39 40 #undef SCRAMBLE_DELAYED_REFS 41 42 /* 43 * control flags for do_chunk_alloc's force field 44 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 45 * if we really need one. 46 * 47 * CHUNK_ALLOC_LIMITED means to only try and allocate one 48 * if we have very few chunks already allocated. This is 49 * used as part of the clustering code to help make sure 50 * we have a good pool of storage to cluster in, without 51 * filling the FS with empty chunks 52 * 53 * CHUNK_ALLOC_FORCE means it must try to allocate one 54 * 55 */ 56 enum { 57 CHUNK_ALLOC_NO_FORCE = 0, 58 CHUNK_ALLOC_LIMITED = 1, 59 CHUNK_ALLOC_FORCE = 2, 60 }; 61 62 /* 63 * Control how reservations are dealt with. 64 * 65 * RESERVE_FREE - freeing a reservation. 
66 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for 67 * ENOSPC accounting 68 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update 69 * bytes_may_use as the ENOSPC accounting is done elsewhere 70 */ 71 enum { 72 RESERVE_FREE = 0, 73 RESERVE_ALLOC = 1, 74 RESERVE_ALLOC_NO_ACCOUNT = 2, 75 }; 76 77 static int update_block_group(struct btrfs_trans_handle *trans, 78 struct btrfs_root *root, u64 bytenr, 79 u64 num_bytes, int alloc); 80 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 81 struct btrfs_root *root, 82 u64 bytenr, u64 num_bytes, u64 parent, 83 u64 root_objectid, u64 owner_objectid, 84 u64 owner_offset, int refs_to_drop, 85 struct btrfs_delayed_extent_op *extra_op, 86 int no_quota); 87 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 88 struct extent_buffer *leaf, 89 struct btrfs_extent_item *ei); 90 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 91 struct btrfs_root *root, 92 u64 parent, u64 root_objectid, 93 u64 flags, u64 owner, u64 offset, 94 struct btrfs_key *ins, int ref_mod); 95 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 96 struct btrfs_root *root, 97 u64 parent, u64 root_objectid, 98 u64 flags, struct btrfs_disk_key *key, 99 int level, struct btrfs_key *ins, 100 int no_quota); 101 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 102 struct btrfs_root *extent_root, u64 flags, 103 int force); 104 static int find_next_key(struct btrfs_path *path, int level, 105 struct btrfs_key *key); 106 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 107 int dump_block_groups); 108 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 109 u64 num_bytes, int reserve, 110 int delalloc); 111 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 112 u64 num_bytes); 113 int btrfs_pin_extent(struct btrfs_root *root, 114 u64 bytenr, u64 num_bytes, int reserved); 115 116 static noinline int 117 block_group_cache_done(struct btrfs_block_group_cache *cache) 118 { 119 smp_mb(); 120 return cache->cached == BTRFS_CACHE_FINISHED || 121 cache->cached == BTRFS_CACHE_ERROR; 122 } 123 124 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 125 { 126 return (cache->flags & bits) == bits; 127 } 128 129 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 130 { 131 atomic_inc(&cache->count); 132 } 133 134 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 135 { 136 if (atomic_dec_and_test(&cache->count)) { 137 WARN_ON(cache->pinned > 0); 138 WARN_ON(cache->reserved > 0); 139 kfree(cache->free_space_ctl); 140 kfree(cache); 141 } 142 } 143 144 /* 145 * this adds the block group to the fs_info rb tree for the block group 146 * cache 147 */ 148 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 149 struct btrfs_block_group_cache *block_group) 150 { 151 struct rb_node **p; 152 struct rb_node *parent = NULL; 153 struct btrfs_block_group_cache *cache; 154 155 spin_lock(&info->block_group_cache_lock); 156 p = &info->block_group_cache_tree.rb_node; 157 158 while (*p) { 159 parent = *p; 160 cache = rb_entry(parent, struct btrfs_block_group_cache, 161 cache_node); 162 if (block_group->key.objectid < cache->key.objectid) { 163 p = &(*p)->rb_left; 164 } else if (block_group->key.objectid > cache->key.objectid) { 165 p = &(*p)->rb_right; 166 } else { 167 spin_unlock(&info->block_group_cache_lock); 168 return -EEXIST; 169 } 170 } 171 172 
rb_link_node(&block_group->cache_node, parent, p); 173 rb_insert_color(&block_group->cache_node, 174 &info->block_group_cache_tree); 175 176 if (info->first_logical_byte > block_group->key.objectid) 177 info->first_logical_byte = block_group->key.objectid; 178 179 spin_unlock(&info->block_group_cache_lock); 180 181 return 0; 182 } 183 184 /* 185 * This will return the block group at or after bytenr if contains is 0, else 186 * it will return the block group that contains the bytenr 187 */ 188 static struct btrfs_block_group_cache * 189 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 190 int contains) 191 { 192 struct btrfs_block_group_cache *cache, *ret = NULL; 193 struct rb_node *n; 194 u64 end, start; 195 196 spin_lock(&info->block_group_cache_lock); 197 n = info->block_group_cache_tree.rb_node; 198 199 while (n) { 200 cache = rb_entry(n, struct btrfs_block_group_cache, 201 cache_node); 202 end = cache->key.objectid + cache->key.offset - 1; 203 start = cache->key.objectid; 204 205 if (bytenr < start) { 206 if (!contains && (!ret || start < ret->key.objectid)) 207 ret = cache; 208 n = n->rb_left; 209 } else if (bytenr > start) { 210 if (contains && bytenr <= end) { 211 ret = cache; 212 break; 213 } 214 n = n->rb_right; 215 } else { 216 ret = cache; 217 break; 218 } 219 } 220 if (ret) { 221 btrfs_get_block_group(ret); 222 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 223 info->first_logical_byte = ret->key.objectid; 224 } 225 spin_unlock(&info->block_group_cache_lock); 226 227 return ret; 228 } 229 230 static int add_excluded_extent(struct btrfs_root *root, 231 u64 start, u64 num_bytes) 232 { 233 u64 end = start + num_bytes - 1; 234 set_extent_bits(&root->fs_info->freed_extents[0], 235 start, end, EXTENT_UPTODATE, GFP_NOFS); 236 set_extent_bits(&root->fs_info->freed_extents[1], 237 start, end, EXTENT_UPTODATE, GFP_NOFS); 238 return 0; 239 } 240 241 static void free_excluded_extents(struct btrfs_root *root, 242 struct btrfs_block_group_cache *cache) 243 { 244 u64 start, end; 245 246 start = cache->key.objectid; 247 end = start + cache->key.offset - 1; 248 249 clear_extent_bits(&root->fs_info->freed_extents[0], 250 start, end, EXTENT_UPTODATE, GFP_NOFS); 251 clear_extent_bits(&root->fs_info->freed_extents[1], 252 start, end, EXTENT_UPTODATE, GFP_NOFS); 253 } 254 255 static int exclude_super_stripes(struct btrfs_root *root, 256 struct btrfs_block_group_cache *cache) 257 { 258 u64 bytenr; 259 u64 *logical; 260 int stripe_len; 261 int i, nr, ret; 262 263 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 264 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 265 cache->bytes_super += stripe_len; 266 ret = add_excluded_extent(root, cache->key.objectid, 267 stripe_len); 268 if (ret) 269 return ret; 270 } 271 272 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 273 bytenr = btrfs_sb_offset(i); 274 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 275 cache->key.objectid, bytenr, 276 0, &logical, &nr, &stripe_len); 277 if (ret) 278 return ret; 279 280 while (nr--) { 281 u64 start, len; 282 283 if (logical[nr] > cache->key.objectid + 284 cache->key.offset) 285 continue; 286 287 if (logical[nr] + stripe_len <= cache->key.objectid) 288 continue; 289 290 start = logical[nr]; 291 if (start < cache->key.objectid) { 292 start = cache->key.objectid; 293 len = (logical[nr] + stripe_len) - start; 294 } else { 295 len = min_t(u64, stripe_len, 296 cache->key.objectid + 297 cache->key.offset - start); 298 } 299 300 cache->bytes_super += len; 301 ret = 
add_excluded_extent(root, start, len); 302 if (ret) { 303 kfree(logical); 304 return ret; 305 } 306 } 307 308 kfree(logical); 309 } 310 return 0; 311 } 312 313 static struct btrfs_caching_control * 314 get_caching_control(struct btrfs_block_group_cache *cache) 315 { 316 struct btrfs_caching_control *ctl; 317 318 spin_lock(&cache->lock); 319 if (!cache->caching_ctl) { 320 spin_unlock(&cache->lock); 321 return NULL; 322 } 323 324 ctl = cache->caching_ctl; 325 atomic_inc(&ctl->count); 326 spin_unlock(&cache->lock); 327 return ctl; 328 } 329 330 static void put_caching_control(struct btrfs_caching_control *ctl) 331 { 332 if (atomic_dec_and_test(&ctl->count)) 333 kfree(ctl); 334 } 335 336 /* 337 * this is only called by cache_block_group, since we could have freed extents 338 * we need to check the pinned_extents for any extents that can't be used yet 339 * since their free space will be released as soon as the transaction commits. 340 */ 341 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 342 struct btrfs_fs_info *info, u64 start, u64 end) 343 { 344 u64 extent_start, extent_end, size, total_added = 0; 345 int ret; 346 347 while (start < end) { 348 ret = find_first_extent_bit(info->pinned_extents, start, 349 &extent_start, &extent_end, 350 EXTENT_DIRTY | EXTENT_UPTODATE, 351 NULL); 352 if (ret) 353 break; 354 355 if (extent_start <= start) { 356 start = extent_end + 1; 357 } else if (extent_start > start && extent_start < end) { 358 size = extent_start - start; 359 total_added += size; 360 ret = btrfs_add_free_space(block_group, start, 361 size); 362 BUG_ON(ret); /* -ENOMEM or logic error */ 363 start = extent_end + 1; 364 } else { 365 break; 366 } 367 } 368 369 if (start < end) { 370 size = end - start; 371 total_added += size; 372 ret = btrfs_add_free_space(block_group, start, size); 373 BUG_ON(ret); /* -ENOMEM or logic error */ 374 } 375 376 return total_added; 377 } 378 379 static noinline void caching_thread(struct btrfs_work *work) 380 { 381 struct btrfs_block_group_cache *block_group; 382 struct btrfs_fs_info *fs_info; 383 struct btrfs_caching_control *caching_ctl; 384 struct btrfs_root *extent_root; 385 struct btrfs_path *path; 386 struct extent_buffer *leaf; 387 struct btrfs_key key; 388 u64 total_found = 0; 389 u64 last = 0; 390 u32 nritems; 391 int ret = -ENOMEM; 392 393 caching_ctl = container_of(work, struct btrfs_caching_control, work); 394 block_group = caching_ctl->block_group; 395 fs_info = block_group->fs_info; 396 extent_root = fs_info->extent_root; 397 398 path = btrfs_alloc_path(); 399 if (!path) 400 goto out; 401 402 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 403 404 /* 405 * We don't want to deadlock with somebody trying to allocate a new 406 * extent for the extent root while also trying to search the extent 407 * root to add free space. 
So we skip locking and search the commit 408 * root, since its read-only 409 */ 410 path->skip_locking = 1; 411 path->search_commit_root = 1; 412 path->reada = 1; 413 414 key.objectid = last; 415 key.offset = 0; 416 key.type = BTRFS_EXTENT_ITEM_KEY; 417 again: 418 mutex_lock(&caching_ctl->mutex); 419 /* need to make sure the commit_root doesn't disappear */ 420 down_read(&fs_info->commit_root_sem); 421 422 next: 423 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 424 if (ret < 0) 425 goto err; 426 427 leaf = path->nodes[0]; 428 nritems = btrfs_header_nritems(leaf); 429 430 while (1) { 431 if (btrfs_fs_closing(fs_info) > 1) { 432 last = (u64)-1; 433 break; 434 } 435 436 if (path->slots[0] < nritems) { 437 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 438 } else { 439 ret = find_next_key(path, 0, &key); 440 if (ret) 441 break; 442 443 if (need_resched() || 444 rwsem_is_contended(&fs_info->commit_root_sem)) { 445 caching_ctl->progress = last; 446 btrfs_release_path(path); 447 up_read(&fs_info->commit_root_sem); 448 mutex_unlock(&caching_ctl->mutex); 449 cond_resched(); 450 goto again; 451 } 452 453 ret = btrfs_next_leaf(extent_root, path); 454 if (ret < 0) 455 goto err; 456 if (ret) 457 break; 458 leaf = path->nodes[0]; 459 nritems = btrfs_header_nritems(leaf); 460 continue; 461 } 462 463 if (key.objectid < last) { 464 key.objectid = last; 465 key.offset = 0; 466 key.type = BTRFS_EXTENT_ITEM_KEY; 467 468 caching_ctl->progress = last; 469 btrfs_release_path(path); 470 goto next; 471 } 472 473 if (key.objectid < block_group->key.objectid) { 474 path->slots[0]++; 475 continue; 476 } 477 478 if (key.objectid >= block_group->key.objectid + 479 block_group->key.offset) 480 break; 481 482 if (key.type == BTRFS_EXTENT_ITEM_KEY || 483 key.type == BTRFS_METADATA_ITEM_KEY) { 484 total_found += add_new_free_space(block_group, 485 fs_info, last, 486 key.objectid); 487 if (key.type == BTRFS_METADATA_ITEM_KEY) 488 last = key.objectid + 489 fs_info->tree_root->nodesize; 490 else 491 last = key.objectid + key.offset; 492 493 if (total_found > (1024 * 1024 * 2)) { 494 total_found = 0; 495 wake_up(&caching_ctl->wait); 496 } 497 } 498 path->slots[0]++; 499 } 500 ret = 0; 501 502 total_found += add_new_free_space(block_group, fs_info, last, 503 block_group->key.objectid + 504 block_group->key.offset); 505 caching_ctl->progress = (u64)-1; 506 507 spin_lock(&block_group->lock); 508 block_group->caching_ctl = NULL; 509 block_group->cached = BTRFS_CACHE_FINISHED; 510 spin_unlock(&block_group->lock); 511 512 err: 513 btrfs_free_path(path); 514 up_read(&fs_info->commit_root_sem); 515 516 free_excluded_extents(extent_root, block_group); 517 518 mutex_unlock(&caching_ctl->mutex); 519 out: 520 if (ret) { 521 spin_lock(&block_group->lock); 522 block_group->caching_ctl = NULL; 523 block_group->cached = BTRFS_CACHE_ERROR; 524 spin_unlock(&block_group->lock); 525 } 526 wake_up(&caching_ctl->wait); 527 528 put_caching_control(caching_ctl); 529 btrfs_put_block_group(block_group); 530 } 531 532 static int cache_block_group(struct btrfs_block_group_cache *cache, 533 int load_cache_only) 534 { 535 DEFINE_WAIT(wait); 536 struct btrfs_fs_info *fs_info = cache->fs_info; 537 struct btrfs_caching_control *caching_ctl; 538 int ret = 0; 539 540 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 541 if (!caching_ctl) 542 return -ENOMEM; 543 544 INIT_LIST_HEAD(&caching_ctl->list); 545 mutex_init(&caching_ctl->mutex); 546 init_waitqueue_head(&caching_ctl->wait); 547 caching_ctl->block_group = cache; 548 
caching_ctl->progress = cache->key.objectid; 549 atomic_set(&caching_ctl->count, 1); 550 btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, 551 caching_thread, NULL, NULL); 552 553 spin_lock(&cache->lock); 554 /* 555 * This should be a rare occasion, but this could happen I think in the 556 * case where one thread starts to load the space cache info, and then 557 * some other thread starts a transaction commit which tries to do an 558 * allocation while the other thread is still loading the space cache 559 * info. The previous loop should have kept us from choosing this block 560 * group, but if we've moved to the state where we will wait on caching 561 * block groups we need to first check if we're doing a fast load here, 562 * so we can wait for it to finish, otherwise we could end up allocating 563 * from a block group who's cache gets evicted for one reason or 564 * another. 565 */ 566 while (cache->cached == BTRFS_CACHE_FAST) { 567 struct btrfs_caching_control *ctl; 568 569 ctl = cache->caching_ctl; 570 atomic_inc(&ctl->count); 571 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 572 spin_unlock(&cache->lock); 573 574 schedule(); 575 576 finish_wait(&ctl->wait, &wait); 577 put_caching_control(ctl); 578 spin_lock(&cache->lock); 579 } 580 581 if (cache->cached != BTRFS_CACHE_NO) { 582 spin_unlock(&cache->lock); 583 kfree(caching_ctl); 584 return 0; 585 } 586 WARN_ON(cache->caching_ctl); 587 cache->caching_ctl = caching_ctl; 588 cache->cached = BTRFS_CACHE_FAST; 589 spin_unlock(&cache->lock); 590 591 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 592 mutex_lock(&caching_ctl->mutex); 593 ret = load_free_space_cache(fs_info, cache); 594 595 spin_lock(&cache->lock); 596 if (ret == 1) { 597 cache->caching_ctl = NULL; 598 cache->cached = BTRFS_CACHE_FINISHED; 599 cache->last_byte_to_unpin = (u64)-1; 600 caching_ctl->progress = (u64)-1; 601 } else { 602 if (load_cache_only) { 603 cache->caching_ctl = NULL; 604 cache->cached = BTRFS_CACHE_NO; 605 } else { 606 cache->cached = BTRFS_CACHE_STARTED; 607 cache->has_caching_ctl = 1; 608 } 609 } 610 spin_unlock(&cache->lock); 611 mutex_unlock(&caching_ctl->mutex); 612 613 wake_up(&caching_ctl->wait); 614 if (ret == 1) { 615 put_caching_control(caching_ctl); 616 free_excluded_extents(fs_info->extent_root, cache); 617 return 0; 618 } 619 } else { 620 /* 621 * We are not going to do the fast caching, set cached to the 622 * appropriate value and wakeup any waiters. 
623 */ 624 spin_lock(&cache->lock); 625 if (load_cache_only) { 626 cache->caching_ctl = NULL; 627 cache->cached = BTRFS_CACHE_NO; 628 } else { 629 cache->cached = BTRFS_CACHE_STARTED; 630 cache->has_caching_ctl = 1; 631 } 632 spin_unlock(&cache->lock); 633 wake_up(&caching_ctl->wait); 634 } 635 636 if (load_cache_only) { 637 put_caching_control(caching_ctl); 638 return 0; 639 } 640 641 down_write(&fs_info->commit_root_sem); 642 atomic_inc(&caching_ctl->count); 643 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 644 up_write(&fs_info->commit_root_sem); 645 646 btrfs_get_block_group(cache); 647 648 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 649 650 return ret; 651 } 652 653 /* 654 * return the block group that starts at or after bytenr 655 */ 656 static struct btrfs_block_group_cache * 657 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 658 { 659 struct btrfs_block_group_cache *cache; 660 661 cache = block_group_cache_tree_search(info, bytenr, 0); 662 663 return cache; 664 } 665 666 /* 667 * return the block group that contains the given bytenr 668 */ 669 struct btrfs_block_group_cache *btrfs_lookup_block_group( 670 struct btrfs_fs_info *info, 671 u64 bytenr) 672 { 673 struct btrfs_block_group_cache *cache; 674 675 cache = block_group_cache_tree_search(info, bytenr, 1); 676 677 return cache; 678 } 679 680 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 681 u64 flags) 682 { 683 struct list_head *head = &info->space_info; 684 struct btrfs_space_info *found; 685 686 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 687 688 rcu_read_lock(); 689 list_for_each_entry_rcu(found, head, list) { 690 if (found->flags & flags) { 691 rcu_read_unlock(); 692 return found; 693 } 694 } 695 rcu_read_unlock(); 696 return NULL; 697 } 698 699 /* 700 * after adding space to the filesystem, we need to clear the full flags 701 * on all the space infos. 702 */ 703 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 704 { 705 struct list_head *head = &info->space_info; 706 struct btrfs_space_info *found; 707 708 rcu_read_lock(); 709 list_for_each_entry_rcu(found, head, list) 710 found->full = 0; 711 rcu_read_unlock(); 712 } 713 714 /* simple helper to search for an existing data extent at a given offset */ 715 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) 716 { 717 int ret; 718 struct btrfs_key key; 719 struct btrfs_path *path; 720 721 path = btrfs_alloc_path(); 722 if (!path) 723 return -ENOMEM; 724 725 key.objectid = start; 726 key.offset = len; 727 key.type = BTRFS_EXTENT_ITEM_KEY; 728 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 729 0, 0); 730 btrfs_free_path(path); 731 return ret; 732 } 733 734 /* 735 * helper function to lookup reference count and flags of a tree block. 736 * 737 * the head node for delayed ref is used to store the sum of all the 738 * reference count modifications queued up in the rbtree. the head 739 * node may also store the extent flags to set. This way you can check 740 * to see what the reference count and extent flags would be if all of 741 * the delayed refs are not processed. 
742 */ 743 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 744 struct btrfs_root *root, u64 bytenr, 745 u64 offset, int metadata, u64 *refs, u64 *flags) 746 { 747 struct btrfs_delayed_ref_head *head; 748 struct btrfs_delayed_ref_root *delayed_refs; 749 struct btrfs_path *path; 750 struct btrfs_extent_item *ei; 751 struct extent_buffer *leaf; 752 struct btrfs_key key; 753 u32 item_size; 754 u64 num_refs; 755 u64 extent_flags; 756 int ret; 757 758 /* 759 * If we don't have skinny metadata, don't bother doing anything 760 * different 761 */ 762 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { 763 offset = root->nodesize; 764 metadata = 0; 765 } 766 767 path = btrfs_alloc_path(); 768 if (!path) 769 return -ENOMEM; 770 771 if (!trans) { 772 path->skip_locking = 1; 773 path->search_commit_root = 1; 774 } 775 776 search_again: 777 key.objectid = bytenr; 778 key.offset = offset; 779 if (metadata) 780 key.type = BTRFS_METADATA_ITEM_KEY; 781 else 782 key.type = BTRFS_EXTENT_ITEM_KEY; 783 784 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 785 &key, path, 0, 0); 786 if (ret < 0) 787 goto out_free; 788 789 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 790 if (path->slots[0]) { 791 path->slots[0]--; 792 btrfs_item_key_to_cpu(path->nodes[0], &key, 793 path->slots[0]); 794 if (key.objectid == bytenr && 795 key.type == BTRFS_EXTENT_ITEM_KEY && 796 key.offset == root->nodesize) 797 ret = 0; 798 } 799 } 800 801 if (ret == 0) { 802 leaf = path->nodes[0]; 803 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 804 if (item_size >= sizeof(*ei)) { 805 ei = btrfs_item_ptr(leaf, path->slots[0], 806 struct btrfs_extent_item); 807 num_refs = btrfs_extent_refs(leaf, ei); 808 extent_flags = btrfs_extent_flags(leaf, ei); 809 } else { 810 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 811 struct btrfs_extent_item_v0 *ei0; 812 BUG_ON(item_size != sizeof(*ei0)); 813 ei0 = btrfs_item_ptr(leaf, path->slots[0], 814 struct btrfs_extent_item_v0); 815 num_refs = btrfs_extent_refs_v0(leaf, ei0); 816 /* FIXME: this isn't correct for data */ 817 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 818 #else 819 BUG(); 820 #endif 821 } 822 BUG_ON(num_refs == 0); 823 } else { 824 num_refs = 0; 825 extent_flags = 0; 826 ret = 0; 827 } 828 829 if (!trans) 830 goto out; 831 832 delayed_refs = &trans->transaction->delayed_refs; 833 spin_lock(&delayed_refs->lock); 834 head = btrfs_find_delayed_ref_head(trans, bytenr); 835 if (head) { 836 if (!mutex_trylock(&head->mutex)) { 837 atomic_inc(&head->node.refs); 838 spin_unlock(&delayed_refs->lock); 839 840 btrfs_release_path(path); 841 842 /* 843 * Mutex was contended, block until it's released and try 844 * again 845 */ 846 mutex_lock(&head->mutex); 847 mutex_unlock(&head->mutex); 848 btrfs_put_delayed_ref(&head->node); 849 goto search_again; 850 } 851 spin_lock(&head->lock); 852 if (head->extent_op && head->extent_op->update_flags) 853 extent_flags |= head->extent_op->flags_to_set; 854 else 855 BUG_ON(num_refs == 0); 856 857 num_refs += head->node.ref_mod; 858 spin_unlock(&head->lock); 859 mutex_unlock(&head->mutex); 860 } 861 spin_unlock(&delayed_refs->lock); 862 out: 863 WARN_ON(num_refs == 0); 864 if (refs) 865 *refs = num_refs; 866 if (flags) 867 *flags = extent_flags; 868 out_free: 869 btrfs_free_path(path); 870 return ret; 871 } 872 873 /* 874 * Back reference rules. 
Back refs have three main goals: 875 * 876 * 1) differentiate between all holders of references to an extent so that 877 * when a reference is dropped we can make sure it was a valid reference 878 * before freeing the extent. 879 * 880 * 2) Provide enough information to quickly find the holders of an extent 881 * if we notice a given block is corrupted or bad. 882 * 883 * 3) Make it easy to migrate blocks for FS shrinking or storage pool 884 * maintenance. This is actually the same as #2, but with a slightly 885 * different use case. 886 * 887 * There are two kinds of back refs. The implicit back refs is optimized 888 * for pointers in non-shared tree blocks. For a given pointer in a block, 889 * back refs of this kind provide information about the block's owner tree 890 * and the pointer's key. These information allow us to find the block by 891 * b-tree searching. The full back refs is for pointers in tree blocks not 892 * referenced by their owner trees. The location of tree block is recorded 893 * in the back refs. Actually the full back refs is generic, and can be 894 * used in all cases the implicit back refs is used. The major shortcoming 895 * of the full back refs is its overhead. Every time a tree block gets 896 * COWed, we have to update back refs entry for all pointers in it. 897 * 898 * For a newly allocated tree block, we use implicit back refs for 899 * pointers in it. This means most tree related operations only involve 900 * implicit back refs. For a tree block created in old transaction, the 901 * only way to drop a reference to it is COW it. So we can detect the 902 * event that tree block loses its owner tree's reference and do the 903 * back refs conversion. 904 * 905 * When a tree block is COW'd through a tree, there are four cases: 906 * 907 * The reference count of the block is one and the tree is the block's 908 * owner tree. Nothing to do in this case. 909 * 910 * The reference count of the block is one and the tree is not the 911 * block's owner tree. In this case, full back refs is used for pointers 912 * in the block. Remove these full back refs, add implicit back refs for 913 * every pointers in the new block. 914 * 915 * The reference count of the block is greater than one and the tree is 916 * the block's owner tree. In this case, implicit back refs is used for 917 * pointers in the block. Add full back refs for every pointers in the 918 * block, increase lower level extents' reference counts. The original 919 * implicit back refs are entailed to the new block. 920 * 921 * The reference count of the block is greater than one and the tree is 922 * not the block's owner tree. Add implicit back refs for every pointer in 923 * the new block, increase lower level extents' reference count. 924 * 925 * Back Reference Key composing: 926 * 927 * The key objectid corresponds to the first byte in the extent, 928 * The key type is used to differentiate between types of back refs. 929 * There are different meanings of the key offset for different types 930 * of back refs. 
931 * 932 * File extents can be referenced by: 933 * 934 * - multiple snapshots, subvolumes, or different generations in one subvol 935 * - different files inside a single subvolume 936 * - different offsets inside a file (bookend extents in file.c) 937 * 938 * The extent ref structure for the implicit back refs has fields for: 939 * 940 * - Objectid of the subvolume root 941 * - objectid of the file holding the reference 942 * - original offset in the file 943 * - how many bookend extents 944 * 945 * The key offset for the implicit back refs is hash of the first 946 * three fields. 947 * 948 * The extent ref structure for the full back refs has field for: 949 * 950 * - number of pointers in the tree leaf 951 * 952 * The key offset for the implicit back refs is the first byte of 953 * the tree leaf 954 * 955 * When a file extent is allocated, The implicit back refs is used. 956 * the fields are filled in: 957 * 958 * (root_key.objectid, inode objectid, offset in file, 1) 959 * 960 * When a file extent is removed file truncation, we find the 961 * corresponding implicit back refs and check the following fields: 962 * 963 * (btrfs_header_owner(leaf), inode objectid, offset in file) 964 * 965 * Btree extents can be referenced by: 966 * 967 * - Different subvolumes 968 * 969 * Both the implicit back refs and the full back refs for tree blocks 970 * only consist of key. The key offset for the implicit back refs is 971 * objectid of block's owner tree. The key offset for the full back refs 972 * is the first byte of parent block. 973 * 974 * When implicit back refs is used, information about the lowest key and 975 * level of the tree block are required. These information are stored in 976 * tree block info structure. 977 */ 978 979 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 980 static int convert_extent_item_v0(struct btrfs_trans_handle *trans, 981 struct btrfs_root *root, 982 struct btrfs_path *path, 983 u64 owner, u32 extra_size) 984 { 985 struct btrfs_extent_item *item; 986 struct btrfs_extent_item_v0 *ei0; 987 struct btrfs_extent_ref_v0 *ref0; 988 struct btrfs_tree_block_info *bi; 989 struct extent_buffer *leaf; 990 struct btrfs_key key; 991 struct btrfs_key found_key; 992 u32 new_size = sizeof(*item); 993 u64 refs; 994 int ret; 995 996 leaf = path->nodes[0]; 997 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); 998 999 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1000 ei0 = btrfs_item_ptr(leaf, path->slots[0], 1001 struct btrfs_extent_item_v0); 1002 refs = btrfs_extent_refs_v0(leaf, ei0); 1003 1004 if (owner == (u64)-1) { 1005 while (1) { 1006 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1007 ret = btrfs_next_leaf(root, path); 1008 if (ret < 0) 1009 return ret; 1010 BUG_ON(ret > 0); /* Corruption */ 1011 leaf = path->nodes[0]; 1012 } 1013 btrfs_item_key_to_cpu(leaf, &found_key, 1014 path->slots[0]); 1015 BUG_ON(key.objectid != found_key.objectid); 1016 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { 1017 path->slots[0]++; 1018 continue; 1019 } 1020 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1021 struct btrfs_extent_ref_v0); 1022 owner = btrfs_ref_objectid_v0(leaf, ref0); 1023 break; 1024 } 1025 } 1026 btrfs_release_path(path); 1027 1028 if (owner < BTRFS_FIRST_FREE_OBJECTID) 1029 new_size += sizeof(*bi); 1030 1031 new_size -= sizeof(*ei0); 1032 ret = btrfs_search_slot(trans, root, &key, path, 1033 new_size + extra_size, 1); 1034 if (ret < 0) 1035 return ret; 1036 BUG_ON(ret); /* Corruption */ 1037 1038 btrfs_extend_item(root, path, new_size); 1039 1040 leaf = 
path->nodes[0]; 1041 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1042 btrfs_set_extent_refs(leaf, item, refs); 1043 /* FIXME: get real generation */ 1044 btrfs_set_extent_generation(leaf, item, 0); 1045 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1046 btrfs_set_extent_flags(leaf, item, 1047 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1048 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1049 bi = (struct btrfs_tree_block_info *)(item + 1); 1050 /* FIXME: get first key of the block */ 1051 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 1052 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1053 } else { 1054 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1055 } 1056 btrfs_mark_buffer_dirty(leaf); 1057 return 0; 1058 } 1059 #endif 1060 1061 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1062 { 1063 u32 high_crc = ~(u32)0; 1064 u32 low_crc = ~(u32)0; 1065 __le64 lenum; 1066 1067 lenum = cpu_to_le64(root_objectid); 1068 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1069 lenum = cpu_to_le64(owner); 1070 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1071 lenum = cpu_to_le64(offset); 1072 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1073 1074 return ((u64)high_crc << 31) ^ (u64)low_crc; 1075 } 1076 1077 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1078 struct btrfs_extent_data_ref *ref) 1079 { 1080 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1081 btrfs_extent_data_ref_objectid(leaf, ref), 1082 btrfs_extent_data_ref_offset(leaf, ref)); 1083 } 1084 1085 static int match_extent_data_ref(struct extent_buffer *leaf, 1086 struct btrfs_extent_data_ref *ref, 1087 u64 root_objectid, u64 owner, u64 offset) 1088 { 1089 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1090 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1091 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1092 return 0; 1093 return 1; 1094 } 1095 1096 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1097 struct btrfs_root *root, 1098 struct btrfs_path *path, 1099 u64 bytenr, u64 parent, 1100 u64 root_objectid, 1101 u64 owner, u64 offset) 1102 { 1103 struct btrfs_key key; 1104 struct btrfs_extent_data_ref *ref; 1105 struct extent_buffer *leaf; 1106 u32 nritems; 1107 int ret; 1108 int recow; 1109 int err = -ENOENT; 1110 1111 key.objectid = bytenr; 1112 if (parent) { 1113 key.type = BTRFS_SHARED_DATA_REF_KEY; 1114 key.offset = parent; 1115 } else { 1116 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1117 key.offset = hash_extent_data_ref(root_objectid, 1118 owner, offset); 1119 } 1120 again: 1121 recow = 0; 1122 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1123 if (ret < 0) { 1124 err = ret; 1125 goto fail; 1126 } 1127 1128 if (parent) { 1129 if (!ret) 1130 return 0; 1131 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1132 key.type = BTRFS_EXTENT_REF_V0_KEY; 1133 btrfs_release_path(path); 1134 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1135 if (ret < 0) { 1136 err = ret; 1137 goto fail; 1138 } 1139 if (!ret) 1140 return 0; 1141 #endif 1142 goto fail; 1143 } 1144 1145 leaf = path->nodes[0]; 1146 nritems = btrfs_header_nritems(leaf); 1147 while (1) { 1148 if (path->slots[0] >= nritems) { 1149 ret = btrfs_next_leaf(root, path); 1150 if (ret < 0) 1151 err = ret; 1152 if (ret) 1153 goto fail; 1154 1155 leaf = path->nodes[0]; 1156 nritems = btrfs_header_nritems(leaf); 1157 recow = 1; 1158 } 1159 1160 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1161 if (key.objectid 
!= bytenr || 1162 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1163 goto fail; 1164 1165 ref = btrfs_item_ptr(leaf, path->slots[0], 1166 struct btrfs_extent_data_ref); 1167 1168 if (match_extent_data_ref(leaf, ref, root_objectid, 1169 owner, offset)) { 1170 if (recow) { 1171 btrfs_release_path(path); 1172 goto again; 1173 } 1174 err = 0; 1175 break; 1176 } 1177 path->slots[0]++; 1178 } 1179 fail: 1180 return err; 1181 } 1182 1183 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1184 struct btrfs_root *root, 1185 struct btrfs_path *path, 1186 u64 bytenr, u64 parent, 1187 u64 root_objectid, u64 owner, 1188 u64 offset, int refs_to_add) 1189 { 1190 struct btrfs_key key; 1191 struct extent_buffer *leaf; 1192 u32 size; 1193 u32 num_refs; 1194 int ret; 1195 1196 key.objectid = bytenr; 1197 if (parent) { 1198 key.type = BTRFS_SHARED_DATA_REF_KEY; 1199 key.offset = parent; 1200 size = sizeof(struct btrfs_shared_data_ref); 1201 } else { 1202 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1203 key.offset = hash_extent_data_ref(root_objectid, 1204 owner, offset); 1205 size = sizeof(struct btrfs_extent_data_ref); 1206 } 1207 1208 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1209 if (ret && ret != -EEXIST) 1210 goto fail; 1211 1212 leaf = path->nodes[0]; 1213 if (parent) { 1214 struct btrfs_shared_data_ref *ref; 1215 ref = btrfs_item_ptr(leaf, path->slots[0], 1216 struct btrfs_shared_data_ref); 1217 if (ret == 0) { 1218 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1219 } else { 1220 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1221 num_refs += refs_to_add; 1222 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1223 } 1224 } else { 1225 struct btrfs_extent_data_ref *ref; 1226 while (ret == -EEXIST) { 1227 ref = btrfs_item_ptr(leaf, path->slots[0], 1228 struct btrfs_extent_data_ref); 1229 if (match_extent_data_ref(leaf, ref, root_objectid, 1230 owner, offset)) 1231 break; 1232 btrfs_release_path(path); 1233 key.offset++; 1234 ret = btrfs_insert_empty_item(trans, root, path, &key, 1235 size); 1236 if (ret && ret != -EEXIST) 1237 goto fail; 1238 1239 leaf = path->nodes[0]; 1240 } 1241 ref = btrfs_item_ptr(leaf, path->slots[0], 1242 struct btrfs_extent_data_ref); 1243 if (ret == 0) { 1244 btrfs_set_extent_data_ref_root(leaf, ref, 1245 root_objectid); 1246 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1247 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1248 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1249 } else { 1250 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1251 num_refs += refs_to_add; 1252 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1253 } 1254 } 1255 btrfs_mark_buffer_dirty(leaf); 1256 ret = 0; 1257 fail: 1258 btrfs_release_path(path); 1259 return ret; 1260 } 1261 1262 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1263 struct btrfs_root *root, 1264 struct btrfs_path *path, 1265 int refs_to_drop, int *last_ref) 1266 { 1267 struct btrfs_key key; 1268 struct btrfs_extent_data_ref *ref1 = NULL; 1269 struct btrfs_shared_data_ref *ref2 = NULL; 1270 struct extent_buffer *leaf; 1271 u32 num_refs = 0; 1272 int ret = 0; 1273 1274 leaf = path->nodes[0]; 1275 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1276 1277 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1278 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1279 struct btrfs_extent_data_ref); 1280 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1281 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1282 ref2 = 
btrfs_item_ptr(leaf, path->slots[0], 1283 struct btrfs_shared_data_ref); 1284 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1285 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1286 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1287 struct btrfs_extent_ref_v0 *ref0; 1288 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1289 struct btrfs_extent_ref_v0); 1290 num_refs = btrfs_ref_count_v0(leaf, ref0); 1291 #endif 1292 } else { 1293 BUG(); 1294 } 1295 1296 BUG_ON(num_refs < refs_to_drop); 1297 num_refs -= refs_to_drop; 1298 1299 if (num_refs == 0) { 1300 ret = btrfs_del_item(trans, root, path); 1301 *last_ref = 1; 1302 } else { 1303 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1304 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1305 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1306 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1307 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1308 else { 1309 struct btrfs_extent_ref_v0 *ref0; 1310 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1311 struct btrfs_extent_ref_v0); 1312 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1313 } 1314 #endif 1315 btrfs_mark_buffer_dirty(leaf); 1316 } 1317 return ret; 1318 } 1319 1320 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1321 struct btrfs_path *path, 1322 struct btrfs_extent_inline_ref *iref) 1323 { 1324 struct btrfs_key key; 1325 struct extent_buffer *leaf; 1326 struct btrfs_extent_data_ref *ref1; 1327 struct btrfs_shared_data_ref *ref2; 1328 u32 num_refs = 0; 1329 1330 leaf = path->nodes[0]; 1331 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1332 if (iref) { 1333 if (btrfs_extent_inline_ref_type(leaf, iref) == 1334 BTRFS_EXTENT_DATA_REF_KEY) { 1335 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1336 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1337 } else { 1338 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1339 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1340 } 1341 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1342 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1343 struct btrfs_extent_data_ref); 1344 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1345 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1346 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1347 struct btrfs_shared_data_ref); 1348 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1349 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1350 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1351 struct btrfs_extent_ref_v0 *ref0; 1352 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1353 struct btrfs_extent_ref_v0); 1354 num_refs = btrfs_ref_count_v0(leaf, ref0); 1355 #endif 1356 } else { 1357 WARN_ON(1); 1358 } 1359 return num_refs; 1360 } 1361 1362 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1363 struct btrfs_root *root, 1364 struct btrfs_path *path, 1365 u64 bytenr, u64 parent, 1366 u64 root_objectid) 1367 { 1368 struct btrfs_key key; 1369 int ret; 1370 1371 key.objectid = bytenr; 1372 if (parent) { 1373 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1374 key.offset = parent; 1375 } else { 1376 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1377 key.offset = root_objectid; 1378 } 1379 1380 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1381 if (ret > 0) 1382 ret = -ENOENT; 1383 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1384 if (ret == -ENOENT && parent) { 1385 btrfs_release_path(path); 1386 key.type = BTRFS_EXTENT_REF_V0_KEY; 1387 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1388 if (ret > 0) 1389 ret = -ENOENT; 1390 } 1391 #endif 1392 return ret; 1393 } 1394 1395 static noinline 
int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1396 struct btrfs_root *root, 1397 struct btrfs_path *path, 1398 u64 bytenr, u64 parent, 1399 u64 root_objectid) 1400 { 1401 struct btrfs_key key; 1402 int ret; 1403 1404 key.objectid = bytenr; 1405 if (parent) { 1406 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1407 key.offset = parent; 1408 } else { 1409 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1410 key.offset = root_objectid; 1411 } 1412 1413 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1414 btrfs_release_path(path); 1415 return ret; 1416 } 1417 1418 static inline int extent_ref_type(u64 parent, u64 owner) 1419 { 1420 int type; 1421 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1422 if (parent > 0) 1423 type = BTRFS_SHARED_BLOCK_REF_KEY; 1424 else 1425 type = BTRFS_TREE_BLOCK_REF_KEY; 1426 } else { 1427 if (parent > 0) 1428 type = BTRFS_SHARED_DATA_REF_KEY; 1429 else 1430 type = BTRFS_EXTENT_DATA_REF_KEY; 1431 } 1432 return type; 1433 } 1434 1435 static int find_next_key(struct btrfs_path *path, int level, 1436 struct btrfs_key *key) 1437 1438 { 1439 for (; level < BTRFS_MAX_LEVEL; level++) { 1440 if (!path->nodes[level]) 1441 break; 1442 if (path->slots[level] + 1 >= 1443 btrfs_header_nritems(path->nodes[level])) 1444 continue; 1445 if (level == 0) 1446 btrfs_item_key_to_cpu(path->nodes[level], key, 1447 path->slots[level] + 1); 1448 else 1449 btrfs_node_key_to_cpu(path->nodes[level], key, 1450 path->slots[level] + 1); 1451 return 0; 1452 } 1453 return 1; 1454 } 1455 1456 /* 1457 * look for inline back ref. if back ref is found, *ref_ret is set 1458 * to the address of inline back ref, and 0 is returned. 1459 * 1460 * if back ref isn't found, *ref_ret is set to the address where it 1461 * should be inserted, and -ENOENT is returned. 1462 * 1463 * if insert is true and there are too many inline back refs, the path 1464 * points to the extent item, and -EAGAIN is returned. 1465 * 1466 * NOTE: inline back refs are ordered in the same way that back ref 1467 * items in the tree are ordered. 1468 */ 1469 static noinline_for_stack 1470 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1471 struct btrfs_root *root, 1472 struct btrfs_path *path, 1473 struct btrfs_extent_inline_ref **ref_ret, 1474 u64 bytenr, u64 num_bytes, 1475 u64 parent, u64 root_objectid, 1476 u64 owner, u64 offset, int insert) 1477 { 1478 struct btrfs_key key; 1479 struct extent_buffer *leaf; 1480 struct btrfs_extent_item *ei; 1481 struct btrfs_extent_inline_ref *iref; 1482 u64 flags; 1483 u64 item_size; 1484 unsigned long ptr; 1485 unsigned long end; 1486 int extra_size; 1487 int type; 1488 int want; 1489 int ret; 1490 int err = 0; 1491 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 1492 SKINNY_METADATA); 1493 1494 key.objectid = bytenr; 1495 key.type = BTRFS_EXTENT_ITEM_KEY; 1496 key.offset = num_bytes; 1497 1498 want = extent_ref_type(parent, owner); 1499 if (insert) { 1500 extra_size = btrfs_extent_inline_ref_size(want); 1501 path->keep_locks = 1; 1502 } else 1503 extra_size = -1; 1504 1505 /* 1506 * Owner is our parent level, so we can just add one to get the level 1507 * for the block we are interested in. 
1508 */ 1509 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1510 key.type = BTRFS_METADATA_ITEM_KEY; 1511 key.offset = owner; 1512 } 1513 1514 again: 1515 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1516 if (ret < 0) { 1517 err = ret; 1518 goto out; 1519 } 1520 1521 /* 1522 * We may be a newly converted file system which still has the old fat 1523 * extent entries for metadata, so try and see if we have one of those. 1524 */ 1525 if (ret > 0 && skinny_metadata) { 1526 skinny_metadata = false; 1527 if (path->slots[0]) { 1528 path->slots[0]--; 1529 btrfs_item_key_to_cpu(path->nodes[0], &key, 1530 path->slots[0]); 1531 if (key.objectid == bytenr && 1532 key.type == BTRFS_EXTENT_ITEM_KEY && 1533 key.offset == num_bytes) 1534 ret = 0; 1535 } 1536 if (ret) { 1537 key.objectid = bytenr; 1538 key.type = BTRFS_EXTENT_ITEM_KEY; 1539 key.offset = num_bytes; 1540 btrfs_release_path(path); 1541 goto again; 1542 } 1543 } 1544 1545 if (ret && !insert) { 1546 err = -ENOENT; 1547 goto out; 1548 } else if (WARN_ON(ret)) { 1549 err = -EIO; 1550 goto out; 1551 } 1552 1553 leaf = path->nodes[0]; 1554 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1555 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1556 if (item_size < sizeof(*ei)) { 1557 if (!insert) { 1558 err = -ENOENT; 1559 goto out; 1560 } 1561 ret = convert_extent_item_v0(trans, root, path, owner, 1562 extra_size); 1563 if (ret < 0) { 1564 err = ret; 1565 goto out; 1566 } 1567 leaf = path->nodes[0]; 1568 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1569 } 1570 #endif 1571 BUG_ON(item_size < sizeof(*ei)); 1572 1573 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1574 flags = btrfs_extent_flags(leaf, ei); 1575 1576 ptr = (unsigned long)(ei + 1); 1577 end = (unsigned long)ei + item_size; 1578 1579 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1580 ptr += sizeof(struct btrfs_tree_block_info); 1581 BUG_ON(ptr > end); 1582 } 1583 1584 err = -ENOENT; 1585 while (1) { 1586 if (ptr >= end) { 1587 WARN_ON(ptr > end); 1588 break; 1589 } 1590 iref = (struct btrfs_extent_inline_ref *)ptr; 1591 type = btrfs_extent_inline_ref_type(leaf, iref); 1592 if (want < type) 1593 break; 1594 if (want > type) { 1595 ptr += btrfs_extent_inline_ref_size(type); 1596 continue; 1597 } 1598 1599 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1600 struct btrfs_extent_data_ref *dref; 1601 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1602 if (match_extent_data_ref(leaf, dref, root_objectid, 1603 owner, offset)) { 1604 err = 0; 1605 break; 1606 } 1607 if (hash_extent_data_ref_item(leaf, dref) < 1608 hash_extent_data_ref(root_objectid, owner, offset)) 1609 break; 1610 } else { 1611 u64 ref_offset; 1612 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1613 if (parent > 0) { 1614 if (parent == ref_offset) { 1615 err = 0; 1616 break; 1617 } 1618 if (ref_offset < parent) 1619 break; 1620 } else { 1621 if (root_objectid == ref_offset) { 1622 err = 0; 1623 break; 1624 } 1625 if (ref_offset < root_objectid) 1626 break; 1627 } 1628 } 1629 ptr += btrfs_extent_inline_ref_size(type); 1630 } 1631 if (err == -ENOENT && insert) { 1632 if (item_size + extra_size >= 1633 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1634 err = -EAGAIN; 1635 goto out; 1636 } 1637 /* 1638 * To add new inline back ref, we have to make sure 1639 * there is no corresponding back ref item. 
1640 * For simplicity, we just do not add new inline back 1641 * ref if there is any kind of item for this block 1642 */ 1643 if (find_next_key(path, 0, &key) == 0 && 1644 key.objectid == bytenr && 1645 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1646 err = -EAGAIN; 1647 goto out; 1648 } 1649 } 1650 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1651 out: 1652 if (insert) { 1653 path->keep_locks = 0; 1654 btrfs_unlock_up_safe(path, 1); 1655 } 1656 return err; 1657 } 1658 1659 /* 1660 * helper to add new inline back ref 1661 */ 1662 static noinline_for_stack 1663 void setup_inline_extent_backref(struct btrfs_root *root, 1664 struct btrfs_path *path, 1665 struct btrfs_extent_inline_ref *iref, 1666 u64 parent, u64 root_objectid, 1667 u64 owner, u64 offset, int refs_to_add, 1668 struct btrfs_delayed_extent_op *extent_op) 1669 { 1670 struct extent_buffer *leaf; 1671 struct btrfs_extent_item *ei; 1672 unsigned long ptr; 1673 unsigned long end; 1674 unsigned long item_offset; 1675 u64 refs; 1676 int size; 1677 int type; 1678 1679 leaf = path->nodes[0]; 1680 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1681 item_offset = (unsigned long)iref - (unsigned long)ei; 1682 1683 type = extent_ref_type(parent, owner); 1684 size = btrfs_extent_inline_ref_size(type); 1685 1686 btrfs_extend_item(root, path, size); 1687 1688 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1689 refs = btrfs_extent_refs(leaf, ei); 1690 refs += refs_to_add; 1691 btrfs_set_extent_refs(leaf, ei, refs); 1692 if (extent_op) 1693 __run_delayed_extent_op(extent_op, leaf, ei); 1694 1695 ptr = (unsigned long)ei + item_offset; 1696 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1697 if (ptr < end - size) 1698 memmove_extent_buffer(leaf, ptr + size, ptr, 1699 end - size - ptr); 1700 1701 iref = (struct btrfs_extent_inline_ref *)ptr; 1702 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1703 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1704 struct btrfs_extent_data_ref *dref; 1705 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1706 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1707 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1708 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1709 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1710 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1711 struct btrfs_shared_data_ref *sref; 1712 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1713 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1714 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1715 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1716 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1717 } else { 1718 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1719 } 1720 btrfs_mark_buffer_dirty(leaf); 1721 } 1722 1723 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1724 struct btrfs_root *root, 1725 struct btrfs_path *path, 1726 struct btrfs_extent_inline_ref **ref_ret, 1727 u64 bytenr, u64 num_bytes, u64 parent, 1728 u64 root_objectid, u64 owner, u64 offset) 1729 { 1730 int ret; 1731 1732 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1733 bytenr, num_bytes, parent, 1734 root_objectid, owner, offset, 0); 1735 if (ret != -ENOENT) 1736 return ret; 1737 1738 btrfs_release_path(path); 1739 *ref_ret = NULL; 1740 1741 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1742 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1743 root_objectid); 1744 } else { 1745 ret 
= lookup_extent_data_ref(trans, root, path, bytenr, parent, 1746 root_objectid, owner, offset); 1747 } 1748 return ret; 1749 } 1750 1751 /* 1752 * helper to update/remove inline back ref 1753 */ 1754 static noinline_for_stack 1755 void update_inline_extent_backref(struct btrfs_root *root, 1756 struct btrfs_path *path, 1757 struct btrfs_extent_inline_ref *iref, 1758 int refs_to_mod, 1759 struct btrfs_delayed_extent_op *extent_op, 1760 int *last_ref) 1761 { 1762 struct extent_buffer *leaf; 1763 struct btrfs_extent_item *ei; 1764 struct btrfs_extent_data_ref *dref = NULL; 1765 struct btrfs_shared_data_ref *sref = NULL; 1766 unsigned long ptr; 1767 unsigned long end; 1768 u32 item_size; 1769 int size; 1770 int type; 1771 u64 refs; 1772 1773 leaf = path->nodes[0]; 1774 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1775 refs = btrfs_extent_refs(leaf, ei); 1776 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1777 refs += refs_to_mod; 1778 btrfs_set_extent_refs(leaf, ei, refs); 1779 if (extent_op) 1780 __run_delayed_extent_op(extent_op, leaf, ei); 1781 1782 type = btrfs_extent_inline_ref_type(leaf, iref); 1783 1784 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1785 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1786 refs = btrfs_extent_data_ref_count(leaf, dref); 1787 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1788 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1789 refs = btrfs_shared_data_ref_count(leaf, sref); 1790 } else { 1791 refs = 1; 1792 BUG_ON(refs_to_mod != -1); 1793 } 1794 1795 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1796 refs += refs_to_mod; 1797 1798 if (refs > 0) { 1799 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1800 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1801 else 1802 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1803 } else { 1804 *last_ref = 1; 1805 size = btrfs_extent_inline_ref_size(type); 1806 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1807 ptr = (unsigned long)iref; 1808 end = (unsigned long)ei + item_size; 1809 if (ptr + size < end) 1810 memmove_extent_buffer(leaf, ptr, ptr + size, 1811 end - ptr - size); 1812 item_size -= size; 1813 btrfs_truncate_item(root, path, item_size, 1); 1814 } 1815 btrfs_mark_buffer_dirty(leaf); 1816 } 1817 1818 static noinline_for_stack 1819 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1820 struct btrfs_root *root, 1821 struct btrfs_path *path, 1822 u64 bytenr, u64 num_bytes, u64 parent, 1823 u64 root_objectid, u64 owner, 1824 u64 offset, int refs_to_add, 1825 struct btrfs_delayed_extent_op *extent_op) 1826 { 1827 struct btrfs_extent_inline_ref *iref; 1828 int ret; 1829 1830 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1831 bytenr, num_bytes, parent, 1832 root_objectid, owner, offset, 1); 1833 if (ret == 0) { 1834 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1835 update_inline_extent_backref(root, path, iref, 1836 refs_to_add, extent_op, NULL); 1837 } else if (ret == -ENOENT) { 1838 setup_inline_extent_backref(root, path, iref, parent, 1839 root_objectid, owner, offset, 1840 refs_to_add, extent_op); 1841 ret = 0; 1842 } 1843 return ret; 1844 } 1845 1846 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1847 struct btrfs_root *root, 1848 struct btrfs_path *path, 1849 u64 bytenr, u64 parent, u64 root_objectid, 1850 u64 owner, u64 offset, int refs_to_add) 1851 { 1852 int ret; 1853 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1854 BUG_ON(refs_to_add != 1); 1855 ret = insert_tree_block_ref(trans, root, path, bytenr, 1856 parent, 
root_objectid); 1857 } else { 1858 ret = insert_extent_data_ref(trans, root, path, bytenr, 1859 parent, root_objectid, 1860 owner, offset, refs_to_add); 1861 } 1862 return ret; 1863 } 1864 1865 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1866 struct btrfs_root *root, 1867 struct btrfs_path *path, 1868 struct btrfs_extent_inline_ref *iref, 1869 int refs_to_drop, int is_data, int *last_ref) 1870 { 1871 int ret = 0; 1872 1873 BUG_ON(!is_data && refs_to_drop != 1); 1874 if (iref) { 1875 update_inline_extent_backref(root, path, iref, 1876 -refs_to_drop, NULL, last_ref); 1877 } else if (is_data) { 1878 ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1879 last_ref); 1880 } else { 1881 *last_ref = 1; 1882 ret = btrfs_del_item(trans, root, path); 1883 } 1884 return ret; 1885 } 1886 1887 static int btrfs_issue_discard(struct block_device *bdev, 1888 u64 start, u64 len) 1889 { 1890 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); 1891 } 1892 1893 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1894 u64 num_bytes, u64 *actual_bytes) 1895 { 1896 int ret; 1897 u64 discarded_bytes = 0; 1898 struct btrfs_bio *bbio = NULL; 1899 1900 1901 /* Tell the block device(s) that the sectors can be discarded */ 1902 ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1903 bytenr, &num_bytes, &bbio, 0); 1904 /* Error condition is -ENOMEM */ 1905 if (!ret) { 1906 struct btrfs_bio_stripe *stripe = bbio->stripes; 1907 int i; 1908 1909 1910 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1911 if (!stripe->dev->can_discard) 1912 continue; 1913 1914 ret = btrfs_issue_discard(stripe->dev->bdev, 1915 stripe->physical, 1916 stripe->length); 1917 if (!ret) 1918 discarded_bytes += stripe->length; 1919 else if (ret != -EOPNOTSUPP) 1920 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 1921 1922 /* 1923 * Just in case we get back EOPNOTSUPP for some reason, 1924 * just ignore the return value so we don't screw up 1925 * people calling discard_extent. 
1926 */ 1927 ret = 0; 1928 } 1929 btrfs_put_bbio(bbio); 1930 } 1931 1932 if (actual_bytes) 1933 *actual_bytes = discarded_bytes; 1934 1935 1936 if (ret == -EOPNOTSUPP) 1937 ret = 0; 1938 return ret; 1939 } 1940 1941 /* Can return -ENOMEM */ 1942 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1943 struct btrfs_root *root, 1944 u64 bytenr, u64 num_bytes, u64 parent, 1945 u64 root_objectid, u64 owner, u64 offset, 1946 int no_quota) 1947 { 1948 int ret; 1949 struct btrfs_fs_info *fs_info = root->fs_info; 1950 1951 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1952 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1953 1954 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1955 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1956 num_bytes, 1957 parent, root_objectid, (int)owner, 1958 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1959 } else { 1960 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1961 num_bytes, 1962 parent, root_objectid, owner, offset, 1963 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1964 } 1965 return ret; 1966 } 1967 1968 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1969 struct btrfs_root *root, 1970 u64 bytenr, u64 num_bytes, 1971 u64 parent, u64 root_objectid, 1972 u64 owner, u64 offset, int refs_to_add, 1973 int no_quota, 1974 struct btrfs_delayed_extent_op *extent_op) 1975 { 1976 struct btrfs_fs_info *fs_info = root->fs_info; 1977 struct btrfs_path *path; 1978 struct extent_buffer *leaf; 1979 struct btrfs_extent_item *item; 1980 struct btrfs_key key; 1981 u64 refs; 1982 int ret; 1983 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; 1984 1985 path = btrfs_alloc_path(); 1986 if (!path) 1987 return -ENOMEM; 1988 1989 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) 1990 no_quota = 1; 1991 1992 path->reada = 1; 1993 path->leave_spinning = 1; 1994 /* this will setup the path even if it fails to insert the back ref */ 1995 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, 1996 bytenr, num_bytes, parent, 1997 root_objectid, owner, offset, 1998 refs_to_add, extent_op); 1999 if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) 2000 goto out; 2001 /* 2002 * Ok we were able to insert an inline extent and it appears to be a new 2003 * reference, deal with the qgroup accounting. 2004 */ 2005 if (!ret && !no_quota) { 2006 ASSERT(root->fs_info->quota_enabled); 2007 leaf = path->nodes[0]; 2008 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2009 item = btrfs_item_ptr(leaf, path->slots[0], 2010 struct btrfs_extent_item); 2011 if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) 2012 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2013 btrfs_release_path(path); 2014 2015 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2016 bytenr, num_bytes, type, 0); 2017 goto out; 2018 } 2019 2020 /* 2021 * Ok we had -EAGAIN which means we didn't have space to insert and 2022 * inline extent ref, so just update the reference count and add a 2023 * normal backref. 
2024 */ 2025 leaf = path->nodes[0]; 2026 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2027 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2028 refs = btrfs_extent_refs(leaf, item); 2029 if (refs) 2030 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2031 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2032 if (extent_op) 2033 __run_delayed_extent_op(extent_op, leaf, item); 2034 2035 btrfs_mark_buffer_dirty(leaf); 2036 btrfs_release_path(path); 2037 2038 if (!no_quota) { 2039 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2040 bytenr, num_bytes, type, 0); 2041 if (ret) 2042 goto out; 2043 } 2044 2045 path->reada = 1; 2046 path->leave_spinning = 1; 2047 /* now insert the actual backref */ 2048 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2049 path, bytenr, parent, root_objectid, 2050 owner, offset, refs_to_add); 2051 if (ret) 2052 btrfs_abort_transaction(trans, root, ret); 2053 out: 2054 btrfs_free_path(path); 2055 return ret; 2056 } 2057 2058 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2059 struct btrfs_root *root, 2060 struct btrfs_delayed_ref_node *node, 2061 struct btrfs_delayed_extent_op *extent_op, 2062 int insert_reserved) 2063 { 2064 int ret = 0; 2065 struct btrfs_delayed_data_ref *ref; 2066 struct btrfs_key ins; 2067 u64 parent = 0; 2068 u64 ref_root = 0; 2069 u64 flags = 0; 2070 2071 ins.objectid = node->bytenr; 2072 ins.offset = node->num_bytes; 2073 ins.type = BTRFS_EXTENT_ITEM_KEY; 2074 2075 ref = btrfs_delayed_node_to_data_ref(node); 2076 trace_run_delayed_data_ref(node, ref, node->action); 2077 2078 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2079 parent = ref->parent; 2080 ref_root = ref->root; 2081 2082 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2083 if (extent_op) 2084 flags |= extent_op->flags_to_set; 2085 ret = alloc_reserved_file_extent(trans, root, 2086 parent, ref_root, flags, 2087 ref->objectid, ref->offset, 2088 &ins, node->ref_mod); 2089 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2090 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2091 node->num_bytes, parent, 2092 ref_root, ref->objectid, 2093 ref->offset, node->ref_mod, 2094 node->no_quota, extent_op); 2095 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2096 ret = __btrfs_free_extent(trans, root, node->bytenr, 2097 node->num_bytes, parent, 2098 ref_root, ref->objectid, 2099 ref->offset, node->ref_mod, 2100 extent_op, node->no_quota); 2101 } else { 2102 BUG(); 2103 } 2104 return ret; 2105 } 2106 2107 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2108 struct extent_buffer *leaf, 2109 struct btrfs_extent_item *ei) 2110 { 2111 u64 flags = btrfs_extent_flags(leaf, ei); 2112 if (extent_op->update_flags) { 2113 flags |= extent_op->flags_to_set; 2114 btrfs_set_extent_flags(leaf, ei, flags); 2115 } 2116 2117 if (extent_op->update_key) { 2118 struct btrfs_tree_block_info *bi; 2119 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2120 bi = (struct btrfs_tree_block_info *)(ei + 1); 2121 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2122 } 2123 } 2124 2125 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2126 struct btrfs_root *root, 2127 struct btrfs_delayed_ref_node *node, 2128 struct btrfs_delayed_extent_op *extent_op) 2129 { 2130 struct btrfs_key key; 2131 struct btrfs_path *path; 2132 struct btrfs_extent_item *ei; 2133 struct extent_buffer *leaf; 2134 u32 item_size; 2135 int ret; 2136 int err = 0; 2137 int metadata = !extent_op->is_data; 2138 
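	/*
	 * The extent item for a tree block can be keyed two ways: with the
	 * SKINNY_METADATA incompat feature set we look up a
	 * BTRFS_METADATA_ITEM_KEY whose offset is the block level, otherwise
	 * (and always for data) a BTRFS_EXTENT_ITEM_KEY whose offset is the
	 * byte size.  The search below tries the skinny key first and falls
	 * back to the full extent item key if that isn't found.
	 */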
2139 if (trans->aborted) 2140 return 0; 2141 2142 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2143 metadata = 0; 2144 2145 path = btrfs_alloc_path(); 2146 if (!path) 2147 return -ENOMEM; 2148 2149 key.objectid = node->bytenr; 2150 2151 if (metadata) { 2152 key.type = BTRFS_METADATA_ITEM_KEY; 2153 key.offset = extent_op->level; 2154 } else { 2155 key.type = BTRFS_EXTENT_ITEM_KEY; 2156 key.offset = node->num_bytes; 2157 } 2158 2159 again: 2160 path->reada = 1; 2161 path->leave_spinning = 1; 2162 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2163 path, 0, 1); 2164 if (ret < 0) { 2165 err = ret; 2166 goto out; 2167 } 2168 if (ret > 0) { 2169 if (metadata) { 2170 if (path->slots[0] > 0) { 2171 path->slots[0]--; 2172 btrfs_item_key_to_cpu(path->nodes[0], &key, 2173 path->slots[0]); 2174 if (key.objectid == node->bytenr && 2175 key.type == BTRFS_EXTENT_ITEM_KEY && 2176 key.offset == node->num_bytes) 2177 ret = 0; 2178 } 2179 if (ret > 0) { 2180 btrfs_release_path(path); 2181 metadata = 0; 2182 2183 key.objectid = node->bytenr; 2184 key.offset = node->num_bytes; 2185 key.type = BTRFS_EXTENT_ITEM_KEY; 2186 goto again; 2187 } 2188 } else { 2189 err = -EIO; 2190 goto out; 2191 } 2192 } 2193 2194 leaf = path->nodes[0]; 2195 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2196 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2197 if (item_size < sizeof(*ei)) { 2198 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2199 path, (u64)-1, 0); 2200 if (ret < 0) { 2201 err = ret; 2202 goto out; 2203 } 2204 leaf = path->nodes[0]; 2205 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2206 } 2207 #endif 2208 BUG_ON(item_size < sizeof(*ei)); 2209 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2210 __run_delayed_extent_op(extent_op, leaf, ei); 2211 2212 btrfs_mark_buffer_dirty(leaf); 2213 out: 2214 btrfs_free_path(path); 2215 return err; 2216 } 2217 2218 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2219 struct btrfs_root *root, 2220 struct btrfs_delayed_ref_node *node, 2221 struct btrfs_delayed_extent_op *extent_op, 2222 int insert_reserved) 2223 { 2224 int ret = 0; 2225 struct btrfs_delayed_tree_ref *ref; 2226 struct btrfs_key ins; 2227 u64 parent = 0; 2228 u64 ref_root = 0; 2229 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2230 SKINNY_METADATA); 2231 2232 ref = btrfs_delayed_node_to_tree_ref(node); 2233 trace_run_delayed_tree_ref(node, ref, node->action); 2234 2235 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2236 parent = ref->parent; 2237 ref_root = ref->root; 2238 2239 ins.objectid = node->bytenr; 2240 if (skinny_metadata) { 2241 ins.offset = ref->level; 2242 ins.type = BTRFS_METADATA_ITEM_KEY; 2243 } else { 2244 ins.offset = node->num_bytes; 2245 ins.type = BTRFS_EXTENT_ITEM_KEY; 2246 } 2247 2248 BUG_ON(node->ref_mod != 1); 2249 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2250 BUG_ON(!extent_op || !extent_op->update_flags); 2251 ret = alloc_reserved_tree_block(trans, root, 2252 parent, ref_root, 2253 extent_op->flags_to_set, 2254 &extent_op->key, 2255 ref->level, &ins, 2256 node->no_quota); 2257 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2258 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2259 node->num_bytes, parent, ref_root, 2260 ref->level, 0, 1, node->no_quota, 2261 extent_op); 2262 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2263 ret = __btrfs_free_extent(trans, root, node->bytenr, 2264 node->num_bytes, parent, ref_root, 2265 ref->level, 0, 1, 
						  extent_op, node->no_quota);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
{
	int ret = 0;

	if (trans->aborted) {
		if (insert_reserved)
			btrfs_pin_extent(root, node->bytenr,
					 node->num_bytes, 1);
		return 0;
	}

	if (btrfs_delayed_ref_is_head(node)) {
		struct btrfs_delayed_ref_head *head;
		/*
		 * We've hit the end of the chain and we were supposed
		 * to insert this extent into the tree.  But it got
		 * deleted before we ever needed to insert it, so all
		 * we have to do is clean up the accounting.
		 */
		BUG_ON(extent_op);
		head = btrfs_delayed_node_to_head(node);
		trace_run_delayed_ref_head(node, head, node->action);

		if (insert_reserved) {
			btrfs_pin_extent(root, node->bytenr,
					 node->num_bytes, 1);
			if (head->is_data) {
				ret = btrfs_del_csums(trans, root,
						      node->bytenr,
						      node->num_bytes);
			}
		}
		return ret;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, root, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, root, node, extent_op,
					   insert_reserved);
	else
		BUG();
	return ret;
}

static noinline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_node *ref, *last = NULL;

	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This prevents the ref count from going down to zero while
	 * there still are pending delayed refs.
	 */
	node = rb_first(&head->ref_root);
	while (node) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node,
				rb_node);
		if (ref->action == BTRFS_ADD_DELAYED_REF)
			return ref;
		else if (last == NULL)
			last = ref;
		node = rb_next(node);
	}
	return last;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2353 */ 2354 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2355 struct btrfs_root *root, 2356 unsigned long nr) 2357 { 2358 struct btrfs_delayed_ref_root *delayed_refs; 2359 struct btrfs_delayed_ref_node *ref; 2360 struct btrfs_delayed_ref_head *locked_ref = NULL; 2361 struct btrfs_delayed_extent_op *extent_op; 2362 struct btrfs_fs_info *fs_info = root->fs_info; 2363 ktime_t start = ktime_get(); 2364 int ret; 2365 unsigned long count = 0; 2366 unsigned long actual_count = 0; 2367 int must_insert_reserved = 0; 2368 2369 delayed_refs = &trans->transaction->delayed_refs; 2370 while (1) { 2371 if (!locked_ref) { 2372 if (count >= nr) 2373 break; 2374 2375 spin_lock(&delayed_refs->lock); 2376 locked_ref = btrfs_select_ref_head(trans); 2377 if (!locked_ref) { 2378 spin_unlock(&delayed_refs->lock); 2379 break; 2380 } 2381 2382 /* grab the lock that says we are going to process 2383 * all the refs for this head */ 2384 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2385 spin_unlock(&delayed_refs->lock); 2386 /* 2387 * we may have dropped the spin lock to get the head 2388 * mutex lock, and that might have given someone else 2389 * time to free the head. If that's true, it has been 2390 * removed from our list and we can move on. 2391 */ 2392 if (ret == -EAGAIN) { 2393 locked_ref = NULL; 2394 count++; 2395 continue; 2396 } 2397 } 2398 2399 /* 2400 * We need to try and merge add/drops of the same ref since we 2401 * can run into issues with relocate dropping the implicit ref 2402 * and then it being added back again before the drop can 2403 * finish. If we merged anything we need to re-loop so we can 2404 * get a good ref. 2405 */ 2406 spin_lock(&locked_ref->lock); 2407 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2408 locked_ref); 2409 2410 /* 2411 * locked_ref is the head node, so we have to go one 2412 * node back for any delayed ref updates 2413 */ 2414 ref = select_delayed_ref(locked_ref); 2415 2416 if (ref && ref->seq && 2417 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2418 spin_unlock(&locked_ref->lock); 2419 btrfs_delayed_ref_unlock(locked_ref); 2420 spin_lock(&delayed_refs->lock); 2421 locked_ref->processing = 0; 2422 delayed_refs->num_heads_ready++; 2423 spin_unlock(&delayed_refs->lock); 2424 locked_ref = NULL; 2425 cond_resched(); 2426 count++; 2427 continue; 2428 } 2429 2430 /* 2431 * record the must insert reserved flag before we 2432 * drop the spin lock. 2433 */ 2434 must_insert_reserved = locked_ref->must_insert_reserved; 2435 locked_ref->must_insert_reserved = 0; 2436 2437 extent_op = locked_ref->extent_op; 2438 locked_ref->extent_op = NULL; 2439 2440 if (!ref) { 2441 2442 2443 /* All delayed refs have been processed, Go ahead 2444 * and send the head node to run_one_delayed_ref, 2445 * so that any accounting fixes can happen 2446 */ 2447 ref = &locked_ref->node; 2448 2449 if (extent_op && must_insert_reserved) { 2450 btrfs_free_delayed_extent_op(extent_op); 2451 extent_op = NULL; 2452 } 2453 2454 if (extent_op) { 2455 spin_unlock(&locked_ref->lock); 2456 ret = run_delayed_extent_op(trans, root, 2457 ref, extent_op); 2458 btrfs_free_delayed_extent_op(extent_op); 2459 2460 if (ret) { 2461 /* 2462 * Need to reset must_insert_reserved if 2463 * there was an error so the abort stuff 2464 * can cleanup the reserved space 2465 * properly. 
2466 */ 2467 if (must_insert_reserved) 2468 locked_ref->must_insert_reserved = 1; 2469 locked_ref->processing = 0; 2470 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2471 btrfs_delayed_ref_unlock(locked_ref); 2472 return ret; 2473 } 2474 continue; 2475 } 2476 2477 /* 2478 * Need to drop our head ref lock and re-aqcuire the 2479 * delayed ref lock and then re-check to make sure 2480 * nobody got added. 2481 */ 2482 spin_unlock(&locked_ref->lock); 2483 spin_lock(&delayed_refs->lock); 2484 spin_lock(&locked_ref->lock); 2485 if (rb_first(&locked_ref->ref_root) || 2486 locked_ref->extent_op) { 2487 spin_unlock(&locked_ref->lock); 2488 spin_unlock(&delayed_refs->lock); 2489 continue; 2490 } 2491 ref->in_tree = 0; 2492 delayed_refs->num_heads--; 2493 rb_erase(&locked_ref->href_node, 2494 &delayed_refs->href_root); 2495 spin_unlock(&delayed_refs->lock); 2496 } else { 2497 actual_count++; 2498 ref->in_tree = 0; 2499 rb_erase(&ref->rb_node, &locked_ref->ref_root); 2500 } 2501 atomic_dec(&delayed_refs->num_entries); 2502 2503 if (!btrfs_delayed_ref_is_head(ref)) { 2504 /* 2505 * when we play the delayed ref, also correct the 2506 * ref_mod on head 2507 */ 2508 switch (ref->action) { 2509 case BTRFS_ADD_DELAYED_REF: 2510 case BTRFS_ADD_DELAYED_EXTENT: 2511 locked_ref->node.ref_mod -= ref->ref_mod; 2512 break; 2513 case BTRFS_DROP_DELAYED_REF: 2514 locked_ref->node.ref_mod += ref->ref_mod; 2515 break; 2516 default: 2517 WARN_ON(1); 2518 } 2519 } 2520 spin_unlock(&locked_ref->lock); 2521 2522 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2523 must_insert_reserved); 2524 2525 btrfs_free_delayed_extent_op(extent_op); 2526 if (ret) { 2527 locked_ref->processing = 0; 2528 btrfs_delayed_ref_unlock(locked_ref); 2529 btrfs_put_delayed_ref(ref); 2530 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2531 return ret; 2532 } 2533 2534 /* 2535 * If this node is a head, that means all the refs in this head 2536 * have been dealt with, and we will pick the next head to deal 2537 * with, so we must unlock the head and drop it from the cluster 2538 * list before we release it. 2539 */ 2540 if (btrfs_delayed_ref_is_head(ref)) { 2541 btrfs_delayed_ref_unlock(locked_ref); 2542 locked_ref = NULL; 2543 } 2544 btrfs_put_delayed_ref(ref); 2545 count++; 2546 cond_resched(); 2547 } 2548 2549 /* 2550 * We don't want to include ref heads since we can have empty ref heads 2551 * and those will drastically skew our runtime down since we just do 2552 * accounting, no actual extent tree updates. 2553 */ 2554 if (actual_count > 0) { 2555 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2556 u64 avg; 2557 2558 /* 2559 * We weigh the current average higher than our current runtime 2560 * to avoid large swings in the average. 2561 */ 2562 spin_lock(&delayed_refs->lock); 2563 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2564 avg = div64_u64(avg, 4); 2565 fs_info->avg_delayed_ref_runtime = avg; 2566 spin_unlock(&delayed_refs->lock); 2567 } 2568 return 0; 2569 } 2570 2571 #ifdef SCRAMBLE_DELAYED_REFS 2572 /* 2573 * Normally delayed refs get processed in ascending bytenr order. This 2574 * correlates in most cases to the order added. 
 * To expose dependencies on this order, we start to process the tree in the
 * middle instead of the beginning.
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
{
	u64 num_bytes;

	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
			     sizeof(struct btrfs_extent_inline_ref));
	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
		num_bytes += heads * sizeof(struct btrfs_tree_block_info);

	/*
	 * We don't ever fill up leaves all the way, so multiply by 2 just to
	 * be closer to what we're really going to want to use.
	 */
	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
}

int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
{
	struct btrfs_block_rsv *global_rsv;
	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
	u64 num_bytes;
	int ret = 0;

	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	num_heads = heads_to_leaves(root, num_heads);
	if (num_heads > 1)
		num_bytes += (num_heads - 1) * root->nodesize;
	num_bytes <<= 1;
	global_rsv = &root->fs_info->global_block_rsv;

	/*
	 * If we can't allocate any more chunks, let's make sure we have
	 * _lots_ of wiggle room since running delayed refs can create more
	 * delayed refs.
2648 */ 2649 if (global_rsv->space_info->full) 2650 num_bytes <<= 1; 2651 2652 spin_lock(&global_rsv->lock); 2653 if (global_rsv->reserved <= num_bytes) 2654 ret = 1; 2655 spin_unlock(&global_rsv->lock); 2656 return ret; 2657 } 2658 2659 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2660 struct btrfs_root *root) 2661 { 2662 struct btrfs_fs_info *fs_info = root->fs_info; 2663 u64 num_entries = 2664 atomic_read(&trans->transaction->delayed_refs.num_entries); 2665 u64 avg_runtime; 2666 u64 val; 2667 2668 smp_mb(); 2669 avg_runtime = fs_info->avg_delayed_ref_runtime; 2670 val = num_entries * avg_runtime; 2671 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2672 return 1; 2673 if (val >= NSEC_PER_SEC / 2) 2674 return 2; 2675 2676 return btrfs_check_space_for_delayed_refs(trans, root); 2677 } 2678 2679 struct async_delayed_refs { 2680 struct btrfs_root *root; 2681 int count; 2682 int error; 2683 int sync; 2684 struct completion wait; 2685 struct btrfs_work work; 2686 }; 2687 2688 static void delayed_ref_async_start(struct btrfs_work *work) 2689 { 2690 struct async_delayed_refs *async; 2691 struct btrfs_trans_handle *trans; 2692 int ret; 2693 2694 async = container_of(work, struct async_delayed_refs, work); 2695 2696 trans = btrfs_join_transaction(async->root); 2697 if (IS_ERR(trans)) { 2698 async->error = PTR_ERR(trans); 2699 goto done; 2700 } 2701 2702 /* 2703 * trans->sync means that when we call end_transaciton, we won't 2704 * wait on delayed refs 2705 */ 2706 trans->sync = true; 2707 ret = btrfs_run_delayed_refs(trans, async->root, async->count); 2708 if (ret) 2709 async->error = ret; 2710 2711 ret = btrfs_end_transaction(trans, async->root); 2712 if (ret && !async->error) 2713 async->error = ret; 2714 done: 2715 if (async->sync) 2716 complete(&async->wait); 2717 else 2718 kfree(async); 2719 } 2720 2721 int btrfs_async_run_delayed_refs(struct btrfs_root *root, 2722 unsigned long count, int wait) 2723 { 2724 struct async_delayed_refs *async; 2725 int ret; 2726 2727 async = kmalloc(sizeof(*async), GFP_NOFS); 2728 if (!async) 2729 return -ENOMEM; 2730 2731 async->root = root->fs_info->tree_root; 2732 async->count = count; 2733 async->error = 0; 2734 if (wait) 2735 async->sync = 1; 2736 else 2737 async->sync = 0; 2738 init_completion(&async->wait); 2739 2740 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2741 delayed_ref_async_start, NULL, NULL); 2742 2743 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2744 2745 if (wait) { 2746 wait_for_completion(&async->wait); 2747 ret = async->error; 2748 kfree(async); 2749 return ret; 2750 } 2751 return 0; 2752 } 2753 2754 /* 2755 * this starts processing the delayed reference count updates and 2756 * extent insertions we have queued up so far. count can be 2757 * 0, which means to process everything in the tree at the start 2758 * of the run (but not newly added entries), or it can be some target 2759 * number you'd like to process. 
2760 * 2761 * Returns 0 on success or if called with an aborted transaction 2762 * Returns <0 on error and aborts the transaction 2763 */ 2764 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2765 struct btrfs_root *root, unsigned long count) 2766 { 2767 struct rb_node *node; 2768 struct btrfs_delayed_ref_root *delayed_refs; 2769 struct btrfs_delayed_ref_head *head; 2770 int ret; 2771 int run_all = count == (unsigned long)-1; 2772 2773 /* We'll clean this up in btrfs_cleanup_transaction */ 2774 if (trans->aborted) 2775 return 0; 2776 2777 if (root == root->fs_info->extent_root) 2778 root = root->fs_info->tree_root; 2779 2780 delayed_refs = &trans->transaction->delayed_refs; 2781 if (count == 0) 2782 count = atomic_read(&delayed_refs->num_entries) * 2; 2783 2784 again: 2785 #ifdef SCRAMBLE_DELAYED_REFS 2786 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2787 #endif 2788 ret = __btrfs_run_delayed_refs(trans, root, count); 2789 if (ret < 0) { 2790 btrfs_abort_transaction(trans, root, ret); 2791 return ret; 2792 } 2793 2794 if (run_all) { 2795 if (!list_empty(&trans->new_bgs)) 2796 btrfs_create_pending_block_groups(trans, root); 2797 2798 spin_lock(&delayed_refs->lock); 2799 node = rb_first(&delayed_refs->href_root); 2800 if (!node) { 2801 spin_unlock(&delayed_refs->lock); 2802 goto out; 2803 } 2804 count = (unsigned long)-1; 2805 2806 while (node) { 2807 head = rb_entry(node, struct btrfs_delayed_ref_head, 2808 href_node); 2809 if (btrfs_delayed_ref_is_head(&head->node)) { 2810 struct btrfs_delayed_ref_node *ref; 2811 2812 ref = &head->node; 2813 atomic_inc(&ref->refs); 2814 2815 spin_unlock(&delayed_refs->lock); 2816 /* 2817 * Mutex was contended, block until it's 2818 * released and try again 2819 */ 2820 mutex_lock(&head->mutex); 2821 mutex_unlock(&head->mutex); 2822 2823 btrfs_put_delayed_ref(ref); 2824 cond_resched(); 2825 goto again; 2826 } else { 2827 WARN_ON(1); 2828 } 2829 node = rb_next(node); 2830 } 2831 spin_unlock(&delayed_refs->lock); 2832 cond_resched(); 2833 goto again; 2834 } 2835 out: 2836 ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); 2837 if (ret) 2838 return ret; 2839 assert_qgroups_uptodate(trans); 2840 return 0; 2841 } 2842 2843 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2844 struct btrfs_root *root, 2845 u64 bytenr, u64 num_bytes, u64 flags, 2846 int level, int is_data) 2847 { 2848 struct btrfs_delayed_extent_op *extent_op; 2849 int ret; 2850 2851 extent_op = btrfs_alloc_delayed_extent_op(); 2852 if (!extent_op) 2853 return -ENOMEM; 2854 2855 extent_op->flags_to_set = flags; 2856 extent_op->update_flags = 1; 2857 extent_op->update_key = 0; 2858 extent_op->is_data = is_data ? 
1 : 0; 2859 extent_op->level = level; 2860 2861 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2862 num_bytes, extent_op); 2863 if (ret) 2864 btrfs_free_delayed_extent_op(extent_op); 2865 return ret; 2866 } 2867 2868 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2869 struct btrfs_root *root, 2870 struct btrfs_path *path, 2871 u64 objectid, u64 offset, u64 bytenr) 2872 { 2873 struct btrfs_delayed_ref_head *head; 2874 struct btrfs_delayed_ref_node *ref; 2875 struct btrfs_delayed_data_ref *data_ref; 2876 struct btrfs_delayed_ref_root *delayed_refs; 2877 struct rb_node *node; 2878 int ret = 0; 2879 2880 delayed_refs = &trans->transaction->delayed_refs; 2881 spin_lock(&delayed_refs->lock); 2882 head = btrfs_find_delayed_ref_head(trans, bytenr); 2883 if (!head) { 2884 spin_unlock(&delayed_refs->lock); 2885 return 0; 2886 } 2887 2888 if (!mutex_trylock(&head->mutex)) { 2889 atomic_inc(&head->node.refs); 2890 spin_unlock(&delayed_refs->lock); 2891 2892 btrfs_release_path(path); 2893 2894 /* 2895 * Mutex was contended, block until it's released and let 2896 * caller try again 2897 */ 2898 mutex_lock(&head->mutex); 2899 mutex_unlock(&head->mutex); 2900 btrfs_put_delayed_ref(&head->node); 2901 return -EAGAIN; 2902 } 2903 spin_unlock(&delayed_refs->lock); 2904 2905 spin_lock(&head->lock); 2906 node = rb_first(&head->ref_root); 2907 while (node) { 2908 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2909 node = rb_next(node); 2910 2911 /* If it's a shared ref we know a cross reference exists */ 2912 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2913 ret = 1; 2914 break; 2915 } 2916 2917 data_ref = btrfs_delayed_node_to_data_ref(ref); 2918 2919 /* 2920 * If our ref doesn't match the one we're currently looking at 2921 * then we have a cross reference. 
2922 */ 2923 if (data_ref->root != root->root_key.objectid || 2924 data_ref->objectid != objectid || 2925 data_ref->offset != offset) { 2926 ret = 1; 2927 break; 2928 } 2929 } 2930 spin_unlock(&head->lock); 2931 mutex_unlock(&head->mutex); 2932 return ret; 2933 } 2934 2935 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2936 struct btrfs_root *root, 2937 struct btrfs_path *path, 2938 u64 objectid, u64 offset, u64 bytenr) 2939 { 2940 struct btrfs_root *extent_root = root->fs_info->extent_root; 2941 struct extent_buffer *leaf; 2942 struct btrfs_extent_data_ref *ref; 2943 struct btrfs_extent_inline_ref *iref; 2944 struct btrfs_extent_item *ei; 2945 struct btrfs_key key; 2946 u32 item_size; 2947 int ret; 2948 2949 key.objectid = bytenr; 2950 key.offset = (u64)-1; 2951 key.type = BTRFS_EXTENT_ITEM_KEY; 2952 2953 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2954 if (ret < 0) 2955 goto out; 2956 BUG_ON(ret == 0); /* Corruption */ 2957 2958 ret = -ENOENT; 2959 if (path->slots[0] == 0) 2960 goto out; 2961 2962 path->slots[0]--; 2963 leaf = path->nodes[0]; 2964 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2965 2966 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2967 goto out; 2968 2969 ret = 1; 2970 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2971 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2972 if (item_size < sizeof(*ei)) { 2973 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2974 goto out; 2975 } 2976 #endif 2977 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2978 2979 if (item_size != sizeof(*ei) + 2980 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2981 goto out; 2982 2983 if (btrfs_extent_generation(leaf, ei) <= 2984 btrfs_root_last_snapshot(&root->root_item)) 2985 goto out; 2986 2987 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2988 if (btrfs_extent_inline_ref_type(leaf, iref) != 2989 BTRFS_EXTENT_DATA_REF_KEY) 2990 goto out; 2991 2992 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2993 if (btrfs_extent_refs(leaf, ei) != 2994 btrfs_extent_data_ref_count(leaf, ref) || 2995 btrfs_extent_data_ref_root(leaf, ref) != 2996 root->root_key.objectid || 2997 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2998 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2999 goto out; 3000 3001 ret = 0; 3002 out: 3003 return ret; 3004 } 3005 3006 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3007 struct btrfs_root *root, 3008 u64 objectid, u64 offset, u64 bytenr) 3009 { 3010 struct btrfs_path *path; 3011 int ret; 3012 int ret2; 3013 3014 path = btrfs_alloc_path(); 3015 if (!path) 3016 return -ENOENT; 3017 3018 do { 3019 ret = check_committed_ref(trans, root, path, objectid, 3020 offset, bytenr); 3021 if (ret && ret != -ENOENT) 3022 goto out; 3023 3024 ret2 = check_delayed_ref(trans, root, path, objectid, 3025 offset, bytenr); 3026 } while (ret2 == -EAGAIN); 3027 3028 if (ret2 && ret2 != -ENOENT) { 3029 ret = ret2; 3030 goto out; 3031 } 3032 3033 if (ret != -ENOENT || ret2 != -ENOENT) 3034 ret = 0; 3035 out: 3036 btrfs_free_path(path); 3037 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3038 WARN_ON(ret > 0); 3039 return ret; 3040 } 3041 3042 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3043 struct btrfs_root *root, 3044 struct extent_buffer *buf, 3045 int full_backref, int inc) 3046 { 3047 u64 bytenr; 3048 u64 num_bytes; 3049 u64 parent; 3050 u64 ref_root; 3051 u32 nritems; 3052 struct btrfs_key key; 3053 struct btrfs_file_extent_item 
*fi; 3054 int i; 3055 int level; 3056 int ret = 0; 3057 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3058 u64, u64, u64, u64, u64, u64, int); 3059 3060 3061 if (btrfs_test_is_dummy_root(root)) 3062 return 0; 3063 3064 ref_root = btrfs_header_owner(buf); 3065 nritems = btrfs_header_nritems(buf); 3066 level = btrfs_header_level(buf); 3067 3068 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3069 return 0; 3070 3071 if (inc) 3072 process_func = btrfs_inc_extent_ref; 3073 else 3074 process_func = btrfs_free_extent; 3075 3076 if (full_backref) 3077 parent = buf->start; 3078 else 3079 parent = 0; 3080 3081 for (i = 0; i < nritems; i++) { 3082 if (level == 0) { 3083 btrfs_item_key_to_cpu(buf, &key, i); 3084 if (key.type != BTRFS_EXTENT_DATA_KEY) 3085 continue; 3086 fi = btrfs_item_ptr(buf, i, 3087 struct btrfs_file_extent_item); 3088 if (btrfs_file_extent_type(buf, fi) == 3089 BTRFS_FILE_EXTENT_INLINE) 3090 continue; 3091 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3092 if (bytenr == 0) 3093 continue; 3094 3095 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3096 key.offset -= btrfs_file_extent_offset(buf, fi); 3097 ret = process_func(trans, root, bytenr, num_bytes, 3098 parent, ref_root, key.objectid, 3099 key.offset, 1); 3100 if (ret) 3101 goto fail; 3102 } else { 3103 bytenr = btrfs_node_blockptr(buf, i); 3104 num_bytes = root->nodesize; 3105 ret = process_func(trans, root, bytenr, num_bytes, 3106 parent, ref_root, level - 1, 0, 3107 1); 3108 if (ret) 3109 goto fail; 3110 } 3111 } 3112 return 0; 3113 fail: 3114 return ret; 3115 } 3116 3117 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3118 struct extent_buffer *buf, int full_backref) 3119 { 3120 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3121 } 3122 3123 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3124 struct extent_buffer *buf, int full_backref) 3125 { 3126 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3127 } 3128 3129 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3130 struct btrfs_root *root, 3131 struct btrfs_path *path, 3132 struct btrfs_block_group_cache *cache) 3133 { 3134 int ret; 3135 struct btrfs_root *extent_root = root->fs_info->extent_root; 3136 unsigned long bi; 3137 struct extent_buffer *leaf; 3138 3139 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3140 if (ret) { 3141 if (ret > 0) 3142 ret = -ENOENT; 3143 goto fail; 3144 } 3145 3146 leaf = path->nodes[0]; 3147 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3148 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3149 btrfs_mark_buffer_dirty(leaf); 3150 btrfs_release_path(path); 3151 fail: 3152 if (ret) 3153 btrfs_abort_transaction(trans, root, ret); 3154 return ret; 3155 3156 } 3157 3158 static struct btrfs_block_group_cache * 3159 next_block_group(struct btrfs_root *root, 3160 struct btrfs_block_group_cache *cache) 3161 { 3162 struct rb_node *node; 3163 3164 spin_lock(&root->fs_info->block_group_cache_lock); 3165 3166 /* If our block group was removed, we need a full search. 
*/ 3167 if (RB_EMPTY_NODE(&cache->cache_node)) { 3168 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3169 3170 spin_unlock(&root->fs_info->block_group_cache_lock); 3171 btrfs_put_block_group(cache); 3172 cache = btrfs_lookup_first_block_group(root->fs_info, 3173 next_bytenr); 3174 return cache; 3175 } 3176 node = rb_next(&cache->cache_node); 3177 btrfs_put_block_group(cache); 3178 if (node) { 3179 cache = rb_entry(node, struct btrfs_block_group_cache, 3180 cache_node); 3181 btrfs_get_block_group(cache); 3182 } else 3183 cache = NULL; 3184 spin_unlock(&root->fs_info->block_group_cache_lock); 3185 return cache; 3186 } 3187 3188 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3189 struct btrfs_trans_handle *trans, 3190 struct btrfs_path *path) 3191 { 3192 struct btrfs_root *root = block_group->fs_info->tree_root; 3193 struct inode *inode = NULL; 3194 u64 alloc_hint = 0; 3195 int dcs = BTRFS_DC_ERROR; 3196 int num_pages = 0; 3197 int retries = 0; 3198 int ret = 0; 3199 3200 /* 3201 * If this block group is smaller than 100 megs don't bother caching the 3202 * block group. 3203 */ 3204 if (block_group->key.offset < (100 * 1024 * 1024)) { 3205 spin_lock(&block_group->lock); 3206 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3207 spin_unlock(&block_group->lock); 3208 return 0; 3209 } 3210 3211 if (trans->aborted) 3212 return 0; 3213 again: 3214 inode = lookup_free_space_inode(root, block_group, path); 3215 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3216 ret = PTR_ERR(inode); 3217 btrfs_release_path(path); 3218 goto out; 3219 } 3220 3221 if (IS_ERR(inode)) { 3222 BUG_ON(retries); 3223 retries++; 3224 3225 if (block_group->ro) 3226 goto out_free; 3227 3228 ret = create_free_space_inode(root, trans, block_group, path); 3229 if (ret) 3230 goto out_free; 3231 goto again; 3232 } 3233 3234 /* We've already setup this transaction, go ahead and exit */ 3235 if (block_group->cache_generation == trans->transid && 3236 i_size_read(inode)) { 3237 dcs = BTRFS_DC_SETUP; 3238 goto out_put; 3239 } 3240 3241 /* 3242 * We want to set the generation to 0, that way if anything goes wrong 3243 * from here on out we know not to trust this cache when we load up next 3244 * time. 3245 */ 3246 BTRFS_I(inode)->generation = 0; 3247 ret = btrfs_update_inode(trans, root, inode); 3248 if (ret) { 3249 /* 3250 * So theoretically we could recover from this, simply set the 3251 * super cache generation to 0 so we know to invalidate the 3252 * cache, but then we'd have to keep track of the block groups 3253 * that fail this way so we know we _have_ to reset this cache 3254 * before the next commit or risk reading stale cache. So to 3255 * limit our exposure to horrible edge cases lets just abort the 3256 * transaction, this only happens in really bad situations 3257 * anyway. 3258 */ 3259 btrfs_abort_transaction(trans, root, ret); 3260 goto out_put; 3261 } 3262 WARN_ON(ret); 3263 3264 if (i_size_read(inode) > 0) { 3265 ret = btrfs_check_trunc_cache_free_space(root, 3266 &root->fs_info->global_block_rsv); 3267 if (ret) 3268 goto out_put; 3269 3270 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3271 if (ret) 3272 goto out_put; 3273 } 3274 3275 spin_lock(&block_group->lock); 3276 if (block_group->cached != BTRFS_CACHE_FINISHED || 3277 !btrfs_test_opt(root, SPACE_CACHE) || 3278 block_group->delalloc_bytes) { 3279 /* 3280 * don't bother trying to write stuff out _if_ 3281 * a) we're not cached, 3282 * b) we're with nospace_cache mount option. 
3283 */ 3284 dcs = BTRFS_DC_WRITTEN; 3285 spin_unlock(&block_group->lock); 3286 goto out_put; 3287 } 3288 spin_unlock(&block_group->lock); 3289 3290 /* 3291 * Try to preallocate enough space based on how big the block group is. 3292 * Keep in mind this has to include any pinned space which could end up 3293 * taking up quite a bit since it's not folded into the other space 3294 * cache. 3295 */ 3296 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3297 if (!num_pages) 3298 num_pages = 1; 3299 3300 num_pages *= 16; 3301 num_pages *= PAGE_CACHE_SIZE; 3302 3303 ret = btrfs_check_data_free_space(inode, num_pages); 3304 if (ret) 3305 goto out_put; 3306 3307 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3308 num_pages, num_pages, 3309 &alloc_hint); 3310 if (!ret) 3311 dcs = BTRFS_DC_SETUP; 3312 btrfs_free_reserved_data_space(inode, num_pages); 3313 3314 out_put: 3315 iput(inode); 3316 out_free: 3317 btrfs_release_path(path); 3318 out: 3319 spin_lock(&block_group->lock); 3320 if (!ret && dcs == BTRFS_DC_SETUP) 3321 block_group->cache_generation = trans->transid; 3322 block_group->disk_cache_state = dcs; 3323 spin_unlock(&block_group->lock); 3324 3325 return ret; 3326 } 3327 3328 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3329 struct btrfs_root *root) 3330 { 3331 struct btrfs_block_group_cache *cache, *tmp; 3332 struct btrfs_transaction *cur_trans = trans->transaction; 3333 struct btrfs_path *path; 3334 3335 if (list_empty(&cur_trans->dirty_bgs) || 3336 !btrfs_test_opt(root, SPACE_CACHE)) 3337 return 0; 3338 3339 path = btrfs_alloc_path(); 3340 if (!path) 3341 return -ENOMEM; 3342 3343 /* Could add new block groups, use _safe just in case */ 3344 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3345 dirty_list) { 3346 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3347 cache_save_setup(cache, trans, path); 3348 } 3349 3350 btrfs_free_path(path); 3351 return 0; 3352 } 3353 3354 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3355 struct btrfs_root *root) 3356 { 3357 struct btrfs_block_group_cache *cache; 3358 struct btrfs_transaction *cur_trans = trans->transaction; 3359 int ret = 0; 3360 struct btrfs_path *path; 3361 3362 if (list_empty(&cur_trans->dirty_bgs)) 3363 return 0; 3364 3365 path = btrfs_alloc_path(); 3366 if (!path) 3367 return -ENOMEM; 3368 3369 /* 3370 * We don't need the lock here since we are protected by the transaction 3371 * commit. We want to do the cache_save_setup first and then run the 3372 * delayed refs to make sure we have the best chance at doing this all 3373 * in one shot. 
3374 */ 3375 while (!list_empty(&cur_trans->dirty_bgs)) { 3376 cache = list_first_entry(&cur_trans->dirty_bgs, 3377 struct btrfs_block_group_cache, 3378 dirty_list); 3379 list_del_init(&cache->dirty_list); 3380 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3381 cache_save_setup(cache, trans, path); 3382 if (!ret) 3383 ret = btrfs_run_delayed_refs(trans, root, 3384 (unsigned long) -1); 3385 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) 3386 btrfs_write_out_cache(root, trans, cache, path); 3387 if (!ret) 3388 ret = write_one_cache_group(trans, root, path, cache); 3389 btrfs_put_block_group(cache); 3390 } 3391 3392 btrfs_free_path(path); 3393 return ret; 3394 } 3395 3396 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3397 { 3398 struct btrfs_block_group_cache *block_group; 3399 int readonly = 0; 3400 3401 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3402 if (!block_group || block_group->ro) 3403 readonly = 1; 3404 if (block_group) 3405 btrfs_put_block_group(block_group); 3406 return readonly; 3407 } 3408 3409 static const char *alloc_name(u64 flags) 3410 { 3411 switch (flags) { 3412 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3413 return "mixed"; 3414 case BTRFS_BLOCK_GROUP_METADATA: 3415 return "metadata"; 3416 case BTRFS_BLOCK_GROUP_DATA: 3417 return "data"; 3418 case BTRFS_BLOCK_GROUP_SYSTEM: 3419 return "system"; 3420 default: 3421 WARN_ON(1); 3422 return "invalid-combination"; 3423 }; 3424 } 3425 3426 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3427 u64 total_bytes, u64 bytes_used, 3428 struct btrfs_space_info **space_info) 3429 { 3430 struct btrfs_space_info *found; 3431 int i; 3432 int factor; 3433 int ret; 3434 3435 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3436 BTRFS_BLOCK_GROUP_RAID10)) 3437 factor = 2; 3438 else 3439 factor = 1; 3440 3441 found = __find_space_info(info, flags); 3442 if (found) { 3443 spin_lock(&found->lock); 3444 found->total_bytes += total_bytes; 3445 found->disk_total += total_bytes * factor; 3446 found->bytes_used += bytes_used; 3447 found->disk_used += bytes_used * factor; 3448 found->full = 0; 3449 spin_unlock(&found->lock); 3450 *space_info = found; 3451 return 0; 3452 } 3453 found = kzalloc(sizeof(*found), GFP_NOFS); 3454 if (!found) 3455 return -ENOMEM; 3456 3457 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3458 if (ret) { 3459 kfree(found); 3460 return ret; 3461 } 3462 3463 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3464 INIT_LIST_HEAD(&found->block_groups[i]); 3465 init_rwsem(&found->groups_sem); 3466 spin_lock_init(&found->lock); 3467 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3468 found->total_bytes = total_bytes; 3469 found->disk_total = total_bytes * factor; 3470 found->bytes_used = bytes_used; 3471 found->disk_used = bytes_used * factor; 3472 found->bytes_pinned = 0; 3473 found->bytes_reserved = 0; 3474 found->bytes_readonly = 0; 3475 found->bytes_may_use = 0; 3476 found->full = 0; 3477 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3478 found->chunk_alloc = 0; 3479 found->flush = 0; 3480 init_waitqueue_head(&found->wait); 3481 INIT_LIST_HEAD(&found->ro_bgs); 3482 3483 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3484 info->space_info_kobj, "%s", 3485 alloc_name(found->flags)); 3486 if (ret) { 3487 kfree(found); 3488 return ret; 3489 } 3490 3491 *space_info = found; 3492 list_add_rcu(&found->list, &info->space_info); 3493 if (flags & BTRFS_BLOCK_GROUP_DATA) 3494 info->data_sinfo = found; 3495 3496 return 
ret; 3497 } 3498 3499 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3500 { 3501 u64 extra_flags = chunk_to_extended(flags) & 3502 BTRFS_EXTENDED_PROFILE_MASK; 3503 3504 write_seqlock(&fs_info->profiles_lock); 3505 if (flags & BTRFS_BLOCK_GROUP_DATA) 3506 fs_info->avail_data_alloc_bits |= extra_flags; 3507 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3508 fs_info->avail_metadata_alloc_bits |= extra_flags; 3509 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3510 fs_info->avail_system_alloc_bits |= extra_flags; 3511 write_sequnlock(&fs_info->profiles_lock); 3512 } 3513 3514 /* 3515 * returns target flags in extended format or 0 if restripe for this 3516 * chunk_type is not in progress 3517 * 3518 * should be called with either volume_mutex or balance_lock held 3519 */ 3520 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3521 { 3522 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3523 u64 target = 0; 3524 3525 if (!bctl) 3526 return 0; 3527 3528 if (flags & BTRFS_BLOCK_GROUP_DATA && 3529 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3530 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3531 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3532 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3533 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3534 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3535 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3536 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3537 } 3538 3539 return target; 3540 } 3541 3542 /* 3543 * @flags: available profiles in extended format (see ctree.h) 3544 * 3545 * Returns reduced profile in chunk format. If profile changing is in 3546 * progress (either running or paused) picks the target profile (if it's 3547 * already available), otherwise falls back to plain reducing. 
3548 */ 3549 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3550 { 3551 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3552 u64 target; 3553 u64 tmp; 3554 3555 /* 3556 * see if restripe for this chunk_type is in progress, if so 3557 * try to reduce to the target profile 3558 */ 3559 spin_lock(&root->fs_info->balance_lock); 3560 target = get_restripe_target(root->fs_info, flags); 3561 if (target) { 3562 /* pick target profile only if it's already available */ 3563 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3564 spin_unlock(&root->fs_info->balance_lock); 3565 return extended_to_chunk(target); 3566 } 3567 } 3568 spin_unlock(&root->fs_info->balance_lock); 3569 3570 /* First, mask out the RAID levels which aren't possible */ 3571 if (num_devices == 1) 3572 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3573 BTRFS_BLOCK_GROUP_RAID5); 3574 if (num_devices < 3) 3575 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3576 if (num_devices < 4) 3577 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3578 3579 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3580 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3581 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3582 flags &= ~tmp; 3583 3584 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3585 tmp = BTRFS_BLOCK_GROUP_RAID6; 3586 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3587 tmp = BTRFS_BLOCK_GROUP_RAID5; 3588 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3589 tmp = BTRFS_BLOCK_GROUP_RAID10; 3590 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3591 tmp = BTRFS_BLOCK_GROUP_RAID1; 3592 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3593 tmp = BTRFS_BLOCK_GROUP_RAID0; 3594 3595 return extended_to_chunk(flags | tmp); 3596 } 3597 3598 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 3599 { 3600 unsigned seq; 3601 u64 flags; 3602 3603 do { 3604 flags = orig_flags; 3605 seq = read_seqbegin(&root->fs_info->profiles_lock); 3606 3607 if (flags & BTRFS_BLOCK_GROUP_DATA) 3608 flags |= root->fs_info->avail_data_alloc_bits; 3609 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3610 flags |= root->fs_info->avail_system_alloc_bits; 3611 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3612 flags |= root->fs_info->avail_metadata_alloc_bits; 3613 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3614 3615 return btrfs_reduce_alloc_profile(root, flags); 3616 } 3617 3618 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3619 { 3620 u64 flags; 3621 u64 ret; 3622 3623 if (data) 3624 flags = BTRFS_BLOCK_GROUP_DATA; 3625 else if (root == root->fs_info->chunk_root) 3626 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3627 else 3628 flags = BTRFS_BLOCK_GROUP_METADATA; 3629 3630 ret = get_alloc_profile(root, flags); 3631 return ret; 3632 } 3633 3634 /* 3635 * This will check the space that the inode allocates from to make sure we have 3636 * enough space for bytes. 
3637 */ 3638 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3639 { 3640 struct btrfs_space_info *data_sinfo; 3641 struct btrfs_root *root = BTRFS_I(inode)->root; 3642 struct btrfs_fs_info *fs_info = root->fs_info; 3643 u64 used; 3644 int ret = 0, committed = 0, alloc_chunk = 1; 3645 3646 /* make sure bytes are sectorsize aligned */ 3647 bytes = ALIGN(bytes, root->sectorsize); 3648 3649 if (btrfs_is_free_space_inode(inode)) { 3650 committed = 1; 3651 ASSERT(current->journal_info); 3652 } 3653 3654 data_sinfo = fs_info->data_sinfo; 3655 if (!data_sinfo) 3656 goto alloc; 3657 3658 again: 3659 /* make sure we have enough space to handle the data first */ 3660 spin_lock(&data_sinfo->lock); 3661 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3662 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3663 data_sinfo->bytes_may_use; 3664 3665 if (used + bytes > data_sinfo->total_bytes) { 3666 struct btrfs_trans_handle *trans; 3667 3668 /* 3669 * if we don't have enough free bytes in this space then we need 3670 * to alloc a new chunk. 3671 */ 3672 if (!data_sinfo->full && alloc_chunk) { 3673 u64 alloc_target; 3674 3675 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3676 spin_unlock(&data_sinfo->lock); 3677 alloc: 3678 alloc_target = btrfs_get_alloc_profile(root, 1); 3679 /* 3680 * It is ugly that we don't call nolock join 3681 * transaction for the free space inode case here. 3682 * But it is safe because we only do the data space 3683 * reservation for the free space cache in the 3684 * transaction context, the common join transaction 3685 * just increase the counter of the current transaction 3686 * handler, doesn't try to acquire the trans_lock of 3687 * the fs. 3688 */ 3689 trans = btrfs_join_transaction(root); 3690 if (IS_ERR(trans)) 3691 return PTR_ERR(trans); 3692 3693 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3694 alloc_target, 3695 CHUNK_ALLOC_NO_FORCE); 3696 btrfs_end_transaction(trans, root); 3697 if (ret < 0) { 3698 if (ret != -ENOSPC) 3699 return ret; 3700 else 3701 goto commit_trans; 3702 } 3703 3704 if (!data_sinfo) 3705 data_sinfo = fs_info->data_sinfo; 3706 3707 goto again; 3708 } 3709 3710 /* 3711 * If we don't have enough pinned space to deal with this 3712 * allocation don't bother committing the transaction. 3713 */ 3714 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3715 bytes) < 0) 3716 committed = 1; 3717 spin_unlock(&data_sinfo->lock); 3718 3719 /* commit the current transaction and try again */ 3720 commit_trans: 3721 if (!committed && 3722 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3723 committed = 1; 3724 3725 trans = btrfs_join_transaction(root); 3726 if (IS_ERR(trans)) 3727 return PTR_ERR(trans); 3728 ret = btrfs_commit_transaction(trans, root); 3729 if (ret) 3730 return ret; 3731 goto again; 3732 } 3733 3734 trace_btrfs_space_reservation(root->fs_info, 3735 "space_info:enospc", 3736 data_sinfo->flags, bytes, 1); 3737 return -ENOSPC; 3738 } 3739 data_sinfo->bytes_may_use += bytes; 3740 trace_btrfs_space_reservation(root->fs_info, "space_info", 3741 data_sinfo->flags, bytes, 1); 3742 spin_unlock(&data_sinfo->lock); 3743 3744 return 0; 3745 } 3746 3747 /* 3748 * Called if we need to clear a data reservation for this inode. 
3749 */ 3750 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3751 { 3752 struct btrfs_root *root = BTRFS_I(inode)->root; 3753 struct btrfs_space_info *data_sinfo; 3754 3755 /* make sure bytes are sectorsize aligned */ 3756 bytes = ALIGN(bytes, root->sectorsize); 3757 3758 data_sinfo = root->fs_info->data_sinfo; 3759 spin_lock(&data_sinfo->lock); 3760 WARN_ON(data_sinfo->bytes_may_use < bytes); 3761 data_sinfo->bytes_may_use -= bytes; 3762 trace_btrfs_space_reservation(root->fs_info, "space_info", 3763 data_sinfo->flags, bytes, 0); 3764 spin_unlock(&data_sinfo->lock); 3765 } 3766 3767 static void force_metadata_allocation(struct btrfs_fs_info *info) 3768 { 3769 struct list_head *head = &info->space_info; 3770 struct btrfs_space_info *found; 3771 3772 rcu_read_lock(); 3773 list_for_each_entry_rcu(found, head, list) { 3774 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3775 found->force_alloc = CHUNK_ALLOC_FORCE; 3776 } 3777 rcu_read_unlock(); 3778 } 3779 3780 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3781 { 3782 return (global->size << 1); 3783 } 3784 3785 static int should_alloc_chunk(struct btrfs_root *root, 3786 struct btrfs_space_info *sinfo, int force) 3787 { 3788 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3789 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3790 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3791 u64 thresh; 3792 3793 if (force == CHUNK_ALLOC_FORCE) 3794 return 1; 3795 3796 /* 3797 * We need to take into account the global rsv because for all intents 3798 * and purposes it's used space. Don't worry about locking the 3799 * global_rsv, it doesn't change except when the transaction commits. 3800 */ 3801 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3802 num_allocated += calc_global_rsv_need_space(global_rsv); 3803 3804 /* 3805 * in limited mode, we want to have some free space up to 3806 * about 1% of the FS size. 
3807 */ 3808 if (force == CHUNK_ALLOC_LIMITED) { 3809 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3810 thresh = max_t(u64, 64 * 1024 * 1024, 3811 div_factor_fine(thresh, 1)); 3812 3813 if (num_bytes - num_allocated < thresh) 3814 return 1; 3815 } 3816 3817 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3818 return 0; 3819 return 1; 3820 } 3821 3822 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3823 { 3824 u64 num_dev; 3825 3826 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3827 BTRFS_BLOCK_GROUP_RAID0 | 3828 BTRFS_BLOCK_GROUP_RAID5 | 3829 BTRFS_BLOCK_GROUP_RAID6)) 3830 num_dev = root->fs_info->fs_devices->rw_devices; 3831 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3832 num_dev = 2; 3833 else 3834 num_dev = 1; /* DUP or single */ 3835 3836 /* metadata for updaing devices and chunk tree */ 3837 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3838 } 3839 3840 static void check_system_chunk(struct btrfs_trans_handle *trans, 3841 struct btrfs_root *root, u64 type) 3842 { 3843 struct btrfs_space_info *info; 3844 u64 left; 3845 u64 thresh; 3846 3847 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3848 spin_lock(&info->lock); 3849 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3850 info->bytes_reserved - info->bytes_readonly; 3851 spin_unlock(&info->lock); 3852 3853 thresh = get_system_chunk_thresh(root, type); 3854 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3855 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3856 left, thresh, type); 3857 dump_space_info(info, 0, 0); 3858 } 3859 3860 if (left < thresh) { 3861 u64 flags; 3862 3863 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3864 btrfs_alloc_chunk(trans, root, flags); 3865 } 3866 } 3867 3868 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3869 struct btrfs_root *extent_root, u64 flags, int force) 3870 { 3871 struct btrfs_space_info *space_info; 3872 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3873 int wait_for_alloc = 0; 3874 int ret = 0; 3875 3876 /* Don't re-enter if we're already allocating a chunk */ 3877 if (trans->allocating_chunk) 3878 return -ENOSPC; 3879 3880 space_info = __find_space_info(extent_root->fs_info, flags); 3881 if (!space_info) { 3882 ret = update_space_info(extent_root->fs_info, flags, 3883 0, 0, &space_info); 3884 BUG_ON(ret); /* -ENOMEM */ 3885 } 3886 BUG_ON(!space_info); /* Logic error */ 3887 3888 again: 3889 spin_lock(&space_info->lock); 3890 if (force < space_info->force_alloc) 3891 force = space_info->force_alloc; 3892 if (space_info->full) { 3893 if (should_alloc_chunk(extent_root, space_info, force)) 3894 ret = -ENOSPC; 3895 else 3896 ret = 0; 3897 spin_unlock(&space_info->lock); 3898 return ret; 3899 } 3900 3901 if (!should_alloc_chunk(extent_root, space_info, force)) { 3902 spin_unlock(&space_info->lock); 3903 return 0; 3904 } else if (space_info->chunk_alloc) { 3905 wait_for_alloc = 1; 3906 } else { 3907 space_info->chunk_alloc = 1; 3908 } 3909 3910 spin_unlock(&space_info->lock); 3911 3912 mutex_lock(&fs_info->chunk_mutex); 3913 3914 /* 3915 * The chunk_mutex is held throughout the entirety of a chunk 3916 * allocation, so once we've acquired the chunk_mutex we know that the 3917 * other guy is done and we need to recheck and see if we should 3918 * allocate. 
3919 */ 3920 if (wait_for_alloc) { 3921 mutex_unlock(&fs_info->chunk_mutex); 3922 wait_for_alloc = 0; 3923 goto again; 3924 } 3925 3926 trans->allocating_chunk = true; 3927 3928 /* 3929 * If we have mixed data/metadata chunks we want to make sure we keep 3930 * allocating mixed chunks instead of individual chunks. 3931 */ 3932 if (btrfs_mixed_space_info(space_info)) 3933 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3934 3935 /* 3936 * if we're doing a data chunk, go ahead and make sure that 3937 * we keep a reasonable number of metadata chunks allocated in the 3938 * FS as well. 3939 */ 3940 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3941 fs_info->data_chunk_allocations++; 3942 if (!(fs_info->data_chunk_allocations % 3943 fs_info->metadata_ratio)) 3944 force_metadata_allocation(fs_info); 3945 } 3946 3947 /* 3948 * Check if we have enough space in SYSTEM chunk because we may need 3949 * to update devices. 3950 */ 3951 check_system_chunk(trans, extent_root, flags); 3952 3953 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3954 trans->allocating_chunk = false; 3955 3956 spin_lock(&space_info->lock); 3957 if (ret < 0 && ret != -ENOSPC) 3958 goto out; 3959 if (ret) 3960 space_info->full = 1; 3961 else 3962 ret = 1; 3963 3964 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3965 out: 3966 space_info->chunk_alloc = 0; 3967 spin_unlock(&space_info->lock); 3968 mutex_unlock(&fs_info->chunk_mutex); 3969 return ret; 3970 } 3971 3972 static int can_overcommit(struct btrfs_root *root, 3973 struct btrfs_space_info *space_info, u64 bytes, 3974 enum btrfs_reserve_flush_enum flush) 3975 { 3976 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3977 u64 profile = btrfs_get_alloc_profile(root, 0); 3978 u64 space_size; 3979 u64 avail; 3980 u64 used; 3981 3982 used = space_info->bytes_used + space_info->bytes_reserved + 3983 space_info->bytes_pinned + space_info->bytes_readonly; 3984 3985 /* 3986 * We only want to allow over committing if we have lots of actual space 3987 * free, but if we don't have enough space to handle the global reserve 3988 * space then we could end up having a real enospc problem when trying 3989 * to allocate a chunk or some other such important allocation. 3990 */ 3991 spin_lock(&global_rsv->lock); 3992 space_size = calc_global_rsv_need_space(global_rsv); 3993 spin_unlock(&global_rsv->lock); 3994 if (used + space_size >= space_info->total_bytes) 3995 return 0; 3996 3997 used += space_info->bytes_may_use; 3998 3999 spin_lock(&root->fs_info->free_chunk_lock); 4000 avail = root->fs_info->free_chunk_space; 4001 spin_unlock(&root->fs_info->free_chunk_lock); 4002 4003 /* 4004 * If we have dup, raid1 or raid10 then only half of the free 4005 * space is actually useable. For raid56, the space info used 4006 * doesn't include the parity drive, so we don't have to 4007 * change the math 4008 */ 4009 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4010 BTRFS_BLOCK_GROUP_RAID1 | 4011 BTRFS_BLOCK_GROUP_RAID10)) 4012 avail >>= 1; 4013 4014 /* 4015 * If we aren't flushing all things, let us overcommit up to 4016 * 1/2th of the space. If we can flush, don't let us overcommit 4017 * too much, let it overcommit up to 1/8 of the space. 
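 * For example, with 8GiB of unallocated device space and a RAID1
 * metadata profile, avail is halved to 4GiB above, and a FLUSH_ALL
 * caller may then overcommit by at most 4GiB >> 3 = 512MiB.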
4018 */ 4019 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4020 avail >>= 3; 4021 else 4022 avail >>= 1; 4023 4024 if (used + bytes < space_info->total_bytes + avail) 4025 return 1; 4026 return 0; 4027 } 4028 4029 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4030 unsigned long nr_pages, int nr_items) 4031 { 4032 struct super_block *sb = root->fs_info->sb; 4033 4034 if (down_read_trylock(&sb->s_umount)) { 4035 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4036 up_read(&sb->s_umount); 4037 } else { 4038 /* 4039 * We needn't worry the filesystem going from r/w to r/o though 4040 * we don't acquire ->s_umount mutex, because the filesystem 4041 * should guarantee the delalloc inodes list be empty after 4042 * the filesystem is readonly(all dirty pages are written to 4043 * the disk). 4044 */ 4045 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4046 if (!current->journal_info) 4047 btrfs_wait_ordered_roots(root->fs_info, nr_items); 4048 } 4049 } 4050 4051 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4052 { 4053 u64 bytes; 4054 int nr; 4055 4056 bytes = btrfs_calc_trans_metadata_size(root, 1); 4057 nr = (int)div64_u64(to_reclaim, bytes); 4058 if (!nr) 4059 nr = 1; 4060 return nr; 4061 } 4062 4063 #define EXTENT_SIZE_PER_ITEM (256 * 1024) 4064 4065 /* 4066 * shrink metadata reservation for delalloc 4067 */ 4068 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4069 bool wait_ordered) 4070 { 4071 struct btrfs_block_rsv *block_rsv; 4072 struct btrfs_space_info *space_info; 4073 struct btrfs_trans_handle *trans; 4074 u64 delalloc_bytes; 4075 u64 max_reclaim; 4076 long time_left; 4077 unsigned long nr_pages; 4078 int loops; 4079 int items; 4080 enum btrfs_reserve_flush_enum flush; 4081 4082 /* Calc the number of the pages we need flush for space reservation */ 4083 items = calc_reclaim_items_nr(root, to_reclaim); 4084 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4085 4086 trans = (struct btrfs_trans_handle *)current->journal_info; 4087 block_rsv = &root->fs_info->delalloc_block_rsv; 4088 space_info = block_rsv->space_info; 4089 4090 delalloc_bytes = percpu_counter_sum_positive( 4091 &root->fs_info->delalloc_bytes); 4092 if (delalloc_bytes == 0) { 4093 if (trans) 4094 return; 4095 if (wait_ordered) 4096 btrfs_wait_ordered_roots(root->fs_info, items); 4097 return; 4098 } 4099 4100 loops = 0; 4101 while (delalloc_bytes && loops < 3) { 4102 max_reclaim = min(delalloc_bytes, to_reclaim); 4103 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4104 btrfs_writeback_inodes_sb_nr(root, nr_pages, items); 4105 /* 4106 * We need to wait for the async pages to actually start before 4107 * we do anything. 
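 * Rather than waiting for every async page to complete, we only wait
 * until the async_delalloc_pages counter has dropped back by the
 * nr_pages we just kicked off (or has hit zero).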
4108 */ 4109 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); 4110 if (!max_reclaim) 4111 goto skip_async; 4112 4113 if (max_reclaim <= nr_pages) 4114 max_reclaim = 0; 4115 else 4116 max_reclaim -= nr_pages; 4117 4118 wait_event(root->fs_info->async_submit_wait, 4119 atomic_read(&root->fs_info->async_delalloc_pages) <= 4120 (int)max_reclaim); 4121 skip_async: 4122 if (!trans) 4123 flush = BTRFS_RESERVE_FLUSH_ALL; 4124 else 4125 flush = BTRFS_RESERVE_NO_FLUSH; 4126 spin_lock(&space_info->lock); 4127 if (can_overcommit(root, space_info, orig, flush)) { 4128 spin_unlock(&space_info->lock); 4129 break; 4130 } 4131 spin_unlock(&space_info->lock); 4132 4133 loops++; 4134 if (wait_ordered && !trans) { 4135 btrfs_wait_ordered_roots(root->fs_info, items); 4136 } else { 4137 time_left = schedule_timeout_killable(1); 4138 if (time_left) 4139 break; 4140 } 4141 delalloc_bytes = percpu_counter_sum_positive( 4142 &root->fs_info->delalloc_bytes); 4143 } 4144 } 4145 4146 /** 4147 * may_commit_transaction - possibly commit the transaction if it's ok to 4148 * @root - the root we're allocating for 4149 * @bytes - the number of bytes we want to reserve 4150 * @force - force the commit 4151 * 4152 * This will check to make sure that committing the transaction will actually 4153 * get us somewhere and then commit the transaction if it does. Otherwise it 4154 * will return -ENOSPC. 4155 */ 4156 static int may_commit_transaction(struct btrfs_root *root, 4157 struct btrfs_space_info *space_info, 4158 u64 bytes, int force) 4159 { 4160 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 4161 struct btrfs_trans_handle *trans; 4162 4163 trans = (struct btrfs_trans_handle *)current->journal_info; 4164 if (trans) 4165 return -EAGAIN; 4166 4167 if (force) 4168 goto commit; 4169 4170 /* See if there is enough pinned space to make this reservation */ 4171 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4172 bytes) >= 0) 4173 goto commit; 4174 4175 /* 4176 * See if there is some space in the delayed insertion reservation for 4177 * this reservation.
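 * The delayed-item reservation is given back once the delayed items are
 * run at commit time, so its size counts toward what a commit could
 * free up for this space info.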
4178 */ 4179 if (space_info != delayed_rsv->space_info) 4180 return -ENOSPC; 4181 4182 spin_lock(&delayed_rsv->lock); 4183 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4184 bytes - delayed_rsv->size) >= 0) { 4185 spin_unlock(&delayed_rsv->lock); 4186 return -ENOSPC; 4187 } 4188 spin_unlock(&delayed_rsv->lock); 4189 4190 commit: 4191 trans = btrfs_join_transaction(root); 4192 if (IS_ERR(trans)) 4193 return -ENOSPC; 4194 4195 return btrfs_commit_transaction(trans, root); 4196 } 4197 4198 enum flush_state { 4199 FLUSH_DELAYED_ITEMS_NR = 1, 4200 FLUSH_DELAYED_ITEMS = 2, 4201 FLUSH_DELALLOC = 3, 4202 FLUSH_DELALLOC_WAIT = 4, 4203 ALLOC_CHUNK = 5, 4204 COMMIT_TRANS = 6, 4205 }; 4206 4207 static int flush_space(struct btrfs_root *root, 4208 struct btrfs_space_info *space_info, u64 num_bytes, 4209 u64 orig_bytes, int state) 4210 { 4211 struct btrfs_trans_handle *trans; 4212 int nr; 4213 int ret = 0; 4214 4215 switch (state) { 4216 case FLUSH_DELAYED_ITEMS_NR: 4217 case FLUSH_DELAYED_ITEMS: 4218 if (state == FLUSH_DELAYED_ITEMS_NR) 4219 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4220 else 4221 nr = -1; 4222 4223 trans = btrfs_join_transaction(root); 4224 if (IS_ERR(trans)) { 4225 ret = PTR_ERR(trans); 4226 break; 4227 } 4228 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4229 btrfs_end_transaction(trans, root); 4230 break; 4231 case FLUSH_DELALLOC: 4232 case FLUSH_DELALLOC_WAIT: 4233 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4234 state == FLUSH_DELALLOC_WAIT); 4235 break; 4236 case ALLOC_CHUNK: 4237 trans = btrfs_join_transaction(root); 4238 if (IS_ERR(trans)) { 4239 ret = PTR_ERR(trans); 4240 break; 4241 } 4242 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4243 btrfs_get_alloc_profile(root, 0), 4244 CHUNK_ALLOC_NO_FORCE); 4245 btrfs_end_transaction(trans, root); 4246 if (ret == -ENOSPC) 4247 ret = 0; 4248 break; 4249 case COMMIT_TRANS: 4250 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4251 break; 4252 default: 4253 ret = -ENOSPC; 4254 break; 4255 } 4256 4257 return ret; 4258 } 4259 4260 static inline u64 4261 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4262 struct btrfs_space_info *space_info) 4263 { 4264 u64 used; 4265 u64 expected; 4266 u64 to_reclaim; 4267 4268 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, 4269 16 * 1024 * 1024); 4270 spin_lock(&space_info->lock); 4271 if (can_overcommit(root, space_info, to_reclaim, 4272 BTRFS_RESERVE_FLUSH_ALL)) { 4273 to_reclaim = 0; 4274 goto out; 4275 } 4276 4277 used = space_info->bytes_used + space_info->bytes_reserved + 4278 space_info->bytes_pinned + space_info->bytes_readonly + 4279 space_info->bytes_may_use; 4280 if (can_overcommit(root, space_info, 1024 * 1024, 4281 BTRFS_RESERVE_FLUSH_ALL)) 4282 expected = div_factor_fine(space_info->total_bytes, 95); 4283 else 4284 expected = div_factor_fine(space_info->total_bytes, 90); 4285 4286 if (used > expected) 4287 to_reclaim = used - expected; 4288 else 4289 to_reclaim = 0; 4290 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4291 space_info->bytes_reserved); 4292 out: 4293 spin_unlock(&space_info->lock); 4294 4295 return to_reclaim; 4296 } 4297 4298 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4299 struct btrfs_fs_info *fs_info, u64 used) 4300 { 4301 return (used >= div_factor_fine(space_info->total_bytes, 98) && 4302 !btrfs_fs_closing(fs_info) && 4303 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4304 } 4305 4306 static int btrfs_need_do_async_reclaim(struct 
btrfs_space_info *space_info, 4307 struct btrfs_fs_info *fs_info, 4308 int flush_state) 4309 { 4310 u64 used; 4311 4312 spin_lock(&space_info->lock); 4313 /* 4314 * We run out of space and have not got any free space via flush_space, 4315 * so don't bother doing async reclaim. 4316 */ 4317 if (flush_state > COMMIT_TRANS && space_info->full) { 4318 spin_unlock(&space_info->lock); 4319 return 0; 4320 } 4321 4322 used = space_info->bytes_used + space_info->bytes_reserved + 4323 space_info->bytes_pinned + space_info->bytes_readonly + 4324 space_info->bytes_may_use; 4325 if (need_do_async_reclaim(space_info, fs_info, used)) { 4326 spin_unlock(&space_info->lock); 4327 return 1; 4328 } 4329 spin_unlock(&space_info->lock); 4330 4331 return 0; 4332 } 4333 4334 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4335 { 4336 struct btrfs_fs_info *fs_info; 4337 struct btrfs_space_info *space_info; 4338 u64 to_reclaim; 4339 int flush_state; 4340 4341 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4342 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4343 4344 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4345 space_info); 4346 if (!to_reclaim) 4347 return; 4348 4349 flush_state = FLUSH_DELAYED_ITEMS_NR; 4350 do { 4351 flush_space(fs_info->fs_root, space_info, to_reclaim, 4352 to_reclaim, flush_state); 4353 flush_state++; 4354 if (!btrfs_need_do_async_reclaim(space_info, fs_info, 4355 flush_state)) 4356 return; 4357 } while (flush_state <= COMMIT_TRANS); 4358 4359 if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) 4360 queue_work(system_unbound_wq, work); 4361 } 4362 4363 void btrfs_init_async_reclaim_work(struct work_struct *work) 4364 { 4365 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 4366 } 4367 4368 /** 4369 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4370 * @root - the root we're allocating for 4371 * @block_rsv - the block_rsv we're allocating for 4372 * @orig_bytes - the number of bytes we want 4373 * @flush - whether or not we can flush to make our reservation 4374 * 4375 * This will reserve orgi_bytes number of bytes from the space info associated 4376 * with the block_rsv. If there is not enough space it will make an attempt to 4377 * flush out space to make room. It will do this by flushing delalloc if 4378 * possible or committing the transaction. If flush is 0 then no attempts to 4379 * regain reservations will be made and this will fail if there is not enough 4380 * space already. 4381 */ 4382 static int reserve_metadata_bytes(struct btrfs_root *root, 4383 struct btrfs_block_rsv *block_rsv, 4384 u64 orig_bytes, 4385 enum btrfs_reserve_flush_enum flush) 4386 { 4387 struct btrfs_space_info *space_info = block_rsv->space_info; 4388 u64 used; 4389 u64 num_bytes = orig_bytes; 4390 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4391 int ret = 0; 4392 bool flushing = false; 4393 4394 again: 4395 ret = 0; 4396 spin_lock(&space_info->lock); 4397 /* 4398 * We only want to wait if somebody other than us is flushing and we 4399 * are actually allowed to flush all things. 4400 */ 4401 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4402 space_info->flush) { 4403 spin_unlock(&space_info->lock); 4404 /* 4405 * If we have a trans handle we can't wait because the flusher 4406 * may have to commit the transaction, which would mean we would 4407 * deadlock since we are waiting for the flusher to finish, but 4408 * hold the current transaction open. 
4409 */ 4410 if (current->journal_info) 4411 return -EAGAIN; 4412 ret = wait_event_killable(space_info->wait, !space_info->flush); 4413 /* Must have been killed, return */ 4414 if (ret) 4415 return -EINTR; 4416 4417 spin_lock(&space_info->lock); 4418 } 4419 4420 ret = -ENOSPC; 4421 used = space_info->bytes_used + space_info->bytes_reserved + 4422 space_info->bytes_pinned + space_info->bytes_readonly + 4423 space_info->bytes_may_use; 4424 4425 /* 4426 * The idea here is that we've not already over-reserved the block group 4427 * then we can go ahead and save our reservation first and then start 4428 * flushing if we need to. Otherwise if we've already overcommitted 4429 * lets start flushing stuff first and then come back and try to make 4430 * our reservation. 4431 */ 4432 if (used <= space_info->total_bytes) { 4433 if (used + orig_bytes <= space_info->total_bytes) { 4434 space_info->bytes_may_use += orig_bytes; 4435 trace_btrfs_space_reservation(root->fs_info, 4436 "space_info", space_info->flags, orig_bytes, 1); 4437 ret = 0; 4438 } else { 4439 /* 4440 * Ok set num_bytes to orig_bytes since we aren't 4441 * overocmmitted, this way we only try and reclaim what 4442 * we need. 4443 */ 4444 num_bytes = orig_bytes; 4445 } 4446 } else { 4447 /* 4448 * Ok we're over committed, set num_bytes to the overcommitted 4449 * amount plus the amount of bytes that we need for this 4450 * reservation. 4451 */ 4452 num_bytes = used - space_info->total_bytes + 4453 (orig_bytes * 2); 4454 } 4455 4456 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4457 space_info->bytes_may_use += orig_bytes; 4458 trace_btrfs_space_reservation(root->fs_info, "space_info", 4459 space_info->flags, orig_bytes, 4460 1); 4461 ret = 0; 4462 } 4463 4464 /* 4465 * Couldn't make our reservation, save our place so while we're trying 4466 * to reclaim space we can actually use it instead of somebody else 4467 * stealing it from us. 4468 * 4469 * We make the other tasks wait for the flush only when we can flush 4470 * all things. 4471 */ 4472 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4473 flushing = true; 4474 space_info->flush = 1; 4475 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 4476 used += orig_bytes; 4477 /* 4478 * We will do the space reservation dance during log replay, 4479 * which means we won't have fs_info->fs_root set, so don't do 4480 * the async reclaim as we will panic. 4481 */ 4482 if (!root->fs_info->log_root_recovering && 4483 need_do_async_reclaim(space_info, root->fs_info, used) && 4484 !work_busy(&root->fs_info->async_reclaim_work)) 4485 queue_work(system_unbound_wq, 4486 &root->fs_info->async_reclaim_work); 4487 } 4488 spin_unlock(&space_info->lock); 4489 4490 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4491 goto out; 4492 4493 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4494 flush_state); 4495 flush_state++; 4496 4497 /* 4498 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4499 * would happen. So skip delalloc flush. 
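 * FLUSH_LIMIT callers may already be inside paths that delalloc
 * writeback itself depends on, so jump straight to ALLOC_CHUNK
 * instead of the two delalloc flush states.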
4500 */ 4501 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4502 (flush_state == FLUSH_DELALLOC || 4503 flush_state == FLUSH_DELALLOC_WAIT)) 4504 flush_state = ALLOC_CHUNK; 4505 4506 if (!ret) 4507 goto again; 4508 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4509 flush_state < COMMIT_TRANS) 4510 goto again; 4511 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4512 flush_state <= COMMIT_TRANS) 4513 goto again; 4514 4515 out: 4516 if (ret == -ENOSPC && 4517 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4518 struct btrfs_block_rsv *global_rsv = 4519 &root->fs_info->global_block_rsv; 4520 4521 if (block_rsv != global_rsv && 4522 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4523 ret = 0; 4524 } 4525 if (ret == -ENOSPC) 4526 trace_btrfs_space_reservation(root->fs_info, 4527 "space_info:enospc", 4528 space_info->flags, orig_bytes, 1); 4529 if (flushing) { 4530 spin_lock(&space_info->lock); 4531 space_info->flush = 0; 4532 wake_up_all(&space_info->wait); 4533 spin_unlock(&space_info->lock); 4534 } 4535 return ret; 4536 } 4537 4538 static struct btrfs_block_rsv *get_block_rsv( 4539 const struct btrfs_trans_handle *trans, 4540 const struct btrfs_root *root) 4541 { 4542 struct btrfs_block_rsv *block_rsv = NULL; 4543 4544 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4545 block_rsv = trans->block_rsv; 4546 4547 if (root == root->fs_info->csum_root && trans->adding_csums) 4548 block_rsv = trans->block_rsv; 4549 4550 if (root == root->fs_info->uuid_root) 4551 block_rsv = trans->block_rsv; 4552 4553 if (!block_rsv) 4554 block_rsv = root->block_rsv; 4555 4556 if (!block_rsv) 4557 block_rsv = &root->fs_info->empty_block_rsv; 4558 4559 return block_rsv; 4560 } 4561 4562 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4563 u64 num_bytes) 4564 { 4565 int ret = -ENOSPC; 4566 spin_lock(&block_rsv->lock); 4567 if (block_rsv->reserved >= num_bytes) { 4568 block_rsv->reserved -= num_bytes; 4569 if (block_rsv->reserved < block_rsv->size) 4570 block_rsv->full = 0; 4571 ret = 0; 4572 } 4573 spin_unlock(&block_rsv->lock); 4574 return ret; 4575 } 4576 4577 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4578 u64 num_bytes, int update_size) 4579 { 4580 spin_lock(&block_rsv->lock); 4581 block_rsv->reserved += num_bytes; 4582 if (update_size) 4583 block_rsv->size += num_bytes; 4584 else if (block_rsv->reserved >= block_rsv->size) 4585 block_rsv->full = 1; 4586 spin_unlock(&block_rsv->lock); 4587 } 4588 4589 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 4590 struct btrfs_block_rsv *dest, u64 num_bytes, 4591 int min_factor) 4592 { 4593 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4594 u64 min_bytes; 4595 4596 if (global_rsv->space_info != dest->space_info) 4597 return -ENOSPC; 4598 4599 spin_lock(&global_rsv->lock); 4600 min_bytes = div_factor(global_rsv->size, min_factor); 4601 if (global_rsv->reserved < min_bytes + num_bytes) { 4602 spin_unlock(&global_rsv->lock); 4603 return -ENOSPC; 4604 } 4605 global_rsv->reserved -= num_bytes; 4606 if (global_rsv->reserved < global_rsv->size) 4607 global_rsv->full = 0; 4608 spin_unlock(&global_rsv->lock); 4609 4610 block_rsv_add_bytes(dest, num_bytes, 1); 4611 return 0; 4612 } 4613 4614 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4615 struct btrfs_block_rsv *block_rsv, 4616 struct btrfs_block_rsv *dest, u64 num_bytes) 4617 { 4618 struct btrfs_space_info *space_info = block_rsv->space_info; 4619 4620 spin_lock(&block_rsv->lock); 4621 if (num_bytes == (u64)-1) 4622 num_bytes 
= block_rsv->size; 4623 block_rsv->size -= num_bytes; 4624 if (block_rsv->reserved >= block_rsv->size) { 4625 num_bytes = block_rsv->reserved - block_rsv->size; 4626 block_rsv->reserved = block_rsv->size; 4627 block_rsv->full = 1; 4628 } else { 4629 num_bytes = 0; 4630 } 4631 spin_unlock(&block_rsv->lock); 4632 4633 if (num_bytes > 0) { 4634 if (dest) { 4635 spin_lock(&dest->lock); 4636 if (!dest->full) { 4637 u64 bytes_to_add; 4638 4639 bytes_to_add = dest->size - dest->reserved; 4640 bytes_to_add = min(num_bytes, bytes_to_add); 4641 dest->reserved += bytes_to_add; 4642 if (dest->reserved >= dest->size) 4643 dest->full = 1; 4644 num_bytes -= bytes_to_add; 4645 } 4646 spin_unlock(&dest->lock); 4647 } 4648 if (num_bytes) { 4649 spin_lock(&space_info->lock); 4650 space_info->bytes_may_use -= num_bytes; 4651 trace_btrfs_space_reservation(fs_info, "space_info", 4652 space_info->flags, num_bytes, 0); 4653 spin_unlock(&space_info->lock); 4654 } 4655 } 4656 } 4657 4658 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4659 struct btrfs_block_rsv *dst, u64 num_bytes) 4660 { 4661 int ret; 4662 4663 ret = block_rsv_use_bytes(src, num_bytes); 4664 if (ret) 4665 return ret; 4666 4667 block_rsv_add_bytes(dst, num_bytes, 1); 4668 return 0; 4669 } 4670 4671 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4672 { 4673 memset(rsv, 0, sizeof(*rsv)); 4674 spin_lock_init(&rsv->lock); 4675 rsv->type = type; 4676 } 4677 4678 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4679 unsigned short type) 4680 { 4681 struct btrfs_block_rsv *block_rsv; 4682 struct btrfs_fs_info *fs_info = root->fs_info; 4683 4684 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4685 if (!block_rsv) 4686 return NULL; 4687 4688 btrfs_init_block_rsv(block_rsv, type); 4689 block_rsv->space_info = __find_space_info(fs_info, 4690 BTRFS_BLOCK_GROUP_METADATA); 4691 return block_rsv; 4692 } 4693 4694 void btrfs_free_block_rsv(struct btrfs_root *root, 4695 struct btrfs_block_rsv *rsv) 4696 { 4697 if (!rsv) 4698 return; 4699 btrfs_block_rsv_release(root, rsv, (u64)-1); 4700 kfree(rsv); 4701 } 4702 4703 int btrfs_block_rsv_add(struct btrfs_root *root, 4704 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4705 enum btrfs_reserve_flush_enum flush) 4706 { 4707 int ret; 4708 4709 if (num_bytes == 0) 4710 return 0; 4711 4712 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4713 if (!ret) { 4714 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4715 return 0; 4716 } 4717 4718 return ret; 4719 } 4720 4721 int btrfs_block_rsv_check(struct btrfs_root *root, 4722 struct btrfs_block_rsv *block_rsv, int min_factor) 4723 { 4724 u64 num_bytes = 0; 4725 int ret = -ENOSPC; 4726 4727 if (!block_rsv) 4728 return 0; 4729 4730 spin_lock(&block_rsv->lock); 4731 num_bytes = div_factor(block_rsv->size, min_factor); 4732 if (block_rsv->reserved >= num_bytes) 4733 ret = 0; 4734 spin_unlock(&block_rsv->lock); 4735 4736 return ret; 4737 } 4738 4739 int btrfs_block_rsv_refill(struct btrfs_root *root, 4740 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4741 enum btrfs_reserve_flush_enum flush) 4742 { 4743 u64 num_bytes = 0; 4744 int ret = -ENOSPC; 4745 4746 if (!block_rsv) 4747 return 0; 4748 4749 spin_lock(&block_rsv->lock); 4750 num_bytes = min_reserved; 4751 if (block_rsv->reserved >= num_bytes) 4752 ret = 0; 4753 else 4754 num_bytes -= block_rsv->reserved; 4755 spin_unlock(&block_rsv->lock); 4756 4757 if (!ret) 4758 return 0; 4759 4760 ret = reserve_metadata_bytes(root, block_rsv, 
num_bytes, flush); 4761 if (!ret) { 4762 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4763 return 0; 4764 } 4765 4766 return ret; 4767 } 4768 4769 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4770 struct btrfs_block_rsv *dst_rsv, 4771 u64 num_bytes) 4772 { 4773 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4774 } 4775 4776 void btrfs_block_rsv_release(struct btrfs_root *root, 4777 struct btrfs_block_rsv *block_rsv, 4778 u64 num_bytes) 4779 { 4780 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4781 if (global_rsv == block_rsv || 4782 block_rsv->space_info != global_rsv->space_info) 4783 global_rsv = NULL; 4784 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4785 num_bytes); 4786 } 4787 4788 /* 4789 * helper to calculate size of global block reservation. 4790 * the desired value is sum of space used by extent tree, 4791 * checksum tree and root tree 4792 */ 4793 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4794 { 4795 struct btrfs_space_info *sinfo; 4796 u64 num_bytes; 4797 u64 meta_used; 4798 u64 data_used; 4799 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4800 4801 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4802 spin_lock(&sinfo->lock); 4803 data_used = sinfo->bytes_used; 4804 spin_unlock(&sinfo->lock); 4805 4806 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4807 spin_lock(&sinfo->lock); 4808 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4809 data_used = 0; 4810 meta_used = sinfo->bytes_used; 4811 spin_unlock(&sinfo->lock); 4812 4813 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4814 csum_size * 2; 4815 num_bytes += div64_u64(data_used + meta_used, 50); 4816 4817 if (num_bytes * 3 > meta_used) 4818 num_bytes = div64_u64(meta_used, 3); 4819 4820 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); 4821 } 4822 4823 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4824 { 4825 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4826 struct btrfs_space_info *sinfo = block_rsv->space_info; 4827 u64 num_bytes; 4828 4829 num_bytes = calc_global_metadata_size(fs_info); 4830 4831 spin_lock(&sinfo->lock); 4832 spin_lock(&block_rsv->lock); 4833 4834 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4835 4836 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4837 sinfo->bytes_reserved + sinfo->bytes_readonly + 4838 sinfo->bytes_may_use; 4839 4840 if (sinfo->total_bytes > num_bytes) { 4841 num_bytes = sinfo->total_bytes - num_bytes; 4842 block_rsv->reserved += num_bytes; 4843 sinfo->bytes_may_use += num_bytes; 4844 trace_btrfs_space_reservation(fs_info, "space_info", 4845 sinfo->flags, num_bytes, 1); 4846 } 4847 4848 if (block_rsv->reserved >= block_rsv->size) { 4849 num_bytes = block_rsv->reserved - block_rsv->size; 4850 sinfo->bytes_may_use -= num_bytes; 4851 trace_btrfs_space_reservation(fs_info, "space_info", 4852 sinfo->flags, num_bytes, 0); 4853 block_rsv->reserved = block_rsv->size; 4854 block_rsv->full = 1; 4855 } 4856 4857 spin_unlock(&block_rsv->lock); 4858 spin_unlock(&sinfo->lock); 4859 } 4860 4861 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4862 { 4863 struct btrfs_space_info *space_info; 4864 4865 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4866 fs_info->chunk_block_rsv.space_info = space_info; 4867 4868 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4869 fs_info->global_block_rsv.space_info = space_info; 4870 
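	/*
	 * Every metadata reserve below shares the METADATA space info set
	 * up above; only the chunk reserve accounts against SYSTEM.
	 */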
fs_info->delalloc_block_rsv.space_info = space_info; 4871 fs_info->trans_block_rsv.space_info = space_info; 4872 fs_info->empty_block_rsv.space_info = space_info; 4873 fs_info->delayed_block_rsv.space_info = space_info; 4874 4875 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4876 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4877 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4878 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4879 if (fs_info->quota_root) 4880 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 4881 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4882 4883 update_global_block_rsv(fs_info); 4884 } 4885 4886 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4887 { 4888 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4889 (u64)-1); 4890 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4891 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4892 WARN_ON(fs_info->trans_block_rsv.size > 0); 4893 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4894 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4895 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4896 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4897 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4898 } 4899 4900 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4901 struct btrfs_root *root) 4902 { 4903 if (!trans->block_rsv) 4904 return; 4905 4906 if (!trans->bytes_reserved) 4907 return; 4908 4909 trace_btrfs_space_reservation(root->fs_info, "transaction", 4910 trans->transid, trans->bytes_reserved, 0); 4911 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4912 trans->bytes_reserved = 0; 4913 } 4914 4915 /* Can only return 0 or -ENOSPC */ 4916 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4917 struct inode *inode) 4918 { 4919 struct btrfs_root *root = BTRFS_I(inode)->root; 4920 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4921 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4922 4923 /* 4924 * We need to hold space in order to delete our orphan item once we've 4925 * added it, so this takes the reservation so we can release it later 4926 * when we are truly done with the orphan item. 4927 */ 4928 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4929 trace_btrfs_space_reservation(root->fs_info, "orphan", 4930 btrfs_ino(inode), num_bytes, 1); 4931 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4932 } 4933 4934 void btrfs_orphan_release_metadata(struct inode *inode) 4935 { 4936 struct btrfs_root *root = BTRFS_I(inode)->root; 4937 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4938 trace_btrfs_space_reservation(root->fs_info, "orphan", 4939 btrfs_ino(inode), num_bytes, 0); 4940 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4941 } 4942 4943 /* 4944 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4945 * root: the root of the parent directory 4946 * rsv: block reservation 4947 * items: the number of items that we need do reservation 4948 * qgroup_reserved: used to return the reserved size in qgroup 4949 * 4950 * This function is used to reserve the space for snapshot/subvolume 4951 * creation and deletion. Those operations are different with the 4952 * common file/directory operations, they change two fs/file trees 4953 * and root tree, the number of items that the qgroup reserves is 4954 * different with the free space reservation. 
So we can not use 4955 * the space reseravtion mechanism in start_transaction(). 4956 */ 4957 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4958 struct btrfs_block_rsv *rsv, 4959 int items, 4960 u64 *qgroup_reserved, 4961 bool use_global_rsv) 4962 { 4963 u64 num_bytes; 4964 int ret; 4965 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4966 4967 if (root->fs_info->quota_enabled) { 4968 /* One for parent inode, two for dir entries */ 4969 num_bytes = 3 * root->nodesize; 4970 ret = btrfs_qgroup_reserve(root, num_bytes); 4971 if (ret) 4972 return ret; 4973 } else { 4974 num_bytes = 0; 4975 } 4976 4977 *qgroup_reserved = num_bytes; 4978 4979 num_bytes = btrfs_calc_trans_metadata_size(root, items); 4980 rsv->space_info = __find_space_info(root->fs_info, 4981 BTRFS_BLOCK_GROUP_METADATA); 4982 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 4983 BTRFS_RESERVE_FLUSH_ALL); 4984 4985 if (ret == -ENOSPC && use_global_rsv) 4986 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); 4987 4988 if (ret) { 4989 if (*qgroup_reserved) 4990 btrfs_qgroup_free(root, *qgroup_reserved); 4991 } 4992 4993 return ret; 4994 } 4995 4996 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 4997 struct btrfs_block_rsv *rsv, 4998 u64 qgroup_reserved) 4999 { 5000 btrfs_block_rsv_release(root, rsv, (u64)-1); 5001 if (qgroup_reserved) 5002 btrfs_qgroup_free(root, qgroup_reserved); 5003 } 5004 5005 /** 5006 * drop_outstanding_extent - drop an outstanding extent 5007 * @inode: the inode we're dropping the extent for 5008 * @num_bytes: the number of bytes we're relaseing. 5009 * 5010 * This is called when we are freeing up an outstanding extent, either called 5011 * after an error or after an extent is written. This will return the number of 5012 * reserved extents that need to be freed. This must be called with 5013 * BTRFS_I(inode)->lock held. 5014 */ 5015 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) 5016 { 5017 unsigned drop_inode_space = 0; 5018 unsigned dropped_extents = 0; 5019 unsigned num_extents = 0; 5020 5021 num_extents = (unsigned)div64_u64(num_bytes + 5022 BTRFS_MAX_EXTENT_SIZE - 1, 5023 BTRFS_MAX_EXTENT_SIZE); 5024 ASSERT(num_extents); 5025 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); 5026 BTRFS_I(inode)->outstanding_extents -= num_extents; 5027 5028 if (BTRFS_I(inode)->outstanding_extents == 0 && 5029 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5030 &BTRFS_I(inode)->runtime_flags)) 5031 drop_inode_space = 1; 5032 5033 /* 5034 * If we have more or the same amount of outsanding extents than we have 5035 * reserved then we need to leave the reserved extents count alone. 5036 */ 5037 if (BTRFS_I(inode)->outstanding_extents >= 5038 BTRFS_I(inode)->reserved_extents) 5039 return drop_inode_space; 5040 5041 dropped_extents = BTRFS_I(inode)->reserved_extents - 5042 BTRFS_I(inode)->outstanding_extents; 5043 BTRFS_I(inode)->reserved_extents -= dropped_extents; 5044 return dropped_extents + drop_inode_space; 5045 } 5046 5047 /** 5048 * calc_csum_metadata_size - return the amount of metada space that must be 5049 * reserved/free'd for the given bytes. 5050 * @inode: the inode we're manipulating 5051 * @num_bytes: the number of bytes in question 5052 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5053 * 5054 * This adjusts the number of csum_bytes in the inode and then returns the 5055 * correct amount of metadata that must either be reserved or freed. 
We 5056 * calculate how many checksums we can fit into one leaf and then divide the 5057 * number of bytes that will need to be checksumed by this value to figure out 5058 * how many checksums will be required. If we are adding bytes then the number 5059 * may go up and we will return the number of additional bytes that must be 5060 * reserved. If it is going down we will return the number of bytes that must 5061 * be freed. 5062 * 5063 * This must be called with BTRFS_I(inode)->lock held. 5064 */ 5065 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5066 int reserve) 5067 { 5068 struct btrfs_root *root = BTRFS_I(inode)->root; 5069 u64 csum_size; 5070 int num_csums_per_leaf; 5071 int num_csums; 5072 int old_csums; 5073 5074 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5075 BTRFS_I(inode)->csum_bytes == 0) 5076 return 0; 5077 5078 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5079 if (reserve) 5080 BTRFS_I(inode)->csum_bytes += num_bytes; 5081 else 5082 BTRFS_I(inode)->csum_bytes -= num_bytes; 5083 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 5084 num_csums_per_leaf = (int)div64_u64(csum_size, 5085 sizeof(struct btrfs_csum_item) + 5086 sizeof(struct btrfs_disk_key)); 5087 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5088 num_csums = num_csums + num_csums_per_leaf - 1; 5089 num_csums = num_csums / num_csums_per_leaf; 5090 5091 old_csums = old_csums + num_csums_per_leaf - 1; 5092 old_csums = old_csums / num_csums_per_leaf; 5093 5094 /* No change, no need to reserve more */ 5095 if (old_csums == num_csums) 5096 return 0; 5097 5098 if (reserve) 5099 return btrfs_calc_trans_metadata_size(root, 5100 num_csums - old_csums); 5101 5102 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 5103 } 5104 5105 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5106 { 5107 struct btrfs_root *root = BTRFS_I(inode)->root; 5108 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 5109 u64 to_reserve = 0; 5110 u64 csum_bytes; 5111 unsigned nr_extents = 0; 5112 int extra_reserve = 0; 5113 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5114 int ret = 0; 5115 bool delalloc_lock = true; 5116 u64 to_free = 0; 5117 unsigned dropped; 5118 5119 /* If we are a free space inode we need to not flush since we will be in 5120 * the middle of a transaction commit. We also don't need the delalloc 5121 * mutex since we won't race with anybody. We need this mostly to make 5122 * lockdep shut its filthy mouth. 5123 */ 5124 if (btrfs_is_free_space_inode(inode)) { 5125 flush = BTRFS_RESERVE_NO_FLUSH; 5126 delalloc_lock = false; 5127 } 5128 5129 if (flush != BTRFS_RESERVE_NO_FLUSH && 5130 btrfs_transaction_in_commit(root->fs_info)) 5131 schedule_timeout(1); 5132 5133 if (delalloc_lock) 5134 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5135 5136 num_bytes = ALIGN(num_bytes, root->sectorsize); 5137 5138 spin_lock(&BTRFS_I(inode)->lock); 5139 nr_extents = (unsigned)div64_u64(num_bytes + 5140 BTRFS_MAX_EXTENT_SIZE - 1, 5141 BTRFS_MAX_EXTENT_SIZE); 5142 BTRFS_I(inode)->outstanding_extents += nr_extents; 5143 nr_extents = 0; 5144 5145 if (BTRFS_I(inode)->outstanding_extents > 5146 BTRFS_I(inode)->reserved_extents) 5147 nr_extents = BTRFS_I(inode)->outstanding_extents - 5148 BTRFS_I(inode)->reserved_extents; 5149 5150 /* 5151 * Add an item to reserve for updating the inode when we complete the 5152 * delalloc io. 
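 * As a rough example, a 1MiB buffered write on a 4KiB-sectorsize
 * filesystem adds one outstanding extent (BTRFS_MAX_EXTENT_SIZE is
 * 128MiB), checksum space for 256 sectors and, the first time
 * through, this extra inode-update item.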
5153 */ 5154 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5155 &BTRFS_I(inode)->runtime_flags)) { 5156 nr_extents++; 5157 extra_reserve = 1; 5158 } 5159 5160 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 5161 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5162 csum_bytes = BTRFS_I(inode)->csum_bytes; 5163 spin_unlock(&BTRFS_I(inode)->lock); 5164 5165 if (root->fs_info->quota_enabled) { 5166 ret = btrfs_qgroup_reserve(root, num_bytes + 5167 nr_extents * root->nodesize); 5168 if (ret) 5169 goto out_fail; 5170 } 5171 5172 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5173 if (unlikely(ret)) { 5174 if (root->fs_info->quota_enabled) 5175 btrfs_qgroup_free(root, num_bytes + 5176 nr_extents * root->nodesize); 5177 goto out_fail; 5178 } 5179 5180 spin_lock(&BTRFS_I(inode)->lock); 5181 if (extra_reserve) { 5182 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5183 &BTRFS_I(inode)->runtime_flags); 5184 nr_extents--; 5185 } 5186 BTRFS_I(inode)->reserved_extents += nr_extents; 5187 spin_unlock(&BTRFS_I(inode)->lock); 5188 5189 if (delalloc_lock) 5190 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5191 5192 if (to_reserve) 5193 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5194 btrfs_ino(inode), to_reserve, 1); 5195 block_rsv_add_bytes(block_rsv, to_reserve, 1); 5196 5197 return 0; 5198 5199 out_fail: 5200 spin_lock(&BTRFS_I(inode)->lock); 5201 dropped = drop_outstanding_extent(inode, num_bytes); 5202 /* 5203 * If the inodes csum_bytes is the same as the original 5204 * csum_bytes then we know we haven't raced with any free()ers 5205 * so we can just reduce our inodes csum bytes and carry on. 5206 */ 5207 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 5208 calc_csum_metadata_size(inode, num_bytes, 0); 5209 } else { 5210 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 5211 u64 bytes; 5212 5213 /* 5214 * This is tricky, but first we need to figure out how much we 5215 * free'd from any free-ers that occured during this 5216 * reservation, so we reset ->csum_bytes to the csum_bytes 5217 * before we dropped our lock, and then call the free for the 5218 * number of bytes that were freed while we were trying our 5219 * reservation. 5220 */ 5221 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 5222 BTRFS_I(inode)->csum_bytes = csum_bytes; 5223 to_free = calc_csum_metadata_size(inode, bytes, 0); 5224 5225 5226 /* 5227 * Now we need to see how much we would have freed had we not 5228 * been making this reservation and our ->csum_bytes were not 5229 * artificially inflated. 5230 */ 5231 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 5232 bytes = csum_bytes - orig_csum_bytes; 5233 bytes = calc_csum_metadata_size(inode, bytes, 0); 5234 5235 /* 5236 * Now reset ->csum_bytes to what it should be. If bytes is 5237 * more than to_free then we would have free'd more space had we 5238 * not had an artificially high ->csum_bytes, so we need to free 5239 * the remainder. If bytes is the same or less then we don't 5240 * need to do anything, the other free-ers did the correct 5241 * thing. 
5242 */ 5243 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 5244 if (bytes > to_free) 5245 to_free = bytes - to_free; 5246 else 5247 to_free = 0; 5248 } 5249 spin_unlock(&BTRFS_I(inode)->lock); 5250 if (dropped) 5251 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5252 5253 if (to_free) { 5254 btrfs_block_rsv_release(root, block_rsv, to_free); 5255 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5256 btrfs_ino(inode), to_free, 0); 5257 } 5258 if (delalloc_lock) 5259 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5260 return ret; 5261 } 5262 5263 /** 5264 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5265 * @inode: the inode to release the reservation for 5266 * @num_bytes: the number of bytes we're releasing 5267 * 5268 * This will release the metadata reservation for an inode. This can be called 5269 * once we complete IO for a given set of bytes to release their metadata 5270 * reservations. 5271 */ 5272 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 5273 { 5274 struct btrfs_root *root = BTRFS_I(inode)->root; 5275 u64 to_free = 0; 5276 unsigned dropped; 5277 5278 num_bytes = ALIGN(num_bytes, root->sectorsize); 5279 spin_lock(&BTRFS_I(inode)->lock); 5280 dropped = drop_outstanding_extent(inode, num_bytes); 5281 5282 if (num_bytes) 5283 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5284 spin_unlock(&BTRFS_I(inode)->lock); 5285 if (dropped > 0) 5286 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5287 5288 if (btrfs_test_is_dummy_root(root)) 5289 return; 5290 5291 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5292 btrfs_ino(inode), to_free, 0); 5293 if (root->fs_info->quota_enabled) { 5294 btrfs_qgroup_free(root, num_bytes + 5295 dropped * root->nodesize); 5296 } 5297 5298 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5299 to_free); 5300 } 5301 5302 /** 5303 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 5304 * @inode: inode we're writing to 5305 * @num_bytes: the number of bytes we want to allocate 5306 * 5307 * This will do the following things 5308 * 5309 * o reserve space in the data space info for num_bytes 5310 * o reserve space in the metadata space info based on number of outstanding 5311 * extents and how much csums will be needed 5312 * o add to the inodes ->delalloc_bytes 5313 * o add it to the fs_info's delalloc inodes list. 5314 * 5315 * This will return 0 for success and -ENOSPC if there is no space left. 5316 */ 5317 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 5318 { 5319 int ret; 5320 5321 ret = btrfs_check_data_free_space(inode, num_bytes); 5322 if (ret) 5323 return ret; 5324 5325 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 5326 if (ret) { 5327 btrfs_free_reserved_data_space(inode, num_bytes); 5328 return ret; 5329 } 5330 5331 return 0; 5332 } 5333 5334 /** 5335 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5336 * @inode: inode we're releasing space for 5337 * @num_bytes: the number of bytes we want to free up 5338 * 5339 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5340 * called in the case that we don't need the metadata AND data reservations 5341 * anymore. So if there is an error or we insert an inline extent. 
5342 * 5343 * This function will release the metadata space that was not used and will 5344 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5345 * list if there are no delalloc bytes left. 5346 */ 5347 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5348 { 5349 btrfs_delalloc_release_metadata(inode, num_bytes); 5350 btrfs_free_reserved_data_space(inode, num_bytes); 5351 } 5352 5353 static int update_block_group(struct btrfs_trans_handle *trans, 5354 struct btrfs_root *root, u64 bytenr, 5355 u64 num_bytes, int alloc) 5356 { 5357 struct btrfs_block_group_cache *cache = NULL; 5358 struct btrfs_fs_info *info = root->fs_info; 5359 u64 total = num_bytes; 5360 u64 old_val; 5361 u64 byte_in_group; 5362 int factor; 5363 5364 /* block accounting for super block */ 5365 spin_lock(&info->delalloc_root_lock); 5366 old_val = btrfs_super_bytes_used(info->super_copy); 5367 if (alloc) 5368 old_val += num_bytes; 5369 else 5370 old_val -= num_bytes; 5371 btrfs_set_super_bytes_used(info->super_copy, old_val); 5372 spin_unlock(&info->delalloc_root_lock); 5373 5374 while (total) { 5375 cache = btrfs_lookup_block_group(info, bytenr); 5376 if (!cache) 5377 return -ENOENT; 5378 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5379 BTRFS_BLOCK_GROUP_RAID1 | 5380 BTRFS_BLOCK_GROUP_RAID10)) 5381 factor = 2; 5382 else 5383 factor = 1; 5384 /* 5385 * If this block group has free space cache written out, we 5386 * need to make sure to load it if we are removing space. This 5387 * is because we need the unpinning stage to actually add the 5388 * space back to the block group, otherwise we will leak space. 5389 */ 5390 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5391 cache_block_group(cache, 1); 5392 5393 spin_lock(&trans->transaction->dirty_bgs_lock); 5394 if (list_empty(&cache->dirty_list)) { 5395 list_add_tail(&cache->dirty_list, 5396 &trans->transaction->dirty_bgs); 5397 btrfs_get_block_group(cache); 5398 } 5399 spin_unlock(&trans->transaction->dirty_bgs_lock); 5400 5401 byte_in_group = bytenr - cache->key.objectid; 5402 WARN_ON(byte_in_group > cache->key.offset); 5403 5404 spin_lock(&cache->space_info->lock); 5405 spin_lock(&cache->lock); 5406 5407 if (btrfs_test_opt(root, SPACE_CACHE) && 5408 cache->disk_cache_state < BTRFS_DC_CLEAR) 5409 cache->disk_cache_state = BTRFS_DC_CLEAR; 5410 5411 old_val = btrfs_block_group_used(&cache->item); 5412 num_bytes = min(total, cache->key.offset - byte_in_group); 5413 if (alloc) { 5414 old_val += num_bytes; 5415 btrfs_set_block_group_used(&cache->item, old_val); 5416 cache->reserved -= num_bytes; 5417 cache->space_info->bytes_reserved -= num_bytes; 5418 cache->space_info->bytes_used += num_bytes; 5419 cache->space_info->disk_used += num_bytes * factor; 5420 spin_unlock(&cache->lock); 5421 spin_unlock(&cache->space_info->lock); 5422 } else { 5423 old_val -= num_bytes; 5424 btrfs_set_block_group_used(&cache->item, old_val); 5425 cache->pinned += num_bytes; 5426 cache->space_info->bytes_pinned += num_bytes; 5427 cache->space_info->bytes_used -= num_bytes; 5428 cache->space_info->disk_used -= num_bytes * factor; 5429 spin_unlock(&cache->lock); 5430 spin_unlock(&cache->space_info->lock); 5431 5432 set_extent_dirty(info->pinned_extents, 5433 bytenr, bytenr + num_bytes - 1, 5434 GFP_NOFS | __GFP_NOFAIL); 5435 /* 5436 * No longer have used bytes in this block group, queue 5437 * it for deletion. 
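 * The cleaner thread picks empty block groups off fs_info->unused_bgs
 * (see btrfs_delete_unused_bgs()) and removes them in a later
 * transaction.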
5438 */ 5439 if (old_val == 0) { 5440 spin_lock(&info->unused_bgs_lock); 5441 if (list_empty(&cache->bg_list)) { 5442 btrfs_get_block_group(cache); 5443 list_add_tail(&cache->bg_list, 5444 &info->unused_bgs); 5445 } 5446 spin_unlock(&info->unused_bgs_lock); 5447 } 5448 } 5449 btrfs_put_block_group(cache); 5450 total -= num_bytes; 5451 bytenr += num_bytes; 5452 } 5453 return 0; 5454 } 5455 5456 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5457 { 5458 struct btrfs_block_group_cache *cache; 5459 u64 bytenr; 5460 5461 spin_lock(&root->fs_info->block_group_cache_lock); 5462 bytenr = root->fs_info->first_logical_byte; 5463 spin_unlock(&root->fs_info->block_group_cache_lock); 5464 5465 if (bytenr < (u64)-1) 5466 return bytenr; 5467 5468 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5469 if (!cache) 5470 return 0; 5471 5472 bytenr = cache->key.objectid; 5473 btrfs_put_block_group(cache); 5474 5475 return bytenr; 5476 } 5477 5478 static int pin_down_extent(struct btrfs_root *root, 5479 struct btrfs_block_group_cache *cache, 5480 u64 bytenr, u64 num_bytes, int reserved) 5481 { 5482 spin_lock(&cache->space_info->lock); 5483 spin_lock(&cache->lock); 5484 cache->pinned += num_bytes; 5485 cache->space_info->bytes_pinned += num_bytes; 5486 if (reserved) { 5487 cache->reserved -= num_bytes; 5488 cache->space_info->bytes_reserved -= num_bytes; 5489 } 5490 spin_unlock(&cache->lock); 5491 spin_unlock(&cache->space_info->lock); 5492 5493 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5494 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5495 if (reserved) 5496 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); 5497 return 0; 5498 } 5499 5500 /* 5501 * this function must be called within transaction 5502 */ 5503 int btrfs_pin_extent(struct btrfs_root *root, 5504 u64 bytenr, u64 num_bytes, int reserved) 5505 { 5506 struct btrfs_block_group_cache *cache; 5507 5508 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5509 BUG_ON(!cache); /* Logic error */ 5510 5511 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5512 5513 btrfs_put_block_group(cache); 5514 return 0; 5515 } 5516 5517 /* 5518 * this function must be called within transaction 5519 */ 5520 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5521 u64 bytenr, u64 num_bytes) 5522 { 5523 struct btrfs_block_group_cache *cache; 5524 int ret; 5525 5526 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5527 if (!cache) 5528 return -EINVAL; 5529 5530 /* 5531 * pull in the free space cache (if any) so that our pin 5532 * removes the free space from the cache. We have load_only set 5533 * to one because the slow code to read in the free extents does check 5534 * the pinned extents. 
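 * (That is the "1" passed as the second argument to cache_block_group()
 * below.)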
5535 */ 5536 cache_block_group(cache, 1); 5537 5538 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5539 5540 /* remove us from the free space cache (if we're there at all) */ 5541 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5542 btrfs_put_block_group(cache); 5543 return ret; 5544 } 5545 5546 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 5547 { 5548 int ret; 5549 struct btrfs_block_group_cache *block_group; 5550 struct btrfs_caching_control *caching_ctl; 5551 5552 block_group = btrfs_lookup_block_group(root->fs_info, start); 5553 if (!block_group) 5554 return -EINVAL; 5555 5556 cache_block_group(block_group, 0); 5557 caching_ctl = get_caching_control(block_group); 5558 5559 if (!caching_ctl) { 5560 /* Logic error */ 5561 BUG_ON(!block_group_cache_done(block_group)); 5562 ret = btrfs_remove_free_space(block_group, start, num_bytes); 5563 } else { 5564 mutex_lock(&caching_ctl->mutex); 5565 5566 if (start >= caching_ctl->progress) { 5567 ret = add_excluded_extent(root, start, num_bytes); 5568 } else if (start + num_bytes <= caching_ctl->progress) { 5569 ret = btrfs_remove_free_space(block_group, 5570 start, num_bytes); 5571 } else { 5572 num_bytes = caching_ctl->progress - start; 5573 ret = btrfs_remove_free_space(block_group, 5574 start, num_bytes); 5575 if (ret) 5576 goto out_lock; 5577 5578 num_bytes = (start + num_bytes) - 5579 caching_ctl->progress; 5580 start = caching_ctl->progress; 5581 ret = add_excluded_extent(root, start, num_bytes); 5582 } 5583 out_lock: 5584 mutex_unlock(&caching_ctl->mutex); 5585 put_caching_control(caching_ctl); 5586 } 5587 btrfs_put_block_group(block_group); 5588 return ret; 5589 } 5590 5591 int btrfs_exclude_logged_extents(struct btrfs_root *log, 5592 struct extent_buffer *eb) 5593 { 5594 struct btrfs_file_extent_item *item; 5595 struct btrfs_key key; 5596 int found_type; 5597 int i; 5598 5599 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 5600 return 0; 5601 5602 for (i = 0; i < btrfs_header_nritems(eb); i++) { 5603 btrfs_item_key_to_cpu(eb, &key, i); 5604 if (key.type != BTRFS_EXTENT_DATA_KEY) 5605 continue; 5606 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 5607 found_type = btrfs_file_extent_type(eb, item); 5608 if (found_type == BTRFS_FILE_EXTENT_INLINE) 5609 continue; 5610 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 5611 continue; 5612 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 5613 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 5614 __exclude_logged_extent(log, key.objectid, key.offset); 5615 } 5616 5617 return 0; 5618 } 5619 5620 /** 5621 * btrfs_update_reserved_bytes - update the block_group and space info counters 5622 * @cache: The cache we are manipulating 5623 * @num_bytes: The number of bytes in question 5624 * @reserve: One of the reservation enums 5625 * @delalloc: The blocks are allocated for the delalloc write 5626 * 5627 * This is called by the allocator when it reserves space, or by somebody who is 5628 * freeing space that was never actually used on disk. For example if you 5629 * reserve some space for a new leaf in transaction A and before transaction A 5630 * commits you free that leaf, you call this with reserve set to 0 in order to 5631 * clear the reservation. 5632 * 5633 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5634 * ENOSPC accounting. For data we handle the reservation through clearing the 5635 * delalloc bits in the io_tree. 
We have to do this since we could end up 5636 * allocating less disk space for the amount of data we have reserved in the 5637 * case of compression. 5638 * 5639 * If this is a reservation and the block group has become read only we cannot 5640 * make the reservation and return -EAGAIN, otherwise this function always 5641 * succeeds. 5642 */ 5643 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5644 u64 num_bytes, int reserve, int delalloc) 5645 { 5646 struct btrfs_space_info *space_info = cache->space_info; 5647 int ret = 0; 5648 5649 spin_lock(&space_info->lock); 5650 spin_lock(&cache->lock); 5651 if (reserve != RESERVE_FREE) { 5652 if (cache->ro) { 5653 ret = -EAGAIN; 5654 } else { 5655 cache->reserved += num_bytes; 5656 space_info->bytes_reserved += num_bytes; 5657 if (reserve == RESERVE_ALLOC) { 5658 trace_btrfs_space_reservation(cache->fs_info, 5659 "space_info", space_info->flags, 5660 num_bytes, 0); 5661 space_info->bytes_may_use -= num_bytes; 5662 } 5663 5664 if (delalloc) 5665 cache->delalloc_bytes += num_bytes; 5666 } 5667 } else { 5668 if (cache->ro) 5669 space_info->bytes_readonly += num_bytes; 5670 cache->reserved -= num_bytes; 5671 space_info->bytes_reserved -= num_bytes; 5672 5673 if (delalloc) 5674 cache->delalloc_bytes -= num_bytes; 5675 } 5676 spin_unlock(&cache->lock); 5677 spin_unlock(&space_info->lock); 5678 return ret; 5679 } 5680 5681 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5682 struct btrfs_root *root) 5683 { 5684 struct btrfs_fs_info *fs_info = root->fs_info; 5685 struct btrfs_caching_control *next; 5686 struct btrfs_caching_control *caching_ctl; 5687 struct btrfs_block_group_cache *cache; 5688 5689 down_write(&fs_info->commit_root_sem); 5690 5691 list_for_each_entry_safe(caching_ctl, next, 5692 &fs_info->caching_block_groups, list) { 5693 cache = caching_ctl->block_group; 5694 if (block_group_cache_done(cache)) { 5695 cache->last_byte_to_unpin = (u64)-1; 5696 list_del_init(&caching_ctl->list); 5697 put_caching_control(caching_ctl); 5698 } else { 5699 cache->last_byte_to_unpin = caching_ctl->progress; 5700 } 5701 } 5702 5703 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5704 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5705 else 5706 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5707 5708 up_write(&fs_info->commit_root_sem); 5709 5710 update_global_block_rsv(fs_info); 5711 } 5712 5713 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, 5714 const bool return_free_space) 5715 { 5716 struct btrfs_fs_info *fs_info = root->fs_info; 5717 struct btrfs_block_group_cache *cache = NULL; 5718 struct btrfs_space_info *space_info; 5719 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5720 u64 len; 5721 bool readonly; 5722 5723 while (start <= end) { 5724 readonly = false; 5725 if (!cache || 5726 start >= cache->key.objectid + cache->key.offset) { 5727 if (cache) 5728 btrfs_put_block_group(cache); 5729 cache = btrfs_lookup_block_group(fs_info, start); 5730 BUG_ON(!cache); /* Logic error */ 5731 } 5732 5733 len = cache->key.objectid + cache->key.offset - start; 5734 len = min(len, end + 1 - start); 5735 5736 if (start < cache->last_byte_to_unpin) { 5737 len = min(len, cache->last_byte_to_unpin - start); 5738 if (return_free_space) 5739 btrfs_add_free_space(cache, start, len); 5740 } 5741 5742 start += len; 5743 space_info = cache->space_info; 5744 5745 spin_lock(&space_info->lock); 5746 spin_lock(&cache->lock); 5747 cache->pinned -= len; 5748 
space_info->bytes_pinned -= len; 5749 percpu_counter_add(&space_info->total_bytes_pinned, -len); 5750 if (cache->ro) { 5751 space_info->bytes_readonly += len; 5752 readonly = true; 5753 } 5754 spin_unlock(&cache->lock); 5755 if (!readonly && global_rsv->space_info == space_info) { 5756 spin_lock(&global_rsv->lock); 5757 if (!global_rsv->full) { 5758 len = min(len, global_rsv->size - 5759 global_rsv->reserved); 5760 global_rsv->reserved += len; 5761 space_info->bytes_may_use += len; 5762 if (global_rsv->reserved >= global_rsv->size) 5763 global_rsv->full = 1; 5764 } 5765 spin_unlock(&global_rsv->lock); 5766 } 5767 spin_unlock(&space_info->lock); 5768 } 5769 5770 if (cache) 5771 btrfs_put_block_group(cache); 5772 return 0; 5773 } 5774 5775 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5776 struct btrfs_root *root) 5777 { 5778 struct btrfs_fs_info *fs_info = root->fs_info; 5779 struct extent_io_tree *unpin; 5780 u64 start; 5781 u64 end; 5782 int ret; 5783 5784 if (trans->aborted) 5785 return 0; 5786 5787 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5788 unpin = &fs_info->freed_extents[1]; 5789 else 5790 unpin = &fs_info->freed_extents[0]; 5791 5792 while (1) { 5793 mutex_lock(&fs_info->unused_bg_unpin_mutex); 5794 ret = find_first_extent_bit(unpin, 0, &start, &end, 5795 EXTENT_DIRTY, NULL); 5796 if (ret) { 5797 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 5798 break; 5799 } 5800 5801 if (btrfs_test_opt(root, DISCARD)) 5802 ret = btrfs_discard_extent(root, start, 5803 end + 1 - start, NULL); 5804 5805 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5806 unpin_extent_range(root, start, end, true); 5807 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 5808 cond_resched(); 5809 } 5810 5811 return 0; 5812 } 5813 5814 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 5815 u64 owner, u64 root_objectid) 5816 { 5817 struct btrfs_space_info *space_info; 5818 u64 flags; 5819 5820 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5821 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 5822 flags = BTRFS_BLOCK_GROUP_SYSTEM; 5823 else 5824 flags = BTRFS_BLOCK_GROUP_METADATA; 5825 } else { 5826 flags = BTRFS_BLOCK_GROUP_DATA; 5827 } 5828 5829 space_info = __find_space_info(fs_info, flags); 5830 BUG_ON(!space_info); /* Logic bug */ 5831 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 5832 } 5833 5834 5835 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5836 struct btrfs_root *root, 5837 u64 bytenr, u64 num_bytes, u64 parent, 5838 u64 root_objectid, u64 owner_objectid, 5839 u64 owner_offset, int refs_to_drop, 5840 struct btrfs_delayed_extent_op *extent_op, 5841 int no_quota) 5842 { 5843 struct btrfs_key key; 5844 struct btrfs_path *path; 5845 struct btrfs_fs_info *info = root->fs_info; 5846 struct btrfs_root *extent_root = info->extent_root; 5847 struct extent_buffer *leaf; 5848 struct btrfs_extent_item *ei; 5849 struct btrfs_extent_inline_ref *iref; 5850 int ret; 5851 int is_data; 5852 int extent_slot = 0; 5853 int found_extent = 0; 5854 int num_to_del = 1; 5855 u32 item_size; 5856 u64 refs; 5857 int last_ref = 0; 5858 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; 5859 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5860 SKINNY_METADATA); 5861 5862 if (!info->quota_enabled || !is_fstree(root_objectid)) 5863 no_quota = 1; 5864 5865 path = btrfs_alloc_path(); 5866 if (!path) 5867 return -ENOMEM; 5868 5869 path->reada = 1; 5870 path->leave_spinning = 1; 5871 5872 is_data = owner_objectid >= 
BTRFS_FIRST_FREE_OBJECTID; 5873 BUG_ON(!is_data && refs_to_drop != 1); 5874 5875 if (is_data) 5876 skinny_metadata = 0; 5877 5878 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5879 bytenr, num_bytes, parent, 5880 root_objectid, owner_objectid, 5881 owner_offset); 5882 if (ret == 0) { 5883 extent_slot = path->slots[0]; 5884 while (extent_slot >= 0) { 5885 btrfs_item_key_to_cpu(path->nodes[0], &key, 5886 extent_slot); 5887 if (key.objectid != bytenr) 5888 break; 5889 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5890 key.offset == num_bytes) { 5891 found_extent = 1; 5892 break; 5893 } 5894 if (key.type == BTRFS_METADATA_ITEM_KEY && 5895 key.offset == owner_objectid) { 5896 found_extent = 1; 5897 break; 5898 } 5899 if (path->slots[0] - extent_slot > 5) 5900 break; 5901 extent_slot--; 5902 } 5903 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5904 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5905 if (found_extent && item_size < sizeof(*ei)) 5906 found_extent = 0; 5907 #endif 5908 if (!found_extent) { 5909 BUG_ON(iref); 5910 ret = remove_extent_backref(trans, extent_root, path, 5911 NULL, refs_to_drop, 5912 is_data, &last_ref); 5913 if (ret) { 5914 btrfs_abort_transaction(trans, extent_root, ret); 5915 goto out; 5916 } 5917 btrfs_release_path(path); 5918 path->leave_spinning = 1; 5919 5920 key.objectid = bytenr; 5921 key.type = BTRFS_EXTENT_ITEM_KEY; 5922 key.offset = num_bytes; 5923 5924 if (!is_data && skinny_metadata) { 5925 key.type = BTRFS_METADATA_ITEM_KEY; 5926 key.offset = owner_objectid; 5927 } 5928 5929 ret = btrfs_search_slot(trans, extent_root, 5930 &key, path, -1, 1); 5931 if (ret > 0 && skinny_metadata && path->slots[0]) { 5932 /* 5933 * Couldn't find our skinny metadata item, 5934 * see if we have ye olde extent item. 5935 */ 5936 path->slots[0]--; 5937 btrfs_item_key_to_cpu(path->nodes[0], &key, 5938 path->slots[0]); 5939 if (key.objectid == bytenr && 5940 key.type == BTRFS_EXTENT_ITEM_KEY && 5941 key.offset == num_bytes) 5942 ret = 0; 5943 } 5944 5945 if (ret > 0 && skinny_metadata) { 5946 skinny_metadata = false; 5947 key.objectid = bytenr; 5948 key.type = BTRFS_EXTENT_ITEM_KEY; 5949 key.offset = num_bytes; 5950 btrfs_release_path(path); 5951 ret = btrfs_search_slot(trans, extent_root, 5952 &key, path, -1, 1); 5953 } 5954 5955 if (ret) { 5956 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5957 ret, bytenr); 5958 if (ret > 0) 5959 btrfs_print_leaf(extent_root, 5960 path->nodes[0]); 5961 } 5962 if (ret < 0) { 5963 btrfs_abort_transaction(trans, extent_root, ret); 5964 goto out; 5965 } 5966 extent_slot = path->slots[0]; 5967 } 5968 } else if (WARN_ON(ret == -ENOENT)) { 5969 btrfs_print_leaf(extent_root, path->nodes[0]); 5970 btrfs_err(info, 5971 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5972 bytenr, parent, root_objectid, owner_objectid, 5973 owner_offset); 5974 btrfs_abort_transaction(trans, extent_root, ret); 5975 goto out; 5976 } else { 5977 btrfs_abort_transaction(trans, extent_root, ret); 5978 goto out; 5979 } 5980 5981 leaf = path->nodes[0]; 5982 item_size = btrfs_item_size_nr(leaf, extent_slot); 5983 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5984 if (item_size < sizeof(*ei)) { 5985 BUG_ON(found_extent || extent_slot != path->slots[0]); 5986 ret = convert_extent_item_v0(trans, extent_root, path, 5987 owner_objectid, 0); 5988 if (ret < 0) { 5989 btrfs_abort_transaction(trans, extent_root, ret); 5990 goto out; 5991 } 5992 5993 btrfs_release_path(path); 5994 path->leave_spinning = 1; 5995 5996 key.objectid = 
bytenr; 5997 key.type = BTRFS_EXTENT_ITEM_KEY; 5998 key.offset = num_bytes; 5999 6000 ret = btrfs_search_slot(trans, extent_root, &key, path, 6001 -1, 1); 6002 if (ret) { 6003 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 6004 ret, bytenr); 6005 btrfs_print_leaf(extent_root, path->nodes[0]); 6006 } 6007 if (ret < 0) { 6008 btrfs_abort_transaction(trans, extent_root, ret); 6009 goto out; 6010 } 6011 6012 extent_slot = path->slots[0]; 6013 leaf = path->nodes[0]; 6014 item_size = btrfs_item_size_nr(leaf, extent_slot); 6015 } 6016 #endif 6017 BUG_ON(item_size < sizeof(*ei)); 6018 ei = btrfs_item_ptr(leaf, extent_slot, 6019 struct btrfs_extent_item); 6020 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6021 key.type == BTRFS_EXTENT_ITEM_KEY) { 6022 struct btrfs_tree_block_info *bi; 6023 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6024 bi = (struct btrfs_tree_block_info *)(ei + 1); 6025 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6026 } 6027 6028 refs = btrfs_extent_refs(leaf, ei); 6029 if (refs < refs_to_drop) { 6030 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 6031 "for bytenr %Lu", refs_to_drop, refs, bytenr); 6032 ret = -EINVAL; 6033 btrfs_abort_transaction(trans, extent_root, ret); 6034 goto out; 6035 } 6036 refs -= refs_to_drop; 6037 6038 if (refs > 0) { 6039 type = BTRFS_QGROUP_OPER_SUB_SHARED; 6040 if (extent_op) 6041 __run_delayed_extent_op(extent_op, leaf, ei); 6042 /* 6043 * In the case of inline back ref, reference count will 6044 * be updated by remove_extent_backref 6045 */ 6046 if (iref) { 6047 BUG_ON(!found_extent); 6048 } else { 6049 btrfs_set_extent_refs(leaf, ei, refs); 6050 btrfs_mark_buffer_dirty(leaf); 6051 } 6052 if (found_extent) { 6053 ret = remove_extent_backref(trans, extent_root, path, 6054 iref, refs_to_drop, 6055 is_data, &last_ref); 6056 if (ret) { 6057 btrfs_abort_transaction(trans, extent_root, ret); 6058 goto out; 6059 } 6060 } 6061 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 6062 root_objectid); 6063 } else { 6064 if (found_extent) { 6065 BUG_ON(is_data && refs_to_drop != 6066 extent_data_ref_count(root, path, iref)); 6067 if (iref) { 6068 BUG_ON(path->slots[0] != extent_slot); 6069 } else { 6070 BUG_ON(path->slots[0] != extent_slot + 1); 6071 path->slots[0] = extent_slot; 6072 num_to_del = 2; 6073 } 6074 } 6075 6076 last_ref = 1; 6077 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6078 num_to_del); 6079 if (ret) { 6080 btrfs_abort_transaction(trans, extent_root, ret); 6081 goto out; 6082 } 6083 btrfs_release_path(path); 6084 6085 if (is_data) { 6086 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 6087 if (ret) { 6088 btrfs_abort_transaction(trans, extent_root, ret); 6089 goto out; 6090 } 6091 } 6092 6093 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 6094 if (ret) { 6095 btrfs_abort_transaction(trans, extent_root, ret); 6096 goto out; 6097 } 6098 } 6099 btrfs_release_path(path); 6100 6101 /* Deal with the quota accounting */ 6102 if (!ret && last_ref && !no_quota) { 6103 int mod_seq = 0; 6104 6105 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && 6106 type == BTRFS_QGROUP_OPER_SUB_SHARED) 6107 mod_seq = 1; 6108 6109 ret = btrfs_qgroup_record_ref(trans, info, root_objectid, 6110 bytenr, num_bytes, type, 6111 mod_seq); 6112 } 6113 out: 6114 btrfs_free_path(path); 6115 return ret; 6116 } 6117 6118 /* 6119 * when we free an block, it is possible (and likely) that we free the last 6120 * delayed ref for that extent as well. 
This searches the delayed ref tree for 6121 * a given extent, and if there are no other delayed refs to be processed, it 6122 * removes it from the tree. 6123 */ 6124 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 6125 struct btrfs_root *root, u64 bytenr) 6126 { 6127 struct btrfs_delayed_ref_head *head; 6128 struct btrfs_delayed_ref_root *delayed_refs; 6129 int ret = 0; 6130 6131 delayed_refs = &trans->transaction->delayed_refs; 6132 spin_lock(&delayed_refs->lock); 6133 head = btrfs_find_delayed_ref_head(trans, bytenr); 6134 if (!head) 6135 goto out_delayed_unlock; 6136 6137 spin_lock(&head->lock); 6138 if (rb_first(&head->ref_root)) 6139 goto out; 6140 6141 if (head->extent_op) { 6142 if (!head->must_insert_reserved) 6143 goto out; 6144 btrfs_free_delayed_extent_op(head->extent_op); 6145 head->extent_op = NULL; 6146 } 6147 6148 /* 6149 * waiting for the lock here would deadlock. If someone else has it 6150 * locked they are already in the process of dropping it anyway 6151 */ 6152 if (!mutex_trylock(&head->mutex)) 6153 goto out; 6154 6155 /* 6156 * at this point we have a head with no other entries. Go 6157 * ahead and process it. 6158 */ 6159 head->node.in_tree = 0; 6160 rb_erase(&head->href_node, &delayed_refs->href_root); 6161 6162 atomic_dec(&delayed_refs->num_entries); 6163 6164 /* 6165 * we don't take a ref on the node because we're removing it from the 6166 * tree, so we just steal the ref the tree was holding. 6167 */ 6168 delayed_refs->num_heads--; 6169 if (head->processing == 0) 6170 delayed_refs->num_heads_ready--; 6171 head->processing = 0; 6172 spin_unlock(&head->lock); 6173 spin_unlock(&delayed_refs->lock); 6174 6175 BUG_ON(head->extent_op); 6176 if (head->must_insert_reserved) 6177 ret = 1; 6178 6179 mutex_unlock(&head->mutex); 6180 btrfs_put_delayed_ref(&head->node); 6181 return ret; 6182 out: 6183 spin_unlock(&head->lock); 6184 6185 out_delayed_unlock: 6186 spin_unlock(&delayed_refs->lock); 6187 return 0; 6188 } 6189 6190 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 6191 struct btrfs_root *root, 6192 struct extent_buffer *buf, 6193 u64 parent, int last_ref) 6194 { 6195 int pin = 1; 6196 int ret; 6197 6198 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6199 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6200 buf->start, buf->len, 6201 parent, root->root_key.objectid, 6202 btrfs_header_level(buf), 6203 BTRFS_DROP_DELAYED_REF, NULL, 0); 6204 BUG_ON(ret); /* -ENOMEM */ 6205 } 6206 6207 if (!last_ref) 6208 return; 6209 6210 if (btrfs_header_generation(buf) == trans->transid) { 6211 struct btrfs_block_group_cache *cache; 6212 6213 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6214 ret = check_ref_cleanup(trans, root, buf->start); 6215 if (!ret) 6216 goto out; 6217 } 6218 6219 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 6220 6221 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6222 pin_down_extent(root, cache, buf->start, buf->len, 1); 6223 btrfs_put_block_group(cache); 6224 goto out; 6225 } 6226 6227 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6228 6229 btrfs_add_free_space(cache, buf->start, buf->len); 6230 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 6231 btrfs_put_block_group(cache); 6232 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6233 pin = 0; 6234 } 6235 out: 6236 if (pin) 6237 add_pinned_bytes(root->fs_info, buf->len, 6238 btrfs_header_level(buf), 6239 root->root_key.objectid); 6240 6241 /* 6242 * Deleting the buffer, clear the 
corrupt flag since it doesn't matter 6243 * anymore. 6244 */ 6245 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6246 } 6247 6248 /* Can return -ENOMEM */ 6249 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6250 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6251 u64 owner, u64 offset, int no_quota) 6252 { 6253 int ret; 6254 struct btrfs_fs_info *fs_info = root->fs_info; 6255 6256 if (btrfs_test_is_dummy_root(root)) 6257 return 0; 6258 6259 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6260 6261 /* 6262 * tree log blocks never actually go into the extent allocation 6263 * tree, just update pinning info and exit early. 6264 */ 6265 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 6266 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 6267 /* unlocks the pinned mutex */ 6268 btrfs_pin_extent(root, bytenr, num_bytes, 1); 6269 ret = 0; 6270 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6271 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6272 num_bytes, 6273 parent, root_objectid, (int)owner, 6274 BTRFS_DROP_DELAYED_REF, NULL, no_quota); 6275 } else { 6276 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6277 num_bytes, 6278 parent, root_objectid, owner, 6279 offset, BTRFS_DROP_DELAYED_REF, 6280 NULL, no_quota); 6281 } 6282 return ret; 6283 } 6284 6285 /* 6286 * when we wait for progress in the block group caching, its because 6287 * our allocation attempt failed at least once. So, we must sleep 6288 * and let some progress happen before we try again. 6289 * 6290 * This function will sleep at least once waiting for new free space to 6291 * show up, and then it will check the block group free space numbers 6292 * for our min num_bytes. Another option is to have it go ahead 6293 * and look in the rbtree for a free extent of a given size, but this 6294 * is a good start. 6295 * 6296 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 6297 * any of the information in this block group. 6298 */ 6299 static noinline void 6300 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 6301 u64 num_bytes) 6302 { 6303 struct btrfs_caching_control *caching_ctl; 6304 6305 caching_ctl = get_caching_control(cache); 6306 if (!caching_ctl) 6307 return; 6308 6309 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 6310 (cache->free_space_ctl->free_space >= num_bytes)); 6311 6312 put_caching_control(caching_ctl); 6313 } 6314 6315 static noinline int 6316 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 6317 { 6318 struct btrfs_caching_control *caching_ctl; 6319 int ret = 0; 6320 6321 caching_ctl = get_caching_control(cache); 6322 if (!caching_ctl) 6323 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 6324 6325 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 6326 if (cache->cached == BTRFS_CACHE_ERROR) 6327 ret = -EIO; 6328 put_caching_control(caching_ctl); 6329 return ret; 6330 } 6331 6332 int __get_raid_index(u64 flags) 6333 { 6334 if (flags & BTRFS_BLOCK_GROUP_RAID10) 6335 return BTRFS_RAID_RAID10; 6336 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 6337 return BTRFS_RAID_RAID1; 6338 else if (flags & BTRFS_BLOCK_GROUP_DUP) 6339 return BTRFS_RAID_DUP; 6340 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 6341 return BTRFS_RAID_RAID0; 6342 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 6343 return BTRFS_RAID_RAID5; 6344 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 6345 return BTRFS_RAID_RAID6; 6346 6347 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 6348 } 6349 6350 int get_block_group_index(struct btrfs_block_group_cache *cache) 6351 { 6352 return __get_raid_index(cache->flags); 6353 } 6354 6355 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 6356 [BTRFS_RAID_RAID10] = "raid10", 6357 [BTRFS_RAID_RAID1] = "raid1", 6358 [BTRFS_RAID_DUP] = "dup", 6359 [BTRFS_RAID_RAID0] = "raid0", 6360 [BTRFS_RAID_SINGLE] = "single", 6361 [BTRFS_RAID_RAID5] = "raid5", 6362 [BTRFS_RAID_RAID6] = "raid6", 6363 }; 6364 6365 static const char *get_raid_name(enum btrfs_raid_types type) 6366 { 6367 if (type >= BTRFS_NR_RAID_TYPES) 6368 return NULL; 6369 6370 return btrfs_raid_type_names[type]; 6371 } 6372 6373 enum btrfs_loop_type { 6374 LOOP_CACHING_NOWAIT = 0, 6375 LOOP_CACHING_WAIT = 1, 6376 LOOP_ALLOC_CHUNK = 2, 6377 LOOP_NO_EMPTY_SIZE = 3, 6378 }; 6379 6380 static inline void 6381 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 6382 int delalloc) 6383 { 6384 if (delalloc) 6385 down_read(&cache->data_rwsem); 6386 } 6387 6388 static inline void 6389 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 6390 int delalloc) 6391 { 6392 btrfs_get_block_group(cache); 6393 if (delalloc) 6394 down_read(&cache->data_rwsem); 6395 } 6396 6397 static struct btrfs_block_group_cache * 6398 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 6399 struct btrfs_free_cluster *cluster, 6400 int delalloc) 6401 { 6402 struct btrfs_block_group_cache *used_bg; 6403 bool locked = false; 6404 again: 6405 spin_lock(&cluster->refill_lock); 6406 if (locked) { 6407 if (used_bg == cluster->block_group) 6408 return used_bg; 6409 6410 up_read(&used_bg->data_rwsem); 6411 btrfs_put_block_group(used_bg); 6412 } 6413 6414 used_bg = cluster->block_group; 6415 if (!used_bg) 6416 return NULL; 6417 6418 if (used_bg == block_group) 6419 return used_bg; 6420 6421 btrfs_get_block_group(used_bg); 6422 6423 if (!delalloc) 6424 return used_bg; 6425 6426 if (down_read_trylock(&used_bg->data_rwsem)) 6427 return used_bg; 6428 6429 spin_unlock(&cluster->refill_lock); 6430 down_read(&used_bg->data_rwsem); 6431 locked = true; 6432 goto again; 6433 } 6434 6435 static inline void 6436 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 6437 int delalloc) 6438 { 6439 if (delalloc) 6440 up_read(&cache->data_rwsem); 6441 btrfs_put_block_group(cache); 6442 } 6443 6444 /* 6445 * walks the btree of allocated extents and find a hole of a given size. 6446 * The key ins is changed to record the hole: 6447 * ins->objectid == start position 6448 * ins->flags = BTRFS_EXTENT_ITEM_KEY 6449 * ins->offset == the size of the hole. 6450 * Any available blocks before search_start are skipped. 
6451 * 6452 * If there is no suitable free space, we will record the max size of 6453 * the free space extent currently. 6454 */ 6455 static noinline int find_free_extent(struct btrfs_root *orig_root, 6456 u64 num_bytes, u64 empty_size, 6457 u64 hint_byte, struct btrfs_key *ins, 6458 u64 flags, int delalloc) 6459 { 6460 int ret = 0; 6461 struct btrfs_root *root = orig_root->fs_info->extent_root; 6462 struct btrfs_free_cluster *last_ptr = NULL; 6463 struct btrfs_block_group_cache *block_group = NULL; 6464 u64 search_start = 0; 6465 u64 max_extent_size = 0; 6466 int empty_cluster = 2 * 1024 * 1024; 6467 struct btrfs_space_info *space_info; 6468 int loop = 0; 6469 int index = __get_raid_index(flags); 6470 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 6471 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 6472 bool failed_cluster_refill = false; 6473 bool failed_alloc = false; 6474 bool use_cluster = true; 6475 bool have_caching_bg = false; 6476 6477 WARN_ON(num_bytes < root->sectorsize); 6478 ins->type = BTRFS_EXTENT_ITEM_KEY; 6479 ins->objectid = 0; 6480 ins->offset = 0; 6481 6482 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 6483 6484 space_info = __find_space_info(root->fs_info, flags); 6485 if (!space_info) { 6486 btrfs_err(root->fs_info, "No space info for %llu", flags); 6487 return -ENOSPC; 6488 } 6489 6490 /* 6491 * If the space info is for both data and metadata it means we have a 6492 * small filesystem and we can't use the clustering stuff. 6493 */ 6494 if (btrfs_mixed_space_info(space_info)) 6495 use_cluster = false; 6496 6497 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 6498 last_ptr = &root->fs_info->meta_alloc_cluster; 6499 if (!btrfs_test_opt(root, SSD)) 6500 empty_cluster = 64 * 1024; 6501 } 6502 6503 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 6504 btrfs_test_opt(root, SSD)) { 6505 last_ptr = &root->fs_info->data_alloc_cluster; 6506 } 6507 6508 if (last_ptr) { 6509 spin_lock(&last_ptr->lock); 6510 if (last_ptr->block_group) 6511 hint_byte = last_ptr->window_start; 6512 spin_unlock(&last_ptr->lock); 6513 } 6514 6515 search_start = max(search_start, first_logical_byte(root, 0)); 6516 search_start = max(search_start, hint_byte); 6517 6518 if (!last_ptr) 6519 empty_cluster = 0; 6520 6521 if (search_start == hint_byte) { 6522 block_group = btrfs_lookup_block_group(root->fs_info, 6523 search_start); 6524 /* 6525 * we don't want to use the block group if it doesn't match our 6526 * allocation bits, or if its not cached. 6527 * 6528 * However if we are re-searching with an ideal block group 6529 * picked out then we don't care that the block group is cached. 
6530 */ 6531 if (block_group && block_group_bits(block_group, flags) && 6532 block_group->cached != BTRFS_CACHE_NO) { 6533 down_read(&space_info->groups_sem); 6534 if (list_empty(&block_group->list) || 6535 block_group->ro) { 6536 /* 6537 * someone is removing this block group, 6538 * we can't jump into the have_block_group 6539 * target because our list pointers are not 6540 * valid 6541 */ 6542 btrfs_put_block_group(block_group); 6543 up_read(&space_info->groups_sem); 6544 } else { 6545 index = get_block_group_index(block_group); 6546 btrfs_lock_block_group(block_group, delalloc); 6547 goto have_block_group; 6548 } 6549 } else if (block_group) { 6550 btrfs_put_block_group(block_group); 6551 } 6552 } 6553 search: 6554 have_caching_bg = false; 6555 down_read(&space_info->groups_sem); 6556 list_for_each_entry(block_group, &space_info->block_groups[index], 6557 list) { 6558 u64 offset; 6559 int cached; 6560 6561 btrfs_grab_block_group(block_group, delalloc); 6562 search_start = block_group->key.objectid; 6563 6564 /* 6565 * this can happen if we end up cycling through all the 6566 * raid types, but we want to make sure we only allocate 6567 * for the proper type. 6568 */ 6569 if (!block_group_bits(block_group, flags)) { 6570 u64 extra = BTRFS_BLOCK_GROUP_DUP | 6571 BTRFS_BLOCK_GROUP_RAID1 | 6572 BTRFS_BLOCK_GROUP_RAID5 | 6573 BTRFS_BLOCK_GROUP_RAID6 | 6574 BTRFS_BLOCK_GROUP_RAID10; 6575 6576 /* 6577 * if they asked for extra copies and this block group 6578 * doesn't provide them, bail. This does allow us to 6579 * fill raid0 from raid1. 6580 */ 6581 if ((flags & extra) && !(block_group->flags & extra)) 6582 goto loop; 6583 } 6584 6585 have_block_group: 6586 cached = block_group_cache_done(block_group); 6587 if (unlikely(!cached)) { 6588 ret = cache_block_group(block_group, 0); 6589 BUG_ON(ret < 0); 6590 ret = 0; 6591 } 6592 6593 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 6594 goto loop; 6595 if (unlikely(block_group->ro)) 6596 goto loop; 6597 6598 /* 6599 * Ok we want to try and use the cluster allocator, so 6600 * lets look there 6601 */ 6602 if (last_ptr) { 6603 struct btrfs_block_group_cache *used_block_group; 6604 unsigned long aligned_cluster; 6605 /* 6606 * the refill lock keeps out other 6607 * people trying to start a new cluster 6608 */ 6609 used_block_group = btrfs_lock_cluster(block_group, 6610 last_ptr, 6611 delalloc); 6612 if (!used_block_group) 6613 goto refill_cluster; 6614 6615 if (used_block_group != block_group && 6616 (used_block_group->ro || 6617 !block_group_bits(used_block_group, flags))) 6618 goto release_cluster; 6619 6620 offset = btrfs_alloc_from_cluster(used_block_group, 6621 last_ptr, 6622 num_bytes, 6623 used_block_group->key.objectid, 6624 &max_extent_size); 6625 if (offset) { 6626 /* we have a block, we're done */ 6627 spin_unlock(&last_ptr->refill_lock); 6628 trace_btrfs_reserve_extent_cluster(root, 6629 used_block_group, 6630 search_start, num_bytes); 6631 if (used_block_group != block_group) { 6632 btrfs_release_block_group(block_group, 6633 delalloc); 6634 block_group = used_block_group; 6635 } 6636 goto checks; 6637 } 6638 6639 WARN_ON(last_ptr->block_group != used_block_group); 6640 release_cluster: 6641 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6642 * set up a new clusters, so lets just skip it 6643 * and let the allocator find whatever block 6644 * it can find. 
If we reach this point, we 6645 * will have tried the cluster allocator 6646 * plenty of times and not have found 6647 * anything, so we are likely way too 6648 * fragmented for the clustering stuff to find 6649 * anything. 6650 * 6651 * However, if the cluster is taken from the 6652 * current block group, release the cluster 6653 * first, so that we stand a better chance of 6654 * succeeding in the unclustered 6655 * allocation. */ 6656 if (loop >= LOOP_NO_EMPTY_SIZE && 6657 used_block_group != block_group) { 6658 spin_unlock(&last_ptr->refill_lock); 6659 btrfs_release_block_group(used_block_group, 6660 delalloc); 6661 goto unclustered_alloc; 6662 } 6663 6664 /* 6665 * this cluster didn't work out, free it and 6666 * start over 6667 */ 6668 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6669 6670 if (used_block_group != block_group) 6671 btrfs_release_block_group(used_block_group, 6672 delalloc); 6673 refill_cluster: 6674 if (loop >= LOOP_NO_EMPTY_SIZE) { 6675 spin_unlock(&last_ptr->refill_lock); 6676 goto unclustered_alloc; 6677 } 6678 6679 aligned_cluster = max_t(unsigned long, 6680 empty_cluster + empty_size, 6681 block_group->full_stripe_len); 6682 6683 /* allocate a cluster in this block group */ 6684 ret = btrfs_find_space_cluster(root, block_group, 6685 last_ptr, search_start, 6686 num_bytes, 6687 aligned_cluster); 6688 if (ret == 0) { 6689 /* 6690 * now pull our allocation out of this 6691 * cluster 6692 */ 6693 offset = btrfs_alloc_from_cluster(block_group, 6694 last_ptr, 6695 num_bytes, 6696 search_start, 6697 &max_extent_size); 6698 if (offset) { 6699 /* we found one, proceed */ 6700 spin_unlock(&last_ptr->refill_lock); 6701 trace_btrfs_reserve_extent_cluster(root, 6702 block_group, search_start, 6703 num_bytes); 6704 goto checks; 6705 } 6706 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6707 && !failed_cluster_refill) { 6708 spin_unlock(&last_ptr->refill_lock); 6709 6710 failed_cluster_refill = true; 6711 wait_block_group_cache_progress(block_group, 6712 num_bytes + empty_cluster + empty_size); 6713 goto have_block_group; 6714 } 6715 6716 /* 6717 * at this point we either didn't find a cluster 6718 * or we weren't able to allocate a block from our 6719 * cluster. Free the cluster we've been trying 6720 * to use, and go to the next block group 6721 */ 6722 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6723 spin_unlock(&last_ptr->refill_lock); 6724 goto loop; 6725 } 6726 6727 unclustered_alloc: 6728 spin_lock(&block_group->free_space_ctl->tree_lock); 6729 if (cached && 6730 block_group->free_space_ctl->free_space < 6731 num_bytes + empty_cluster + empty_size) { 6732 if (block_group->free_space_ctl->free_space > 6733 max_extent_size) 6734 max_extent_size = 6735 block_group->free_space_ctl->free_space; 6736 spin_unlock(&block_group->free_space_ctl->tree_lock); 6737 goto loop; 6738 } 6739 spin_unlock(&block_group->free_space_ctl->tree_lock); 6740 6741 offset = btrfs_find_space_for_alloc(block_group, search_start, 6742 num_bytes, empty_size, 6743 &max_extent_size); 6744 /* 6745 * If we didn't find a chunk, and we haven't failed on this 6746 * block group before, and this block group is in the middle of 6747 * caching and we are ok with waiting, then go ahead and wait 6748 * for progress to be made, and set failed_alloc to true. 6749 * 6750 * If failed_alloc is true then we've already waited on this 6751 * block group once and should move on to the next block group. 
6752 */ 6753 if (!offset && !failed_alloc && !cached && 6754 loop > LOOP_CACHING_NOWAIT) { 6755 wait_block_group_cache_progress(block_group, 6756 num_bytes + empty_size); 6757 failed_alloc = true; 6758 goto have_block_group; 6759 } else if (!offset) { 6760 if (!cached) 6761 have_caching_bg = true; 6762 goto loop; 6763 } 6764 checks: 6765 search_start = ALIGN(offset, root->stripesize); 6766 6767 /* move on to the next group */ 6768 if (search_start + num_bytes > 6769 block_group->key.objectid + block_group->key.offset) { 6770 btrfs_add_free_space(block_group, offset, num_bytes); 6771 goto loop; 6772 } 6773 6774 if (offset < search_start) 6775 btrfs_add_free_space(block_group, offset, 6776 search_start - offset); 6777 BUG_ON(offset > search_start); 6778 6779 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 6780 alloc_type, delalloc); 6781 if (ret == -EAGAIN) { 6782 btrfs_add_free_space(block_group, offset, num_bytes); 6783 goto loop; 6784 } 6785 6786 /* we are all good, lets return */ 6787 ins->objectid = search_start; 6788 ins->offset = num_bytes; 6789 6790 trace_btrfs_reserve_extent(orig_root, block_group, 6791 search_start, num_bytes); 6792 btrfs_release_block_group(block_group, delalloc); 6793 break; 6794 loop: 6795 failed_cluster_refill = false; 6796 failed_alloc = false; 6797 BUG_ON(index != get_block_group_index(block_group)); 6798 btrfs_release_block_group(block_group, delalloc); 6799 } 6800 up_read(&space_info->groups_sem); 6801 6802 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6803 goto search; 6804 6805 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6806 goto search; 6807 6808 /* 6809 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6810 * caching kthreads as we move along 6811 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6812 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6813 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6814 * again 6815 */ 6816 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6817 index = 0; 6818 loop++; 6819 if (loop == LOOP_ALLOC_CHUNK) { 6820 struct btrfs_trans_handle *trans; 6821 int exist = 0; 6822 6823 trans = current->journal_info; 6824 if (trans) 6825 exist = 1; 6826 else 6827 trans = btrfs_join_transaction(root); 6828 6829 if (IS_ERR(trans)) { 6830 ret = PTR_ERR(trans); 6831 goto out; 6832 } 6833 6834 ret = do_chunk_alloc(trans, root, flags, 6835 CHUNK_ALLOC_FORCE); 6836 /* 6837 * Do not bail out on ENOSPC since we 6838 * can do more things. 6839 */ 6840 if (ret < 0 && ret != -ENOSPC) 6841 btrfs_abort_transaction(trans, 6842 root, ret); 6843 else 6844 ret = 0; 6845 if (!exist) 6846 btrfs_end_transaction(trans, root); 6847 if (ret) 6848 goto out; 6849 } 6850 6851 if (loop == LOOP_NO_EMPTY_SIZE) { 6852 empty_size = 0; 6853 empty_cluster = 0; 6854 } 6855 6856 goto search; 6857 } else if (!ins->objectid) { 6858 ret = -ENOSPC; 6859 } else if (ins->objectid) { 6860 ret = 0; 6861 } 6862 out: 6863 if (ret == -ENOSPC) 6864 ins->offset = max_extent_size; 6865 return ret; 6866 } 6867 6868 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6869 int dump_block_groups) 6870 { 6871 struct btrfs_block_group_cache *cache; 6872 int index = 0; 6873 6874 spin_lock(&info->lock); 6875 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", 6876 info->flags, 6877 info->total_bytes - info->bytes_used - info->bytes_pinned - 6878 info->bytes_reserved - info->bytes_readonly, 6879 (info->full) ? 
"" : "not "); 6880 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " 6881 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6882 info->total_bytes, info->bytes_used, info->bytes_pinned, 6883 info->bytes_reserved, info->bytes_may_use, 6884 info->bytes_readonly); 6885 spin_unlock(&info->lock); 6886 6887 if (!dump_block_groups) 6888 return; 6889 6890 down_read(&info->groups_sem); 6891 again: 6892 list_for_each_entry(cache, &info->block_groups[index], list) { 6893 spin_lock(&cache->lock); 6894 printk(KERN_INFO "BTRFS: " 6895 "block group %llu has %llu bytes, " 6896 "%llu used %llu pinned %llu reserved %s\n", 6897 cache->key.objectid, cache->key.offset, 6898 btrfs_block_group_used(&cache->item), cache->pinned, 6899 cache->reserved, cache->ro ? "[readonly]" : ""); 6900 btrfs_dump_free_space(cache, bytes); 6901 spin_unlock(&cache->lock); 6902 } 6903 if (++index < BTRFS_NR_RAID_TYPES) 6904 goto again; 6905 up_read(&info->groups_sem); 6906 } 6907 6908 int btrfs_reserve_extent(struct btrfs_root *root, 6909 u64 num_bytes, u64 min_alloc_size, 6910 u64 empty_size, u64 hint_byte, 6911 struct btrfs_key *ins, int is_data, int delalloc) 6912 { 6913 bool final_tried = false; 6914 u64 flags; 6915 int ret; 6916 6917 flags = btrfs_get_alloc_profile(root, is_data); 6918 again: 6919 WARN_ON(num_bytes < root->sectorsize); 6920 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 6921 flags, delalloc); 6922 6923 if (ret == -ENOSPC) { 6924 if (!final_tried && ins->offset) { 6925 num_bytes = min(num_bytes >> 1, ins->offset); 6926 num_bytes = round_down(num_bytes, root->sectorsize); 6927 num_bytes = max(num_bytes, min_alloc_size); 6928 if (num_bytes == min_alloc_size) 6929 final_tried = true; 6930 goto again; 6931 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6932 struct btrfs_space_info *sinfo; 6933 6934 sinfo = __find_space_info(root->fs_info, flags); 6935 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6936 flags, num_bytes); 6937 if (sinfo) 6938 dump_space_info(sinfo, num_bytes, 1); 6939 } 6940 } 6941 6942 return ret; 6943 } 6944 6945 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6946 u64 start, u64 len, 6947 int pin, int delalloc) 6948 { 6949 struct btrfs_block_group_cache *cache; 6950 int ret = 0; 6951 6952 cache = btrfs_lookup_block_group(root->fs_info, start); 6953 if (!cache) { 6954 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6955 start); 6956 return -ENOSPC; 6957 } 6958 6959 if (btrfs_test_opt(root, DISCARD)) 6960 ret = btrfs_discard_extent(root, start, len, NULL); 6961 6962 if (pin) 6963 pin_down_extent(root, cache, start, len, 1); 6964 else { 6965 btrfs_add_free_space(cache, start, len); 6966 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 6967 } 6968 btrfs_put_block_group(cache); 6969 6970 trace_btrfs_reserved_extent_free(root, start, len); 6971 6972 return ret; 6973 } 6974 6975 int btrfs_free_reserved_extent(struct btrfs_root *root, 6976 u64 start, u64 len, int delalloc) 6977 { 6978 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); 6979 } 6980 6981 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6982 u64 start, u64 len) 6983 { 6984 return __btrfs_free_reserved_extent(root, start, len, 1, 0); 6985 } 6986 6987 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6988 struct btrfs_root *root, 6989 u64 parent, u64 root_objectid, 6990 u64 flags, u64 owner, u64 offset, 6991 struct btrfs_key *ins, int ref_mod) 6992 { 6993 int ret; 6994 struct 
btrfs_fs_info *fs_info = root->fs_info; 6995 struct btrfs_extent_item *extent_item; 6996 struct btrfs_extent_inline_ref *iref; 6997 struct btrfs_path *path; 6998 struct extent_buffer *leaf; 6999 int type; 7000 u32 size; 7001 7002 if (parent > 0) 7003 type = BTRFS_SHARED_DATA_REF_KEY; 7004 else 7005 type = BTRFS_EXTENT_DATA_REF_KEY; 7006 7007 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 7008 7009 path = btrfs_alloc_path(); 7010 if (!path) 7011 return -ENOMEM; 7012 7013 path->leave_spinning = 1; 7014 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7015 ins, size); 7016 if (ret) { 7017 btrfs_free_path(path); 7018 return ret; 7019 } 7020 7021 leaf = path->nodes[0]; 7022 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7023 struct btrfs_extent_item); 7024 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 7025 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7026 btrfs_set_extent_flags(leaf, extent_item, 7027 flags | BTRFS_EXTENT_FLAG_DATA); 7028 7029 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7030 btrfs_set_extent_inline_ref_type(leaf, iref, type); 7031 if (parent > 0) { 7032 struct btrfs_shared_data_ref *ref; 7033 ref = (struct btrfs_shared_data_ref *)(iref + 1); 7034 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7035 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 7036 } else { 7037 struct btrfs_extent_data_ref *ref; 7038 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 7039 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 7040 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 7041 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 7042 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 7043 } 7044 7045 btrfs_mark_buffer_dirty(path->nodes[0]); 7046 btrfs_free_path(path); 7047 7048 /* Always set parent to 0 here since it's exclusive anyway.
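 *
 * ("Exclusive" is meant in the qgroup sense: at this point the freshly
 * inserted data extent is referenced by this root only, which is why the
 * call below records it as BTRFS_QGROUP_OPER_ADD_EXCL.)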
*/ 7049 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7050 ins->objectid, ins->offset, 7051 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7052 if (ret) 7053 return ret; 7054 7055 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 7056 if (ret) { /* -ENOENT, logic error */ 7057 btrfs_err(fs_info, "update block group failed for %llu %llu", 7058 ins->objectid, ins->offset); 7059 BUG(); 7060 } 7061 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 7062 return ret; 7063 } 7064 7065 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 7066 struct btrfs_root *root, 7067 u64 parent, u64 root_objectid, 7068 u64 flags, struct btrfs_disk_key *key, 7069 int level, struct btrfs_key *ins, 7070 int no_quota) 7071 { 7072 int ret; 7073 struct btrfs_fs_info *fs_info = root->fs_info; 7074 struct btrfs_extent_item *extent_item; 7075 struct btrfs_tree_block_info *block_info; 7076 struct btrfs_extent_inline_ref *iref; 7077 struct btrfs_path *path; 7078 struct extent_buffer *leaf; 7079 u32 size = sizeof(*extent_item) + sizeof(*iref); 7080 u64 num_bytes = ins->offset; 7081 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7082 SKINNY_METADATA); 7083 7084 if (!skinny_metadata) 7085 size += sizeof(*block_info); 7086 7087 path = btrfs_alloc_path(); 7088 if (!path) { 7089 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7090 root->nodesize); 7091 return -ENOMEM; 7092 } 7093 7094 path->leave_spinning = 1; 7095 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7096 ins, size); 7097 if (ret) { 7098 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7099 root->nodesize); 7100 btrfs_free_path(path); 7101 return ret; 7102 } 7103 7104 leaf = path->nodes[0]; 7105 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7106 struct btrfs_extent_item); 7107 btrfs_set_extent_refs(leaf, extent_item, 1); 7108 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7109 btrfs_set_extent_flags(leaf, extent_item, 7110 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 7111 7112 if (skinny_metadata) { 7113 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7114 num_bytes = root->nodesize; 7115 } else { 7116 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7117 btrfs_set_tree_block_key(leaf, block_info, key); 7118 btrfs_set_tree_block_level(leaf, block_info, level); 7119 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 7120 } 7121 7122 if (parent > 0) { 7123 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 7124 btrfs_set_extent_inline_ref_type(leaf, iref, 7125 BTRFS_SHARED_BLOCK_REF_KEY); 7126 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7127 } else { 7128 btrfs_set_extent_inline_ref_type(leaf, iref, 7129 BTRFS_TREE_BLOCK_REF_KEY); 7130 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 7131 } 7132 7133 btrfs_mark_buffer_dirty(leaf); 7134 btrfs_free_path(path); 7135 7136 if (!no_quota) { 7137 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7138 ins->objectid, num_bytes, 7139 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7140 if (ret) 7141 return ret; 7142 } 7143 7144 ret = update_block_group(trans, root, ins->objectid, root->nodesize, 7145 1); 7146 if (ret) { /* -ENOENT, logic error */ 7147 btrfs_err(fs_info, "update block group failed for %llu %llu", 7148 ins->objectid, ins->offset); 7149 BUG(); 7150 } 7151 7152 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); 7153 return ret; 7154 } 7155 7156 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7157 struct 
btrfs_root *root, 7158 u64 root_objectid, u64 owner, 7159 u64 offset, struct btrfs_key *ins) 7160 { 7161 int ret; 7162 7163 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 7164 7165 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 7166 ins->offset, 0, 7167 root_objectid, owner, offset, 7168 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 7169 return ret; 7170 } 7171 7172 /* 7173 * this is used by the tree logging recovery code. It records that 7174 * an extent has been allocated and makes sure to clear the free 7175 * space cache bits as well 7176 */ 7177 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 7178 struct btrfs_root *root, 7179 u64 root_objectid, u64 owner, u64 offset, 7180 struct btrfs_key *ins) 7181 { 7182 int ret; 7183 struct btrfs_block_group_cache *block_group; 7184 7185 /* 7186 * Mixed block groups will exclude before processing the log so we only 7187 * need to do the exclude dance if this fs isn't mixed. 7188 */ 7189 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { 7190 ret = __exclude_logged_extent(root, ins->objectid, ins->offset); 7191 if (ret) 7192 return ret; 7193 } 7194 7195 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 7196 if (!block_group) 7197 return -EINVAL; 7198 7199 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 7200 RESERVE_ALLOC_NO_ACCOUNT, 0); 7201 BUG_ON(ret); /* logic error */ 7202 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 7203 0, owner, offset, ins, 1); 7204 btrfs_put_block_group(block_group); 7205 return ret; 7206 } 7207 7208 static struct extent_buffer * 7209 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 7210 u64 bytenr, int level) 7211 { 7212 struct extent_buffer *buf; 7213 7214 buf = btrfs_find_create_tree_block(root, bytenr); 7215 if (!buf) 7216 return ERR_PTR(-ENOMEM); 7217 btrfs_set_header_generation(buf, trans->transid); 7218 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 7219 btrfs_tree_lock(buf); 7220 clean_tree_block(trans, root, buf); 7221 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 7222 7223 btrfs_set_lock_blocking(buf); 7224 btrfs_set_buffer_uptodate(buf); 7225 7226 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 7227 buf->log_index = root->log_transid % 2; 7228 /* 7229 * we allow two log transactions at a time, use different 7230 * EXTENT bits to differentiate dirty pages.
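 *
 * Concretely, as set just below: log_index 0 marks the range with the
 * EXTENT_DIRTY bit in root->dirty_log_pages, while log_index 1 marks it
 * with EXTENT_NEW.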
7231 */ 7232 if (buf->log_index == 0) 7233 set_extent_dirty(&root->dirty_log_pages, buf->start, 7234 buf->start + buf->len - 1, GFP_NOFS); 7235 else 7236 set_extent_new(&root->dirty_log_pages, buf->start, 7237 buf->start + buf->len - 1, GFP_NOFS); 7238 } else { 7239 buf->log_index = -1; 7240 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7241 buf->start + buf->len - 1, GFP_NOFS); 7242 } 7243 trans->blocks_used++; 7244 /* this returns a buffer locked for blocking */ 7245 return buf; 7246 } 7247 7248 static struct btrfs_block_rsv * 7249 use_block_rsv(struct btrfs_trans_handle *trans, 7250 struct btrfs_root *root, u32 blocksize) 7251 { 7252 struct btrfs_block_rsv *block_rsv; 7253 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 7254 int ret; 7255 bool global_updated = false; 7256 7257 block_rsv = get_block_rsv(trans, root); 7258 7259 if (unlikely(block_rsv->size == 0)) 7260 goto try_reserve; 7261 again: 7262 ret = block_rsv_use_bytes(block_rsv, blocksize); 7263 if (!ret) 7264 return block_rsv; 7265 7266 if (block_rsv->failfast) 7267 return ERR_PTR(ret); 7268 7269 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 7270 global_updated = true; 7271 update_global_block_rsv(root->fs_info); 7272 goto again; 7273 } 7274 7275 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 7276 static DEFINE_RATELIMIT_STATE(_rs, 7277 DEFAULT_RATELIMIT_INTERVAL * 10, 7278 /*DEFAULT_RATELIMIT_BURST*/ 1); 7279 if (__ratelimit(&_rs)) 7280 WARN(1, KERN_DEBUG 7281 "BTRFS: block rsv returned %d\n", ret); 7282 } 7283 try_reserve: 7284 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 7285 BTRFS_RESERVE_NO_FLUSH); 7286 if (!ret) 7287 return block_rsv; 7288 /* 7289 * If we couldn't reserve metadata bytes try and use some from 7290 * the global reserve if its space type is the same as the global 7291 * reservation. 7292 */ 7293 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 7294 block_rsv->space_info == global_rsv->space_info) { 7295 ret = block_rsv_use_bytes(global_rsv, blocksize); 7296 if (!ret) 7297 return global_rsv; 7298 } 7299 return ERR_PTR(ret); 7300 } 7301 7302 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 7303 struct btrfs_block_rsv *block_rsv, u32 blocksize) 7304 { 7305 block_rsv_add_bytes(block_rsv, blocksize, 0); 7306 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 7307 } 7308 7309 /* 7310 * finds a free extent and does all the dirty work required for allocation 7311 * returns the key for the extent through ins, and a tree buffer for 7312 * the first block of the extent through buf. 7313 * 7314 * returns the tree buffer or NULL. 
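 *
 * Illustrative sketch only (no such caller appears in this excerpt); the
 * names trans, root, parent, disk_key, level and hint stand in for whatever
 * the real caller has at hand, and failures surface as ERR_PTR values:
 *
 *	buf = btrfs_alloc_tree_block(trans, root, parent,
 *				     root->root_key.objectid, &disk_key,
 *				     level, hint, 0);
 *	if (IS_ERR(buf))
 *		return PTR_ERR(buf);
 *	... the buffer comes back locked for blocking and already marked
 *	    dirty in the current transaction ...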
7315 */ 7316 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 7317 struct btrfs_root *root, 7318 u64 parent, u64 root_objectid, 7319 struct btrfs_disk_key *key, int level, 7320 u64 hint, u64 empty_size) 7321 { 7322 struct btrfs_key ins; 7323 struct btrfs_block_rsv *block_rsv; 7324 struct extent_buffer *buf; 7325 u64 flags = 0; 7326 int ret; 7327 u32 blocksize = root->nodesize; 7328 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7329 SKINNY_METADATA); 7330 7331 if (btrfs_test_is_dummy_root(root)) { 7332 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7333 level); 7334 if (!IS_ERR(buf)) 7335 root->alloc_bytenr += blocksize; 7336 return buf; 7337 } 7338 7339 block_rsv = use_block_rsv(trans, root, blocksize); 7340 if (IS_ERR(block_rsv)) 7341 return ERR_CAST(block_rsv); 7342 7343 ret = btrfs_reserve_extent(root, blocksize, blocksize, 7344 empty_size, hint, &ins, 0, 0); 7345 if (ret) { 7346 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 7347 return ERR_PTR(ret); 7348 } 7349 7350 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); 7351 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7352 7353 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7354 if (parent == 0) 7355 parent = ins.objectid; 7356 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 7357 } else 7358 BUG_ON(parent > 0); 7359 7360 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 7361 struct btrfs_delayed_extent_op *extent_op; 7362 extent_op = btrfs_alloc_delayed_extent_op(); 7363 BUG_ON(!extent_op); /* -ENOMEM */ 7364 if (key) 7365 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 7366 else 7367 memset(&extent_op->key, 0, sizeof(extent_op->key)); 7368 extent_op->flags_to_set = flags; 7369 if (skinny_metadata) 7370 extent_op->update_key = 0; 7371 else 7372 extent_op->update_key = 1; 7373 extent_op->update_flags = 1; 7374 extent_op->is_data = 0; 7375 extent_op->level = level; 7376 7377 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 7378 ins.objectid, 7379 ins.offset, parent, root_objectid, 7380 level, BTRFS_ADD_DELAYED_EXTENT, 7381 extent_op, 0); 7382 BUG_ON(ret); /* -ENOMEM */ 7383 } 7384 return buf; 7385 } 7386 7387 struct walk_control { 7388 u64 refs[BTRFS_MAX_LEVEL]; 7389 u64 flags[BTRFS_MAX_LEVEL]; 7390 struct btrfs_key update_progress; 7391 int stage; 7392 int level; 7393 int shared_level; 7394 int update_ref; 7395 int keep_locks; 7396 int reada_slot; 7397 int reada_count; 7398 int for_reloc; 7399 }; 7400 7401 #define DROP_REFERENCE 1 7402 #define UPDATE_BACKREF 2 7403 7404 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 7405 struct btrfs_root *root, 7406 struct walk_control *wc, 7407 struct btrfs_path *path) 7408 { 7409 u64 bytenr; 7410 u64 generation; 7411 u64 refs; 7412 u64 flags; 7413 u32 nritems; 7414 u32 blocksize; 7415 struct btrfs_key key; 7416 struct extent_buffer *eb; 7417 int ret; 7418 int slot; 7419 int nread = 0; 7420 7421 if (path->slots[wc->level] < wc->reada_slot) { 7422 wc->reada_count = wc->reada_count * 2 / 3; 7423 wc->reada_count = max(wc->reada_count, 2); 7424 } else { 7425 wc->reada_count = wc->reada_count * 3 / 2; 7426 wc->reada_count = min_t(int, wc->reada_count, 7427 BTRFS_NODEPTRS_PER_BLOCK(root)); 7428 } 7429 7430 eb = path->nodes[wc->level]; 7431 nritems = btrfs_header_nritems(eb); 7432 blocksize = root->nodesize; 7433 7434 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7435 if (nread >= wc->reada_count) 7436 break; 7437 7438 cond_resched(); 7439 bytenr = btrfs_node_blockptr(eb, slot); 7440 generation = 
btrfs_node_ptr_generation(eb, slot); 7441 7442 if (slot == path->slots[wc->level]) 7443 goto reada; 7444 7445 if (wc->stage == UPDATE_BACKREF && 7446 generation <= root->root_key.offset) 7447 continue; 7448 7449 /* We don't lock the tree block, it's OK to be racy here */ 7450 ret = btrfs_lookup_extent_info(trans, root, bytenr, 7451 wc->level - 1, 1, &refs, 7452 &flags); 7453 /* We don't care about errors in readahead. */ 7454 if (ret < 0) 7455 continue; 7456 BUG_ON(refs == 0); 7457 7458 if (wc->stage == DROP_REFERENCE) { 7459 if (refs == 1) 7460 goto reada; 7461 7462 if (wc->level == 1 && 7463 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7464 continue; 7465 if (!wc->update_ref || 7466 generation <= root->root_key.offset) 7467 continue; 7468 btrfs_node_key_to_cpu(eb, &key, slot); 7469 ret = btrfs_comp_cpu_keys(&key, 7470 &wc->update_progress); 7471 if (ret < 0) 7472 continue; 7473 } else { 7474 if (wc->level == 1 && 7475 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7476 continue; 7477 } 7478 reada: 7479 readahead_tree_block(root, bytenr); 7480 nread++; 7481 } 7482 wc->reada_slot = slot; 7483 } 7484 7485 static int account_leaf_items(struct btrfs_trans_handle *trans, 7486 struct btrfs_root *root, 7487 struct extent_buffer *eb) 7488 { 7489 int nr = btrfs_header_nritems(eb); 7490 int i, extent_type, ret; 7491 struct btrfs_key key; 7492 struct btrfs_file_extent_item *fi; 7493 u64 bytenr, num_bytes; 7494 7495 for (i = 0; i < nr; i++) { 7496 btrfs_item_key_to_cpu(eb, &key, i); 7497 7498 if (key.type != BTRFS_EXTENT_DATA_KEY) 7499 continue; 7500 7501 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 7502 /* filter out non qgroup-accountable extents */ 7503 extent_type = btrfs_file_extent_type(eb, fi); 7504 7505 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 7506 continue; 7507 7508 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 7509 if (!bytenr) 7510 continue; 7511 7512 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 7513 7514 ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7515 root->objectid, 7516 bytenr, num_bytes, 7517 BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); 7518 if (ret) 7519 return ret; 7520 } 7521 return 0; 7522 } 7523 7524 /* 7525 * Walk up the tree from the bottom, freeing leaves and any interior 7526 * nodes which have had all slots visited. If a node (leaf or 7527 * interior) is freed, the node above it will have it's slot 7528 * incremented. The root node will never be freed. 7529 * 7530 * At the end of this function, we should have a path which has all 7531 * slots incremented to the next position for a search. If we need to 7532 * read a new node it will be NULL and the node above it will have the 7533 * correct slot selected for a later read. 7534 * 7535 * If we increment the root nodes slot counter past the number of 7536 * elements, 1 is returned to signal completion of the search. 7537 */ 7538 static int adjust_slots_upwards(struct btrfs_root *root, 7539 struct btrfs_path *path, int root_level) 7540 { 7541 int level = 0; 7542 int nr, slot; 7543 struct extent_buffer *eb; 7544 7545 if (root_level == 0) 7546 return 1; 7547 7548 while (level <= root_level) { 7549 eb = path->nodes[level]; 7550 nr = btrfs_header_nritems(eb); 7551 path->slots[level]++; 7552 slot = path->slots[level]; 7553 if (slot >= nr || level == 0) { 7554 /* 7555 * Don't free the root - we will detect this 7556 * condition after our loop and return a 7557 * positive value for caller to stop walking the tree. 
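 *
 * As a concrete example with root_level == 2: the leaf at level 0 is
 * always dropped from the path and path->slots[1] is bumped; if the
 * level 1 node is then exhausted as well it is unlocked and dropped too
 * and path->slots[2] is bumped; once the root's slot passes its item
 * count, the check after this loop returns 1.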
7558 */ 7559 if (level != root_level) { 7560 btrfs_tree_unlock_rw(eb, path->locks[level]); 7561 path->locks[level] = 0; 7562 7563 free_extent_buffer(eb); 7564 path->nodes[level] = NULL; 7565 path->slots[level] = 0; 7566 } 7567 } else { 7568 /* 7569 * We have a valid slot to walk back down 7570 * from. Stop here so caller can process these 7571 * new nodes. 7572 */ 7573 break; 7574 } 7575 7576 level++; 7577 } 7578 7579 eb = path->nodes[root_level]; 7580 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 7581 return 1; 7582 7583 return 0; 7584 } 7585 7586 /* 7587 * root_eb is the subtree root and is locked before this function is called. 7588 */ 7589 static int account_shared_subtree(struct btrfs_trans_handle *trans, 7590 struct btrfs_root *root, 7591 struct extent_buffer *root_eb, 7592 u64 root_gen, 7593 int root_level) 7594 { 7595 int ret = 0; 7596 int level; 7597 struct extent_buffer *eb = root_eb; 7598 struct btrfs_path *path = NULL; 7599 7600 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); 7601 BUG_ON(root_eb == NULL); 7602 7603 if (!root->fs_info->quota_enabled) 7604 return 0; 7605 7606 if (!extent_buffer_uptodate(root_eb)) { 7607 ret = btrfs_read_buffer(root_eb, root_gen); 7608 if (ret) 7609 goto out; 7610 } 7611 7612 if (root_level == 0) { 7613 ret = account_leaf_items(trans, root, root_eb); 7614 goto out; 7615 } 7616 7617 path = btrfs_alloc_path(); 7618 if (!path) 7619 return -ENOMEM; 7620 7621 /* 7622 * Walk down the tree. Missing extent blocks are filled in as 7623 * we go. Metadata is accounted every time we read a new 7624 * extent block. 7625 * 7626 * When we reach a leaf, we account for file extent items in it, 7627 * walk back up the tree (adjusting slot pointers as we go) 7628 * and restart the search process. 7629 */ 7630 extent_buffer_get(root_eb); /* For path */ 7631 path->nodes[root_level] = root_eb; 7632 path->slots[root_level] = 0; 7633 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 7634 walk_down: 7635 level = root_level; 7636 while (level >= 0) { 7637 if (path->nodes[level] == NULL) { 7638 int parent_slot; 7639 u64 child_gen; 7640 u64 child_bytenr; 7641 7642 /* We need to get child blockptr/gen from 7643 * parent before we can read it. */ 7644 eb = path->nodes[level + 1]; 7645 parent_slot = path->slots[level + 1]; 7646 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 7647 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7648 7649 eb = read_tree_block(root, child_bytenr, child_gen); 7650 if (!eb || !extent_buffer_uptodate(eb)) { 7651 ret = -EIO; 7652 goto out; 7653 } 7654 7655 path->nodes[level] = eb; 7656 path->slots[level] = 0; 7657 7658 btrfs_tree_read_lock(eb); 7659 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 7660 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 7661 7662 ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7663 root->objectid, 7664 child_bytenr, 7665 root->nodesize, 7666 BTRFS_QGROUP_OPER_SUB_SUBTREE, 7667 0); 7668 if (ret) 7669 goto out; 7670 7671 } 7672 7673 if (level == 0) { 7674 ret = account_leaf_items(trans, root, path->nodes[level]); 7675 if (ret) 7676 goto out; 7677 7678 /* Nonzero return here means we completed our search */ 7679 ret = adjust_slots_upwards(root, path, root_level); 7680 if (ret) 7681 break; 7682 7683 /* Restart search with new slots */ 7684 goto walk_down; 7685 } 7686 7687 level--; 7688 } 7689 7690 ret = 0; 7691 out: 7692 btrfs_free_path(path); 7693 7694 return ret; 7695 } 7696 7697 /* 7698 * helper to process tree block while walking down the tree. 
7699 * 7700 * when wc->stage == UPDATE_BACKREF, this function updates 7701 * back refs for pointers in the block. 7702 * 7703 * NOTE: return value 1 means we should stop walking down. 7704 */ 7705 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 7706 struct btrfs_root *root, 7707 struct btrfs_path *path, 7708 struct walk_control *wc, int lookup_info) 7709 { 7710 int level = wc->level; 7711 struct extent_buffer *eb = path->nodes[level]; 7712 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7713 int ret; 7714 7715 if (wc->stage == UPDATE_BACKREF && 7716 btrfs_header_owner(eb) != root->root_key.objectid) 7717 return 1; 7718 7719 /* 7720 * when reference count of tree block is 1, it won't increase 7721 * again. once full backref flag is set, we never clear it. 7722 */ 7723 if (lookup_info && 7724 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 7725 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 7726 BUG_ON(!path->locks[level]); 7727 ret = btrfs_lookup_extent_info(trans, root, 7728 eb->start, level, 1, 7729 &wc->refs[level], 7730 &wc->flags[level]); 7731 BUG_ON(ret == -ENOMEM); 7732 if (ret) 7733 return ret; 7734 BUG_ON(wc->refs[level] == 0); 7735 } 7736 7737 if (wc->stage == DROP_REFERENCE) { 7738 if (wc->refs[level] > 1) 7739 return 1; 7740 7741 if (path->locks[level] && !wc->keep_locks) { 7742 btrfs_tree_unlock_rw(eb, path->locks[level]); 7743 path->locks[level] = 0; 7744 } 7745 return 0; 7746 } 7747 7748 /* wc->stage == UPDATE_BACKREF */ 7749 if (!(wc->flags[level] & flag)) { 7750 BUG_ON(!path->locks[level]); 7751 ret = btrfs_inc_ref(trans, root, eb, 1); 7752 BUG_ON(ret); /* -ENOMEM */ 7753 ret = btrfs_dec_ref(trans, root, eb, 0); 7754 BUG_ON(ret); /* -ENOMEM */ 7755 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7756 eb->len, flag, 7757 btrfs_header_level(eb), 0); 7758 BUG_ON(ret); /* -ENOMEM */ 7759 wc->flags[level] |= flag; 7760 } 7761 7762 /* 7763 * the block is shared by multiple trees, so it's not good to 7764 * keep the tree lock 7765 */ 7766 if (path->locks[level] && level > 0) { 7767 btrfs_tree_unlock_rw(eb, path->locks[level]); 7768 path->locks[level] = 0; 7769 } 7770 return 0; 7771 } 7772 7773 /* 7774 * helper to process tree block pointer. 7775 * 7776 * when wc->stage == DROP_REFERENCE, this function checks 7777 * reference count of the block pointed to. if the block 7778 * is shared and we need update back refs for the subtree 7779 * rooted at the block, this function changes wc->stage to 7780 * UPDATE_BACKREF. if the block is shared and there is no 7781 * need to update back, this function drops the reference 7782 * to the block. 7783 * 7784 * NOTE: return value 1 means we should stop walking down. 
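 *
 * Illustrative flow: while dropping a snapshot, reaching a shared child
 * block that is newer than the snapshot (and update_ref is set) switches
 * wc->stage to UPDATE_BACKREF for the subtree below it; walk_up_proc()
 * switches back to DROP_REFERENCE once that subtree has been processed,
 * and the drop continues freeing blocks.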
7785 */ 7786 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 7787 struct btrfs_root *root, 7788 struct btrfs_path *path, 7789 struct walk_control *wc, int *lookup_info) 7790 { 7791 u64 bytenr; 7792 u64 generation; 7793 u64 parent; 7794 u32 blocksize; 7795 struct btrfs_key key; 7796 struct extent_buffer *next; 7797 int level = wc->level; 7798 int reada = 0; 7799 int ret = 0; 7800 bool need_account = false; 7801 7802 generation = btrfs_node_ptr_generation(path->nodes[level], 7803 path->slots[level]); 7804 /* 7805 * if the lower level block was created before the snapshot 7806 * was created, we know there is no need to update back refs 7807 * for the subtree 7808 */ 7809 if (wc->stage == UPDATE_BACKREF && 7810 generation <= root->root_key.offset) { 7811 *lookup_info = 1; 7812 return 1; 7813 } 7814 7815 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7816 blocksize = root->nodesize; 7817 7818 next = btrfs_find_tree_block(root, bytenr); 7819 if (!next) { 7820 next = btrfs_find_create_tree_block(root, bytenr); 7821 if (!next) 7822 return -ENOMEM; 7823 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7824 level - 1); 7825 reada = 1; 7826 } 7827 btrfs_tree_lock(next); 7828 btrfs_set_lock_blocking(next); 7829 7830 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7831 &wc->refs[level - 1], 7832 &wc->flags[level - 1]); 7833 if (ret < 0) { 7834 btrfs_tree_unlock(next); 7835 return ret; 7836 } 7837 7838 if (unlikely(wc->refs[level - 1] == 0)) { 7839 btrfs_err(root->fs_info, "Missing references."); 7840 BUG(); 7841 } 7842 *lookup_info = 0; 7843 7844 if (wc->stage == DROP_REFERENCE) { 7845 if (wc->refs[level - 1] > 1) { 7846 need_account = true; 7847 if (level == 1 && 7848 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7849 goto skip; 7850 7851 if (!wc->update_ref || 7852 generation <= root->root_key.offset) 7853 goto skip; 7854 7855 btrfs_node_key_to_cpu(path->nodes[level], &key, 7856 path->slots[level]); 7857 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7858 if (ret < 0) 7859 goto skip; 7860 7861 wc->stage = UPDATE_BACKREF; 7862 wc->shared_level = level - 1; 7863 } 7864 } else { 7865 if (level == 1 && 7866 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7867 goto skip; 7868 } 7869 7870 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7871 btrfs_tree_unlock(next); 7872 free_extent_buffer(next); 7873 next = NULL; 7874 *lookup_info = 1; 7875 } 7876 7877 if (!next) { 7878 if (reada && level == 1) 7879 reada_walk_down(trans, root, wc, path); 7880 next = read_tree_block(root, bytenr, generation); 7881 if (!next || !extent_buffer_uptodate(next)) { 7882 free_extent_buffer(next); 7883 return -EIO; 7884 } 7885 btrfs_tree_lock(next); 7886 btrfs_set_lock_blocking(next); 7887 } 7888 7889 level--; 7890 BUG_ON(level != btrfs_header_level(next)); 7891 path->nodes[level] = next; 7892 path->slots[level] = 0; 7893 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7894 wc->level = level; 7895 if (wc->level == 1) 7896 wc->reada_slot = 0; 7897 return 0; 7898 skip: 7899 wc->refs[level - 1] = 0; 7900 wc->flags[level - 1] = 0; 7901 if (wc->stage == DROP_REFERENCE) { 7902 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7903 parent = path->nodes[level]->start; 7904 } else { 7905 BUG_ON(root->root_key.objectid != 7906 btrfs_header_owner(path->nodes[level])); 7907 parent = 0; 7908 } 7909 7910 if (need_account) { 7911 ret = account_shared_subtree(trans, root, next, 7912 generation, level - 1); 7913 if (ret) { 7914 printk_ratelimited(KERN_ERR 
"BTRFS: %s Error " 7915 "%d accounting shared subtree. Quota " 7916 "is out of sync, rescan required.\n", 7917 root->fs_info->sb->s_id, ret); 7918 } 7919 } 7920 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7921 root->root_key.objectid, level - 1, 0, 0); 7922 BUG_ON(ret); /* -ENOMEM */ 7923 } 7924 btrfs_tree_unlock(next); 7925 free_extent_buffer(next); 7926 *lookup_info = 1; 7927 return 1; 7928 } 7929 7930 /* 7931 * helper to process tree block while walking up the tree. 7932 * 7933 * when wc->stage == DROP_REFERENCE, this function drops 7934 * reference count on the block. 7935 * 7936 * when wc->stage == UPDATE_BACKREF, this function changes 7937 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7938 * to UPDATE_BACKREF previously while processing the block. 7939 * 7940 * NOTE: return value 1 means we should stop walking up. 7941 */ 7942 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7943 struct btrfs_root *root, 7944 struct btrfs_path *path, 7945 struct walk_control *wc) 7946 { 7947 int ret; 7948 int level = wc->level; 7949 struct extent_buffer *eb = path->nodes[level]; 7950 u64 parent = 0; 7951 7952 if (wc->stage == UPDATE_BACKREF) { 7953 BUG_ON(wc->shared_level < level); 7954 if (level < wc->shared_level) 7955 goto out; 7956 7957 ret = find_next_key(path, level + 1, &wc->update_progress); 7958 if (ret > 0) 7959 wc->update_ref = 0; 7960 7961 wc->stage = DROP_REFERENCE; 7962 wc->shared_level = -1; 7963 path->slots[level] = 0; 7964 7965 /* 7966 * check reference count again if the block isn't locked. 7967 * we should start walking down the tree again if reference 7968 * count is one. 7969 */ 7970 if (!path->locks[level]) { 7971 BUG_ON(level == 0); 7972 btrfs_tree_lock(eb); 7973 btrfs_set_lock_blocking(eb); 7974 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7975 7976 ret = btrfs_lookup_extent_info(trans, root, 7977 eb->start, level, 1, 7978 &wc->refs[level], 7979 &wc->flags[level]); 7980 if (ret < 0) { 7981 btrfs_tree_unlock_rw(eb, path->locks[level]); 7982 path->locks[level] = 0; 7983 return ret; 7984 } 7985 BUG_ON(wc->refs[level] == 0); 7986 if (wc->refs[level] == 1) { 7987 btrfs_tree_unlock_rw(eb, path->locks[level]); 7988 path->locks[level] = 0; 7989 return 1; 7990 } 7991 } 7992 } 7993 7994 /* wc->stage == DROP_REFERENCE */ 7995 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7996 7997 if (wc->refs[level] == 1) { 7998 if (level == 0) { 7999 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8000 ret = btrfs_dec_ref(trans, root, eb, 1); 8001 else 8002 ret = btrfs_dec_ref(trans, root, eb, 0); 8003 BUG_ON(ret); /* -ENOMEM */ 8004 ret = account_leaf_items(trans, root, eb); 8005 if (ret) { 8006 printk_ratelimited(KERN_ERR "BTRFS: %s Error " 8007 "%d accounting leaf items. 
Quota " 8008 "is out of sync, rescan required.\n", 8009 root->fs_info->sb->s_id, ret); 8010 } 8011 } 8012 /* make block locked assertion in clean_tree_block happy */ 8013 if (!path->locks[level] && 8014 btrfs_header_generation(eb) == trans->transid) { 8015 btrfs_tree_lock(eb); 8016 btrfs_set_lock_blocking(eb); 8017 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8018 } 8019 clean_tree_block(trans, root, eb); 8020 } 8021 8022 if (eb == root->node) { 8023 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8024 parent = eb->start; 8025 else 8026 BUG_ON(root->root_key.objectid != 8027 btrfs_header_owner(eb)); 8028 } else { 8029 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8030 parent = path->nodes[level + 1]->start; 8031 else 8032 BUG_ON(root->root_key.objectid != 8033 btrfs_header_owner(path->nodes[level + 1])); 8034 } 8035 8036 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 8037 out: 8038 wc->refs[level] = 0; 8039 wc->flags[level] = 0; 8040 return 0; 8041 } 8042 8043 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 8044 struct btrfs_root *root, 8045 struct btrfs_path *path, 8046 struct walk_control *wc) 8047 { 8048 int level = wc->level; 8049 int lookup_info = 1; 8050 int ret; 8051 8052 while (level >= 0) { 8053 ret = walk_down_proc(trans, root, path, wc, lookup_info); 8054 if (ret > 0) 8055 break; 8056 8057 if (level == 0) 8058 break; 8059 8060 if (path->slots[level] >= 8061 btrfs_header_nritems(path->nodes[level])) 8062 break; 8063 8064 ret = do_walk_down(trans, root, path, wc, &lookup_info); 8065 if (ret > 0) { 8066 path->slots[level]++; 8067 continue; 8068 } else if (ret < 0) 8069 return ret; 8070 level = wc->level; 8071 } 8072 return 0; 8073 } 8074 8075 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 8076 struct btrfs_root *root, 8077 struct btrfs_path *path, 8078 struct walk_control *wc, int max_level) 8079 { 8080 int level = wc->level; 8081 int ret; 8082 8083 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 8084 while (level < max_level && path->nodes[level]) { 8085 wc->level = level; 8086 if (path->slots[level] + 1 < 8087 btrfs_header_nritems(path->nodes[level])) { 8088 path->slots[level]++; 8089 return 0; 8090 } else { 8091 ret = walk_up_proc(trans, root, path, wc); 8092 if (ret > 0) 8093 return 0; 8094 8095 if (path->locks[level]) { 8096 btrfs_tree_unlock_rw(path->nodes[level], 8097 path->locks[level]); 8098 path->locks[level] = 0; 8099 } 8100 free_extent_buffer(path->nodes[level]); 8101 path->nodes[level] = NULL; 8102 level++; 8103 } 8104 } 8105 return 1; 8106 } 8107 8108 /* 8109 * drop a subvolume tree. 8110 * 8111 * this function traverses the tree freeing any blocks that only 8112 * referenced by the tree. 8113 * 8114 * when a shared tree block is found. this function decreases its 8115 * reference count by one. if update_ref is true, this function 8116 * also make sure backrefs for the shared block and all lower level 8117 * blocks are properly updated. 
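 *
 * A typical call is sketched below; the argument values are only an
 * illustration of the common "delete a dead subvolume" case:
 *
 *	btrfs_drop_snapshot(root, NULL, 1, 0);
 *
 * Progress is recorded in root_item->drop_progress and drop_level after
 * each batch of work, so an interrupted drop resumes where it left off.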
8118 * 8119 * If called with for_reloc == 0, may exit early with -EAGAIN 8120 */ 8121 int btrfs_drop_snapshot(struct btrfs_root *root, 8122 struct btrfs_block_rsv *block_rsv, int update_ref, 8123 int for_reloc) 8124 { 8125 struct btrfs_path *path; 8126 struct btrfs_trans_handle *trans; 8127 struct btrfs_root *tree_root = root->fs_info->tree_root; 8128 struct btrfs_root_item *root_item = &root->root_item; 8129 struct walk_control *wc; 8130 struct btrfs_key key; 8131 int err = 0; 8132 int ret; 8133 int level; 8134 bool root_dropped = false; 8135 8136 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); 8137 8138 path = btrfs_alloc_path(); 8139 if (!path) { 8140 err = -ENOMEM; 8141 goto out; 8142 } 8143 8144 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8145 if (!wc) { 8146 btrfs_free_path(path); 8147 err = -ENOMEM; 8148 goto out; 8149 } 8150 8151 trans = btrfs_start_transaction(tree_root, 0); 8152 if (IS_ERR(trans)) { 8153 err = PTR_ERR(trans); 8154 goto out_free; 8155 } 8156 8157 if (block_rsv) 8158 trans->block_rsv = block_rsv; 8159 8160 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 8161 level = btrfs_header_level(root->node); 8162 path->nodes[level] = btrfs_lock_root_node(root); 8163 btrfs_set_lock_blocking(path->nodes[level]); 8164 path->slots[level] = 0; 8165 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8166 memset(&wc->update_progress, 0, 8167 sizeof(wc->update_progress)); 8168 } else { 8169 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 8170 memcpy(&wc->update_progress, &key, 8171 sizeof(wc->update_progress)); 8172 8173 level = root_item->drop_level; 8174 BUG_ON(level == 0); 8175 path->lowest_level = level; 8176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 8177 path->lowest_level = 0; 8178 if (ret < 0) { 8179 err = ret; 8180 goto out_end_trans; 8181 } 8182 WARN_ON(ret > 0); 8183 8184 /* 8185 * unlock our path, this is safe because only this 8186 * function is allowed to delete this snapshot 8187 */ 8188 btrfs_unlock_up_safe(path, 0); 8189 8190 level = btrfs_header_level(root->node); 8191 while (1) { 8192 btrfs_tree_lock(path->nodes[level]); 8193 btrfs_set_lock_blocking(path->nodes[level]); 8194 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8195 8196 ret = btrfs_lookup_extent_info(trans, root, 8197 path->nodes[level]->start, 8198 level, 1, &wc->refs[level], 8199 &wc->flags[level]); 8200 if (ret < 0) { 8201 err = ret; 8202 goto out_end_trans; 8203 } 8204 BUG_ON(wc->refs[level] == 0); 8205 8206 if (level == root_item->drop_level) 8207 break; 8208 8209 btrfs_tree_unlock(path->nodes[level]); 8210 path->locks[level] = 0; 8211 WARN_ON(wc->refs[level] != 1); 8212 level--; 8213 } 8214 } 8215 8216 wc->level = level; 8217 wc->shared_level = -1; 8218 wc->stage = DROP_REFERENCE; 8219 wc->update_ref = update_ref; 8220 wc->keep_locks = 0; 8221 wc->for_reloc = for_reloc; 8222 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8223 8224 while (1) { 8225 8226 ret = walk_down_tree(trans, root, path, wc); 8227 if (ret < 0) { 8228 err = ret; 8229 break; 8230 } 8231 8232 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 8233 if (ret < 0) { 8234 err = ret; 8235 break; 8236 } 8237 8238 if (ret > 0) { 8239 BUG_ON(wc->stage != DROP_REFERENCE); 8240 break; 8241 } 8242 8243 if (wc->stage == DROP_REFERENCE) { 8244 level = wc->level; 8245 btrfs_node_key(path->nodes[level], 8246 &root_item->drop_progress, 8247 path->slots[level]); 8248 root_item->drop_level = level; 8249 } 8250 8251 BUG_ON(wc->level == 0); 8252 if (btrfs_should_end_transaction(trans, tree_root) 
|| 8253 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 8254 ret = btrfs_update_root(trans, tree_root, 8255 &root->root_key, 8256 root_item); 8257 if (ret) { 8258 btrfs_abort_transaction(trans, tree_root, ret); 8259 err = ret; 8260 goto out_end_trans; 8261 } 8262 8263 /* 8264 * Qgroup update accounting is run from 8265 * delayed ref handling. This usually works 8266 * out because delayed refs are normally the 8267 * only way qgroup updates are added. However, 8268 * we may have added updates during our tree 8269 * walk so run qgroups here to make sure we 8270 * don't lose any updates. 8271 */ 8272 ret = btrfs_delayed_qgroup_accounting(trans, 8273 root->fs_info); 8274 if (ret) 8275 printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8276 "running qgroup updates " 8277 "during snapshot delete. " 8278 "Quota is out of sync, " 8279 "rescan required.\n", ret); 8280 8281 btrfs_end_transaction_throttle(trans, tree_root); 8282 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8283 pr_debug("BTRFS: drop snapshot early exit\n"); 8284 err = -EAGAIN; 8285 goto out_free; 8286 } 8287 8288 trans = btrfs_start_transaction(tree_root, 0); 8289 if (IS_ERR(trans)) { 8290 err = PTR_ERR(trans); 8291 goto out_free; 8292 } 8293 if (block_rsv) 8294 trans->block_rsv = block_rsv; 8295 } 8296 } 8297 btrfs_release_path(path); 8298 if (err) 8299 goto out_end_trans; 8300 8301 ret = btrfs_del_root(trans, tree_root, &root->root_key); 8302 if (ret) { 8303 btrfs_abort_transaction(trans, tree_root, ret); 8304 goto out_end_trans; 8305 } 8306 8307 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 8308 ret = btrfs_find_root(tree_root, &root->root_key, path, 8309 NULL, NULL); 8310 if (ret < 0) { 8311 btrfs_abort_transaction(trans, tree_root, ret); 8312 err = ret; 8313 goto out_end_trans; 8314 } else if (ret > 0) { 8315 /* if we fail to delete the orphan item this time 8316 * around, it'll get picked up the next time. 8317 * 8318 * The most common failure here is just -ENOENT. 8319 */ 8320 btrfs_del_orphan_item(trans, tree_root, 8321 root->root_key.objectid); 8322 } 8323 } 8324 8325 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 8326 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 8327 } else { 8328 free_extent_buffer(root->node); 8329 free_extent_buffer(root->commit_root); 8330 btrfs_put_fs_root(root); 8331 } 8332 root_dropped = true; 8333 out_end_trans: 8334 ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); 8335 if (ret) 8336 printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8337 "running qgroup updates " 8338 "during snapshot delete. " 8339 "Quota is out of sync, " 8340 "rescan required.\n", ret); 8341 8342 btrfs_end_transaction_throttle(trans, tree_root); 8343 out_free: 8344 kfree(wc); 8345 btrfs_free_path(path); 8346 out: 8347 /* 8348 * So if we need to stop dropping the snapshot for whatever reason we 8349 * need to make sure to add it back to the dead root list so that we 8350 * keep trying to do the work later. This also cleans up roots if we 8351 * don't have it in the radix (like when we recover after a power fail 8352 * or unmount) so we don't leak memory. 8353 */ 8354 if (!for_reloc && root_dropped == false) 8355 btrfs_add_dead_root(root); 8356 if (err && err != -EAGAIN) 8357 btrfs_std_error(root->fs_info, err); 8358 return err; 8359 } 8360 8361 /* 8362 * drop subtree rooted at tree block 'node'. 
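 * (Both 'node' and 'parent' must be locked extent buffers; the walk
 * reuses the walk_control machinery of btrfs_drop_snapshot(), with
 * 'parent' acting as the highest level, which is never freed here.)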
8363 * 8364 * NOTE: this function will unlock and release tree block 'node' 8365 * only used by relocation code 8366 */ 8367 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 8368 struct btrfs_root *root, 8369 struct extent_buffer *node, 8370 struct extent_buffer *parent) 8371 { 8372 struct btrfs_path *path; 8373 struct walk_control *wc; 8374 int level; 8375 int parent_level; 8376 int ret = 0; 8377 int wret; 8378 8379 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 8380 8381 path = btrfs_alloc_path(); 8382 if (!path) 8383 return -ENOMEM; 8384 8385 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8386 if (!wc) { 8387 btrfs_free_path(path); 8388 return -ENOMEM; 8389 } 8390 8391 btrfs_assert_tree_locked(parent); 8392 parent_level = btrfs_header_level(parent); 8393 extent_buffer_get(parent); 8394 path->nodes[parent_level] = parent; 8395 path->slots[parent_level] = btrfs_header_nritems(parent); 8396 8397 btrfs_assert_tree_locked(node); 8398 level = btrfs_header_level(node); 8399 path->nodes[level] = node; 8400 path->slots[level] = 0; 8401 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8402 8403 wc->refs[parent_level] = 1; 8404 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8405 wc->level = level; 8406 wc->shared_level = -1; 8407 wc->stage = DROP_REFERENCE; 8408 wc->update_ref = 0; 8409 wc->keep_locks = 1; 8410 wc->for_reloc = 1; 8411 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8412 8413 while (1) { 8414 wret = walk_down_tree(trans, root, path, wc); 8415 if (wret < 0) { 8416 ret = wret; 8417 break; 8418 } 8419 8420 wret = walk_up_tree(trans, root, path, wc, parent_level); 8421 if (wret < 0) 8422 ret = wret; 8423 if (wret != 0) 8424 break; 8425 } 8426 8427 kfree(wc); 8428 btrfs_free_path(path); 8429 return ret; 8430 } 8431 8432 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 8433 { 8434 u64 num_devices; 8435 u64 stripped; 8436 8437 /* 8438 * if restripe for this chunk_type is on pick target profile and 8439 * return, otherwise do the usual balance 8440 */ 8441 stripped = get_restripe_target(root->fs_info, flags); 8442 if (stripped) 8443 return extended_to_chunk(stripped); 8444 8445 num_devices = root->fs_info->fs_devices->rw_devices; 8446 8447 stripped = BTRFS_BLOCK_GROUP_RAID0 | 8448 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 8449 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 8450 8451 if (num_devices == 1) { 8452 stripped |= BTRFS_BLOCK_GROUP_DUP; 8453 stripped = flags & ~stripped; 8454 8455 /* turn raid0 into single device chunks */ 8456 if (flags & BTRFS_BLOCK_GROUP_RAID0) 8457 return stripped; 8458 8459 /* turn mirroring into duplication */ 8460 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 8461 BTRFS_BLOCK_GROUP_RAID10)) 8462 return stripped | BTRFS_BLOCK_GROUP_DUP; 8463 } else { 8464 /* they already had raid on here, just return */ 8465 if (flags & stripped) 8466 return flags; 8467 8468 stripped |= BTRFS_BLOCK_GROUP_DUP; 8469 stripped = flags & ~stripped; 8470 8471 /* switch duplicated blocks with raid1 */ 8472 if (flags & BTRFS_BLOCK_GROUP_DUP) 8473 return stripped | BTRFS_BLOCK_GROUP_RAID1; 8474 8475 /* this is drive concat, leave it alone */ 8476 } 8477 8478 return flags; 8479 } 8480 8481 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 8482 { 8483 struct btrfs_space_info *sinfo = cache->space_info; 8484 u64 num_bytes; 8485 u64 min_allocable_bytes; 8486 int ret = -ENOSPC; 8487 8488 8489 /* 8490 * We need some metadata space and system metadata space for 8491 * allocating chunks in some corner cases 
until we force to set 8492 * it to be readonly. 8493 */ 8494 if ((sinfo->flags & 8495 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 8496 !force) 8497 min_allocable_bytes = 1 * 1024 * 1024; 8498 else 8499 min_allocable_bytes = 0; 8500 8501 spin_lock(&sinfo->lock); 8502 spin_lock(&cache->lock); 8503 8504 if (cache->ro) { 8505 ret = 0; 8506 goto out; 8507 } 8508 8509 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8510 cache->bytes_super - btrfs_block_group_used(&cache->item); 8511 8512 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 8513 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 8514 min_allocable_bytes <= sinfo->total_bytes) { 8515 sinfo->bytes_readonly += num_bytes; 8516 cache->ro = 1; 8517 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 8518 ret = 0; 8519 } 8520 out: 8521 spin_unlock(&cache->lock); 8522 spin_unlock(&sinfo->lock); 8523 return ret; 8524 } 8525 8526 int btrfs_set_block_group_ro(struct btrfs_root *root, 8527 struct btrfs_block_group_cache *cache) 8528 8529 { 8530 struct btrfs_trans_handle *trans; 8531 u64 alloc_flags; 8532 int ret; 8533 8534 BUG_ON(cache->ro); 8535 8536 trans = btrfs_join_transaction(root); 8537 if (IS_ERR(trans)) 8538 return PTR_ERR(trans); 8539 8540 ret = set_block_group_ro(cache, 0); 8541 if (!ret) 8542 goto out; 8543 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8544 ret = do_chunk_alloc(trans, root, alloc_flags, 8545 CHUNK_ALLOC_FORCE); 8546 if (ret < 0) 8547 goto out; 8548 ret = set_block_group_ro(cache, 0); 8549 out: 8550 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 8551 alloc_flags = update_block_group_flags(root, cache->flags); 8552 check_system_chunk(trans, root, alloc_flags); 8553 } 8554 8555 btrfs_end_transaction(trans, root); 8556 return ret; 8557 } 8558 8559 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 8560 struct btrfs_root *root, u64 type) 8561 { 8562 u64 alloc_flags = get_alloc_profile(root, type); 8563 return do_chunk_alloc(trans, root, alloc_flags, 8564 CHUNK_ALLOC_FORCE); 8565 } 8566 8567 /* 8568 * helper to account the unused space of all the readonly block group in the 8569 * space_info. takes mirrors into account. 
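 *
 * For example (illustrative): a read-only RAID1 block group whose
 * key.offset is 1GiB with 256MiB used contributes
 * (1GiB - 256MiB) * 2 bytes to the returned total, since every byte in
 * it occupies two mirrored copies on disk.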
8570 */ 8571 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 8572 { 8573 struct btrfs_block_group_cache *block_group; 8574 u64 free_bytes = 0; 8575 int factor; 8576 8577 /* It's df, we don't care if it's racey */ 8578 if (list_empty(&sinfo->ro_bgs)) 8579 return 0; 8580 8581 spin_lock(&sinfo->lock); 8582 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 8583 spin_lock(&block_group->lock); 8584 8585 if (!block_group->ro) { 8586 spin_unlock(&block_group->lock); 8587 continue; 8588 } 8589 8590 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 8591 BTRFS_BLOCK_GROUP_RAID10 | 8592 BTRFS_BLOCK_GROUP_DUP)) 8593 factor = 2; 8594 else 8595 factor = 1; 8596 8597 free_bytes += (block_group->key.offset - 8598 btrfs_block_group_used(&block_group->item)) * 8599 factor; 8600 8601 spin_unlock(&block_group->lock); 8602 } 8603 spin_unlock(&sinfo->lock); 8604 8605 return free_bytes; 8606 } 8607 8608 void btrfs_set_block_group_rw(struct btrfs_root *root, 8609 struct btrfs_block_group_cache *cache) 8610 { 8611 struct btrfs_space_info *sinfo = cache->space_info; 8612 u64 num_bytes; 8613 8614 BUG_ON(!cache->ro); 8615 8616 spin_lock(&sinfo->lock); 8617 spin_lock(&cache->lock); 8618 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8619 cache->bytes_super - btrfs_block_group_used(&cache->item); 8620 sinfo->bytes_readonly -= num_bytes; 8621 cache->ro = 0; 8622 list_del_init(&cache->ro_list); 8623 spin_unlock(&cache->lock); 8624 spin_unlock(&sinfo->lock); 8625 } 8626 8627 /* 8628 * checks to see if its even possible to relocate this block group. 8629 * 8630 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 8631 * ok to go ahead and try. 8632 */ 8633 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 8634 { 8635 struct btrfs_block_group_cache *block_group; 8636 struct btrfs_space_info *space_info; 8637 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 8638 struct btrfs_device *device; 8639 struct btrfs_trans_handle *trans; 8640 u64 min_free; 8641 u64 dev_min = 1; 8642 u64 dev_nr = 0; 8643 u64 target; 8644 int index; 8645 int full = 0; 8646 int ret = 0; 8647 8648 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 8649 8650 /* odd, couldn't find the block group, leave it alone */ 8651 if (!block_group) 8652 return -1; 8653 8654 min_free = btrfs_block_group_used(&block_group->item); 8655 8656 /* no bytes used, we're good */ 8657 if (!min_free) 8658 goto out; 8659 8660 space_info = block_group->space_info; 8661 spin_lock(&space_info->lock); 8662 8663 full = space_info->full; 8664 8665 /* 8666 * if this is the last block group we have in this space, we can't 8667 * relocate it unless we're able to allocate a new chunk below. 8668 * 8669 * Otherwise, we need to make sure we have room in the space to handle 8670 * all of the extents from this block group. If we can, we're good 8671 */ 8672 if ((space_info->total_bytes != block_group->key.offset) && 8673 (space_info->bytes_used + space_info->bytes_reserved + 8674 space_info->bytes_pinned + space_info->bytes_readonly + 8675 min_free < space_info->total_bytes)) { 8676 spin_unlock(&space_info->lock); 8677 goto out; 8678 } 8679 spin_unlock(&space_info->lock); 8680 8681 /* 8682 * ok we don't have enough space, but maybe we have free space on our 8683 * devices to allocate new chunks for relocation, so loop through our 8684 * alloc devices and guess if we have enough space. 
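 * (For example, illustratively: a RAID1 block group needs at least two
 * devices that can each fit min_free, while a DUP block group needs a
 * single device with room for twice min_free.)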
if this block 8685 * group is going to be restriped, run checks against the target 8686 * profile instead of the current one. 8687 */ 8688 ret = -1; 8689 8690 /* 8691 * index: 8692 * 0: raid10 8693 * 1: raid1 8694 * 2: dup 8695 * 3: raid0 8696 * 4: single 8697 */ 8698 target = get_restripe_target(root->fs_info, block_group->flags); 8699 if (target) { 8700 index = __get_raid_index(extended_to_chunk(target)); 8701 } else { 8702 /* 8703 * this is just a balance, so if we were marked as full 8704 * we know there is no space for a new chunk 8705 */ 8706 if (full) 8707 goto out; 8708 8709 index = get_block_group_index(block_group); 8710 } 8711 8712 if (index == BTRFS_RAID_RAID10) { 8713 dev_min = 4; 8714 /* Divide by 2 */ 8715 min_free >>= 1; 8716 } else if (index == BTRFS_RAID_RAID1) { 8717 dev_min = 2; 8718 } else if (index == BTRFS_RAID_DUP) { 8719 /* Multiply by 2 */ 8720 min_free <<= 1; 8721 } else if (index == BTRFS_RAID_RAID0) { 8722 dev_min = fs_devices->rw_devices; 8723 do_div(min_free, dev_min); 8724 } 8725 8726 /* We need to do this so that we can look at pending chunks */ 8727 trans = btrfs_join_transaction(root); 8728 if (IS_ERR(trans)) { 8729 ret = PTR_ERR(trans); 8730 goto out; 8731 } 8732 8733 mutex_lock(&root->fs_info->chunk_mutex); 8734 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8735 u64 dev_offset; 8736 8737 /* 8738 * check to make sure we can actually find a chunk with enough 8739 * space to fit our block group in. 8740 */ 8741 if (device->total_bytes > device->bytes_used + min_free && 8742 !device->is_tgtdev_for_dev_replace) { 8743 ret = find_free_dev_extent(trans, device, min_free, 8744 &dev_offset, NULL); 8745 if (!ret) 8746 dev_nr++; 8747 8748 if (dev_nr >= dev_min) 8749 break; 8750 8751 ret = -1; 8752 } 8753 } 8754 mutex_unlock(&root->fs_info->chunk_mutex); 8755 btrfs_end_transaction(trans, root); 8756 out: 8757 btrfs_put_block_group(block_group); 8758 return ret; 8759 } 8760 8761 static int find_first_block_group(struct btrfs_root *root, 8762 struct btrfs_path *path, struct btrfs_key *key) 8763 { 8764 int ret = 0; 8765 struct btrfs_key found_key; 8766 struct extent_buffer *leaf; 8767 int slot; 8768 8769 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 8770 if (ret < 0) 8771 goto out; 8772 8773 while (1) { 8774 slot = path->slots[0]; 8775 leaf = path->nodes[0]; 8776 if (slot >= btrfs_header_nritems(leaf)) { 8777 ret = btrfs_next_leaf(root, path); 8778 if (ret == 0) 8779 continue; 8780 if (ret < 0) 8781 goto out; 8782 break; 8783 } 8784 btrfs_item_key_to_cpu(leaf, &found_key, slot); 8785 8786 if (found_key.objectid >= key->objectid && 8787 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 8788 ret = 0; 8789 goto out; 8790 } 8791 path->slots[0]++; 8792 } 8793 out: 8794 return ret; 8795 } 8796 8797 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 8798 { 8799 struct btrfs_block_group_cache *block_group; 8800 u64 last = 0; 8801 8802 while (1) { 8803 struct inode *inode; 8804 8805 block_group = btrfs_lookup_first_block_group(info, last); 8806 while (block_group) { 8807 spin_lock(&block_group->lock); 8808 if (block_group->iref) 8809 break; 8810 spin_unlock(&block_group->lock); 8811 block_group = next_block_group(info->tree_root, 8812 block_group); 8813 } 8814 if (!block_group) { 8815 if (last == 0) 8816 break; 8817 last = 0; 8818 continue; 8819 } 8820 8821 inode = block_group->inode; 8822 block_group->iref = 0; 8823 block_group->inode = NULL; 8824 spin_unlock(&block_group->lock); 8825 iput(inode); 8826 last = block_group->key.objectid 
+ block_group->key.offset; 8827 btrfs_put_block_group(block_group); 8828 } 8829 } 8830 8831 int btrfs_free_block_groups(struct btrfs_fs_info *info) 8832 { 8833 struct btrfs_block_group_cache *block_group; 8834 struct btrfs_space_info *space_info; 8835 struct btrfs_caching_control *caching_ctl; 8836 struct rb_node *n; 8837 8838 down_write(&info->commit_root_sem); 8839 while (!list_empty(&info->caching_block_groups)) { 8840 caching_ctl = list_entry(info->caching_block_groups.next, 8841 struct btrfs_caching_control, list); 8842 list_del(&caching_ctl->list); 8843 put_caching_control(caching_ctl); 8844 } 8845 up_write(&info->commit_root_sem); 8846 8847 spin_lock(&info->unused_bgs_lock); 8848 while (!list_empty(&info->unused_bgs)) { 8849 block_group = list_first_entry(&info->unused_bgs, 8850 struct btrfs_block_group_cache, 8851 bg_list); 8852 list_del_init(&block_group->bg_list); 8853 btrfs_put_block_group(block_group); 8854 } 8855 spin_unlock(&info->unused_bgs_lock); 8856 8857 spin_lock(&info->block_group_cache_lock); 8858 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8859 block_group = rb_entry(n, struct btrfs_block_group_cache, 8860 cache_node); 8861 rb_erase(&block_group->cache_node, 8862 &info->block_group_cache_tree); 8863 RB_CLEAR_NODE(&block_group->cache_node); 8864 spin_unlock(&info->block_group_cache_lock); 8865 8866 down_write(&block_group->space_info->groups_sem); 8867 list_del(&block_group->list); 8868 up_write(&block_group->space_info->groups_sem); 8869 8870 if (block_group->cached == BTRFS_CACHE_STARTED) 8871 wait_block_group_cache_done(block_group); 8872 8873 /* 8874 * We haven't cached this block group, which means we could 8875 * possibly have excluded extents on this block group. 8876 */ 8877 if (block_group->cached == BTRFS_CACHE_NO || 8878 block_group->cached == BTRFS_CACHE_ERROR) 8879 free_excluded_extents(info->extent_root, block_group); 8880 8881 btrfs_remove_free_space_cache(block_group); 8882 btrfs_put_block_group(block_group); 8883 8884 spin_lock(&info->block_group_cache_lock); 8885 } 8886 spin_unlock(&info->block_group_cache_lock); 8887 8888 /* now that all the block groups are freed, go through and 8889 * free all the space_info structs. This is only called during 8890 * the final stages of unmount, and so we know nobody is 8891 * using them. We call synchronize_rcu() once before we start, 8892 * just to be on the safe side. 
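 * (Each space_info's per-RAID-type kobjects are dropped before the
 * space_info kobject itself, undoing what __link_block_group() set up.)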
8893 */ 8894 synchronize_rcu(); 8895 8896 release_global_block_rsv(info); 8897 8898 while (!list_empty(&info->space_info)) { 8899 int i; 8900 8901 space_info = list_entry(info->space_info.next, 8902 struct btrfs_space_info, 8903 list); 8904 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { 8905 if (WARN_ON(space_info->bytes_pinned > 0 || 8906 space_info->bytes_reserved > 0 || 8907 space_info->bytes_may_use > 0)) { 8908 dump_space_info(space_info, 0, 0); 8909 } 8910 } 8911 list_del(&space_info->list); 8912 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 8913 struct kobject *kobj; 8914 kobj = space_info->block_group_kobjs[i]; 8915 space_info->block_group_kobjs[i] = NULL; 8916 if (kobj) { 8917 kobject_del(kobj); 8918 kobject_put(kobj); 8919 } 8920 } 8921 kobject_del(&space_info->kobj); 8922 kobject_put(&space_info->kobj); 8923 } 8924 return 0; 8925 } 8926 8927 static void __link_block_group(struct btrfs_space_info *space_info, 8928 struct btrfs_block_group_cache *cache) 8929 { 8930 int index = get_block_group_index(cache); 8931 bool first = false; 8932 8933 down_write(&space_info->groups_sem); 8934 if (list_empty(&space_info->block_groups[index])) 8935 first = true; 8936 list_add_tail(&cache->list, &space_info->block_groups[index]); 8937 up_write(&space_info->groups_sem); 8938 8939 if (first) { 8940 struct raid_kobject *rkobj; 8941 int ret; 8942 8943 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 8944 if (!rkobj) 8945 goto out_err; 8946 rkobj->raid_type = index; 8947 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 8948 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 8949 "%s", get_raid_name(index)); 8950 if (ret) { 8951 kobject_put(&rkobj->kobj); 8952 goto out_err; 8953 } 8954 space_info->block_group_kobjs[index] = &rkobj->kobj; 8955 } 8956 8957 return; 8958 out_err: 8959 pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n"); 8960 } 8961 8962 static struct btrfs_block_group_cache * 8963 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) 8964 { 8965 struct btrfs_block_group_cache *cache; 8966 8967 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8968 if (!cache) 8969 return NULL; 8970 8971 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 8972 GFP_NOFS); 8973 if (!cache->free_space_ctl) { 8974 kfree(cache); 8975 return NULL; 8976 } 8977 8978 cache->key.objectid = start; 8979 cache->key.offset = size; 8980 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8981 8982 cache->sectorsize = root->sectorsize; 8983 cache->fs_info = root->fs_info; 8984 cache->full_stripe_len = btrfs_full_stripe_len(root, 8985 &root->fs_info->mapping_tree, 8986 start); 8987 atomic_set(&cache->count, 1); 8988 spin_lock_init(&cache->lock); 8989 init_rwsem(&cache->data_rwsem); 8990 INIT_LIST_HEAD(&cache->list); 8991 INIT_LIST_HEAD(&cache->cluster_list); 8992 INIT_LIST_HEAD(&cache->bg_list); 8993 INIT_LIST_HEAD(&cache->ro_list); 8994 INIT_LIST_HEAD(&cache->dirty_list); 8995 btrfs_init_free_space_ctl(cache); 8996 atomic_set(&cache->trimming, 0); 8997 8998 return cache; 8999 } 9000 9001 int btrfs_read_block_groups(struct btrfs_root *root) 9002 { 9003 struct btrfs_path *path; 9004 int ret; 9005 struct btrfs_block_group_cache *cache; 9006 struct btrfs_fs_info *info = root->fs_info; 9007 struct btrfs_space_info *space_info; 9008 struct btrfs_key key; 9009 struct btrfs_key found_key; 9010 struct extent_buffer *leaf; 9011 int need_clear = 0; 9012 u64 cache_gen; 9013 9014 root = info->extent_root; 9015 key.objectid = 0; 9016 key.offset = 0; 9017 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9018 path = btrfs_alloc_path(); 9019 if (!path) 9020 return -ENOMEM; 9021 path->reada = 1; 9022 9023 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 9024 if (btrfs_test_opt(root, SPACE_CACHE) && 9025 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 9026 need_clear = 1; 9027 if (btrfs_test_opt(root, CLEAR_CACHE)) 9028 need_clear = 1; 9029 9030 while (1) { 9031 ret = find_first_block_group(root, path, &key); 9032 if (ret > 0) 9033 break; 9034 if (ret != 0) 9035 goto error; 9036 9037 leaf = path->nodes[0]; 9038 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 9039 9040 cache = btrfs_create_block_group_cache(root, found_key.objectid, 9041 found_key.offset); 9042 if (!cache) { 9043 ret = -ENOMEM; 9044 goto error; 9045 } 9046 9047 if (need_clear) { 9048 /* 9049 * When we mount with old space cache, we need to 9050 * set BTRFS_DC_CLEAR and set dirty flag. 9051 * 9052 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 9053 * truncate the old free space cache inode and 9054 * setup a new one. 9055 * b) Setting 'dirty flag' makes sure that we flush 9056 * the new space cache info onto disk. 9057 */ 9058 if (btrfs_test_opt(root, SPACE_CACHE)) 9059 cache->disk_cache_state = BTRFS_DC_CLEAR; 9060 } 9061 9062 read_extent_buffer(leaf, &cache->item, 9063 btrfs_item_ptr_offset(leaf, path->slots[0]), 9064 sizeof(cache->item)); 9065 cache->flags = btrfs_block_group_flags(&cache->item); 9066 9067 key.objectid = found_key.objectid + found_key.offset; 9068 btrfs_release_path(path); 9069 9070 /* 9071 * We need to exclude the super stripes now so that the space 9072 * info has super bytes accounted for, otherwise we'll think 9073 * we have more space than we actually do. 
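 * (exclude_super_stripes() also fills in cache->bytes_super, which is
 * added to space_info->bytes_readonly a little further down.)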
9074 */ 9075 ret = exclude_super_stripes(root, cache); 9076 if (ret) { 9077 /* 9078 * We may have excluded something, so call this just in 9079 * case. 9080 */ 9081 free_excluded_extents(root, cache); 9082 btrfs_put_block_group(cache); 9083 goto error; 9084 } 9085 9086 /* 9087 * check for two cases, either we are full, and therefore 9088 * don't need to bother with the caching work since we won't 9089 * find any space, or we are empty, and we can just add all 9090 * the space in and be done with it. This saves us _alot_ of 9091 * time, particularly in the full case. 9092 */ 9093 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 9094 cache->last_byte_to_unpin = (u64)-1; 9095 cache->cached = BTRFS_CACHE_FINISHED; 9096 free_excluded_extents(root, cache); 9097 } else if (btrfs_block_group_used(&cache->item) == 0) { 9098 cache->last_byte_to_unpin = (u64)-1; 9099 cache->cached = BTRFS_CACHE_FINISHED; 9100 add_new_free_space(cache, root->fs_info, 9101 found_key.objectid, 9102 found_key.objectid + 9103 found_key.offset); 9104 free_excluded_extents(root, cache); 9105 } 9106 9107 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9108 if (ret) { 9109 btrfs_remove_free_space_cache(cache); 9110 btrfs_put_block_group(cache); 9111 goto error; 9112 } 9113 9114 ret = update_space_info(info, cache->flags, found_key.offset, 9115 btrfs_block_group_used(&cache->item), 9116 &space_info); 9117 if (ret) { 9118 btrfs_remove_free_space_cache(cache); 9119 spin_lock(&info->block_group_cache_lock); 9120 rb_erase(&cache->cache_node, 9121 &info->block_group_cache_tree); 9122 RB_CLEAR_NODE(&cache->cache_node); 9123 spin_unlock(&info->block_group_cache_lock); 9124 btrfs_put_block_group(cache); 9125 goto error; 9126 } 9127 9128 cache->space_info = space_info; 9129 spin_lock(&cache->space_info->lock); 9130 cache->space_info->bytes_readonly += cache->bytes_super; 9131 spin_unlock(&cache->space_info->lock); 9132 9133 __link_block_group(space_info, cache); 9134 9135 set_avail_alloc_bits(root->fs_info, cache->flags); 9136 if (btrfs_chunk_readonly(root, cache->key.objectid)) { 9137 set_block_group_ro(cache, 1); 9138 } else if (btrfs_block_group_used(&cache->item) == 0) { 9139 spin_lock(&info->unused_bgs_lock); 9140 /* Should always be true but just in case. */ 9141 if (list_empty(&cache->bg_list)) { 9142 btrfs_get_block_group(cache); 9143 list_add_tail(&cache->bg_list, 9144 &info->unused_bgs); 9145 } 9146 spin_unlock(&info->unused_bgs_lock); 9147 } 9148 } 9149 9150 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 9151 if (!(get_alloc_profile(root, space_info->flags) & 9152 (BTRFS_BLOCK_GROUP_RAID10 | 9153 BTRFS_BLOCK_GROUP_RAID1 | 9154 BTRFS_BLOCK_GROUP_RAID5 | 9155 BTRFS_BLOCK_GROUP_RAID6 | 9156 BTRFS_BLOCK_GROUP_DUP))) 9157 continue; 9158 /* 9159 * avoid allocating from un-mirrored block group if there are 9160 * mirrored block groups. 
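 * For example (illustrative): after data has been balanced from RAID0 to
 * RAID1, any leftover RAID0 and single block groups are flipped
 * read-only here so that new allocations land in the mirrored groups.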
9161 */ 9162 list_for_each_entry(cache, 9163 &space_info->block_groups[BTRFS_RAID_RAID0], 9164 list) 9165 set_block_group_ro(cache, 1); 9166 list_for_each_entry(cache, 9167 &space_info->block_groups[BTRFS_RAID_SINGLE], 9168 list) 9169 set_block_group_ro(cache, 1); 9170 } 9171 9172 init_global_block_rsv(info); 9173 ret = 0; 9174 error: 9175 btrfs_free_path(path); 9176 return ret; 9177 } 9178 9179 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 9180 struct btrfs_root *root) 9181 { 9182 struct btrfs_block_group_cache *block_group, *tmp; 9183 struct btrfs_root *extent_root = root->fs_info->extent_root; 9184 struct btrfs_block_group_item item; 9185 struct btrfs_key key; 9186 int ret = 0; 9187 9188 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 9189 if (ret) 9190 goto next; 9191 9192 spin_lock(&block_group->lock); 9193 memcpy(&item, &block_group->item, sizeof(item)); 9194 memcpy(&key, &block_group->key, sizeof(key)); 9195 spin_unlock(&block_group->lock); 9196 9197 ret = btrfs_insert_item(trans, extent_root, &key, &item, 9198 sizeof(item)); 9199 if (ret) 9200 btrfs_abort_transaction(trans, extent_root, ret); 9201 ret = btrfs_finish_chunk_alloc(trans, extent_root, 9202 key.objectid, key.offset); 9203 if (ret) 9204 btrfs_abort_transaction(trans, extent_root, ret); 9205 next: 9206 list_del_init(&block_group->bg_list); 9207 } 9208 } 9209 9210 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 9211 struct btrfs_root *root, u64 bytes_used, 9212 u64 type, u64 chunk_objectid, u64 chunk_offset, 9213 u64 size) 9214 { 9215 int ret; 9216 struct btrfs_root *extent_root; 9217 struct btrfs_block_group_cache *cache; 9218 9219 extent_root = root->fs_info->extent_root; 9220 9221 btrfs_set_log_full_commit(root->fs_info, trans); 9222 9223 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 9224 if (!cache) 9225 return -ENOMEM; 9226 9227 btrfs_set_block_group_used(&cache->item, bytes_used); 9228 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 9229 btrfs_set_block_group_flags(&cache->item, type); 9230 9231 cache->flags = type; 9232 cache->last_byte_to_unpin = (u64)-1; 9233 cache->cached = BTRFS_CACHE_FINISHED; 9234 ret = exclude_super_stripes(root, cache); 9235 if (ret) { 9236 /* 9237 * We may have excluded something, so call this just in 9238 * case. 
9239 */ 9240 free_excluded_extents(root, cache); 9241 btrfs_put_block_group(cache); 9242 return ret; 9243 } 9244 9245 add_new_free_space(cache, root->fs_info, chunk_offset, 9246 chunk_offset + size); 9247 9248 free_excluded_extents(root, cache); 9249 9250 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9251 if (ret) { 9252 btrfs_remove_free_space_cache(cache); 9253 btrfs_put_block_group(cache); 9254 return ret; 9255 } 9256 9257 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 9258 &cache->space_info); 9259 if (ret) { 9260 btrfs_remove_free_space_cache(cache); 9261 spin_lock(&root->fs_info->block_group_cache_lock); 9262 rb_erase(&cache->cache_node, 9263 &root->fs_info->block_group_cache_tree); 9264 RB_CLEAR_NODE(&cache->cache_node); 9265 spin_unlock(&root->fs_info->block_group_cache_lock); 9266 btrfs_put_block_group(cache); 9267 return ret; 9268 } 9269 update_global_block_rsv(root->fs_info); 9270 9271 spin_lock(&cache->space_info->lock); 9272 cache->space_info->bytes_readonly += cache->bytes_super; 9273 spin_unlock(&cache->space_info->lock); 9274 9275 __link_block_group(cache->space_info, cache); 9276 9277 list_add_tail(&cache->bg_list, &trans->new_bgs); 9278 9279 set_avail_alloc_bits(extent_root->fs_info, type); 9280 9281 return 0; 9282 } 9283 9284 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 9285 { 9286 u64 extra_flags = chunk_to_extended(flags) & 9287 BTRFS_EXTENDED_PROFILE_MASK; 9288 9289 write_seqlock(&fs_info->profiles_lock); 9290 if (flags & BTRFS_BLOCK_GROUP_DATA) 9291 fs_info->avail_data_alloc_bits &= ~extra_flags; 9292 if (flags & BTRFS_BLOCK_GROUP_METADATA) 9293 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 9294 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 9295 fs_info->avail_system_alloc_bits &= ~extra_flags; 9296 write_sequnlock(&fs_info->profiles_lock); 9297 } 9298 9299 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 9300 struct btrfs_root *root, u64 group_start, 9301 struct extent_map *em) 9302 { 9303 struct btrfs_path *path; 9304 struct btrfs_block_group_cache *block_group; 9305 struct btrfs_free_cluster *cluster; 9306 struct btrfs_root *tree_root = root->fs_info->tree_root; 9307 struct btrfs_key key; 9308 struct inode *inode; 9309 struct kobject *kobj = NULL; 9310 int ret; 9311 int index; 9312 int factor; 9313 struct btrfs_caching_control *caching_ctl = NULL; 9314 bool remove_em; 9315 9316 root = root->fs_info->extent_root; 9317 9318 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 9319 BUG_ON(!block_group); 9320 BUG_ON(!block_group->ro); 9321 9322 /* 9323 * Free the reserved super bytes from this block group before 9324 * remove it. 
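 * (The excluded ranges being released here were set up by
 * exclude_super_stripes() when the block group was created or first
 * read in.)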
9325 */ 9326 free_excluded_extents(root, block_group); 9327 9328 memcpy(&key, &block_group->key, sizeof(key)); 9329 index = get_block_group_index(block_group); 9330 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 9331 BTRFS_BLOCK_GROUP_RAID1 | 9332 BTRFS_BLOCK_GROUP_RAID10)) 9333 factor = 2; 9334 else 9335 factor = 1; 9336 9337 /* make sure this block group isn't part of an allocation cluster */ 9338 cluster = &root->fs_info->data_alloc_cluster; 9339 spin_lock(&cluster->refill_lock); 9340 btrfs_return_cluster_to_free_space(block_group, cluster); 9341 spin_unlock(&cluster->refill_lock); 9342 9343 /* 9344 * make sure this block group isn't part of a metadata 9345 * allocation cluster 9346 */ 9347 cluster = &root->fs_info->meta_alloc_cluster; 9348 spin_lock(&cluster->refill_lock); 9349 btrfs_return_cluster_to_free_space(block_group, cluster); 9350 spin_unlock(&cluster->refill_lock); 9351 9352 path = btrfs_alloc_path(); 9353 if (!path) { 9354 ret = -ENOMEM; 9355 goto out; 9356 } 9357 9358 inode = lookup_free_space_inode(tree_root, block_group, path); 9359 if (!IS_ERR(inode)) { 9360 ret = btrfs_orphan_add(trans, inode); 9361 if (ret) { 9362 btrfs_add_delayed_iput(inode); 9363 goto out; 9364 } 9365 clear_nlink(inode); 9366 /* One for the block groups ref */ 9367 spin_lock(&block_group->lock); 9368 if (block_group->iref) { 9369 block_group->iref = 0; 9370 block_group->inode = NULL; 9371 spin_unlock(&block_group->lock); 9372 iput(inode); 9373 } else { 9374 spin_unlock(&block_group->lock); 9375 } 9376 /* One for our lookup ref */ 9377 btrfs_add_delayed_iput(inode); 9378 } 9379 9380 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 9381 key.offset = block_group->key.objectid; 9382 key.type = 0; 9383 9384 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 9385 if (ret < 0) 9386 goto out; 9387 if (ret > 0) 9388 btrfs_release_path(path); 9389 if (ret == 0) { 9390 ret = btrfs_del_item(trans, tree_root, path); 9391 if (ret) 9392 goto out; 9393 btrfs_release_path(path); 9394 } 9395 9396 spin_lock(&root->fs_info->block_group_cache_lock); 9397 rb_erase(&block_group->cache_node, 9398 &root->fs_info->block_group_cache_tree); 9399 RB_CLEAR_NODE(&block_group->cache_node); 9400 9401 if (root->fs_info->first_logical_byte == block_group->key.objectid) 9402 root->fs_info->first_logical_byte = (u64)-1; 9403 spin_unlock(&root->fs_info->block_group_cache_lock); 9404 9405 down_write(&block_group->space_info->groups_sem); 9406 /* 9407 * we must use list_del_init so people can check to see if they 9408 * are still on the list after taking the semaphore 9409 */ 9410 list_del_init(&block_group->list); 9411 if (list_empty(&block_group->space_info->block_groups[index])) { 9412 kobj = block_group->space_info->block_group_kobjs[index]; 9413 block_group->space_info->block_group_kobjs[index] = NULL; 9414 clear_avail_alloc_bits(root->fs_info, block_group->flags); 9415 } 9416 up_write(&block_group->space_info->groups_sem); 9417 if (kobj) { 9418 kobject_del(kobj); 9419 kobject_put(kobj); 9420 } 9421 9422 if (block_group->has_caching_ctl) 9423 caching_ctl = get_caching_control(block_group); 9424 if (block_group->cached == BTRFS_CACHE_STARTED) 9425 wait_block_group_cache_done(block_group); 9426 if (block_group->has_caching_ctl) { 9427 down_write(&root->fs_info->commit_root_sem); 9428 if (!caching_ctl) { 9429 struct btrfs_caching_control *ctl; 9430 9431 list_for_each_entry(ctl, 9432 &root->fs_info->caching_block_groups, list) 9433 if (ctl->block_group == block_group) { 9434 caching_ctl = ctl; 9435 atomic_inc(&caching_ctl->count); 
9436 break; 9437 } 9438 } 9439 if (caching_ctl) 9440 list_del_init(&caching_ctl->list); 9441 up_write(&root->fs_info->commit_root_sem); 9442 if (caching_ctl) { 9443 /* Once for the caching bgs list and once for us. */ 9444 put_caching_control(caching_ctl); 9445 put_caching_control(caching_ctl); 9446 } 9447 } 9448 9449 spin_lock(&trans->transaction->dirty_bgs_lock); 9450 if (!list_empty(&block_group->dirty_list)) { 9451 list_del_init(&block_group->dirty_list); 9452 btrfs_put_block_group(block_group); 9453 } 9454 spin_unlock(&trans->transaction->dirty_bgs_lock); 9455 9456 btrfs_remove_free_space_cache(block_group); 9457 9458 spin_lock(&block_group->space_info->lock); 9459 list_del_init(&block_group->ro_list); 9460 block_group->space_info->total_bytes -= block_group->key.offset; 9461 block_group->space_info->bytes_readonly -= block_group->key.offset; 9462 block_group->space_info->disk_total -= block_group->key.offset * factor; 9463 spin_unlock(&block_group->space_info->lock); 9464 9465 memcpy(&key, &block_group->key, sizeof(key)); 9466 9467 lock_chunks(root); 9468 if (!list_empty(&em->list)) { 9469 /* We're in the transaction->pending_chunks list. */ 9470 free_extent_map(em); 9471 } 9472 spin_lock(&block_group->lock); 9473 block_group->removed = 1; 9474 /* 9475 * At this point trimming can't start on this block group, because we 9476 * removed the block group from the tree fs_info->block_group_cache_tree 9477 * so no one can't find it anymore and even if someone already got this 9478 * block group before we removed it from the rbtree, they have already 9479 * incremented block_group->trimming - if they didn't, they won't find 9480 * any free space entries because we already removed them all when we 9481 * called btrfs_remove_free_space_cache(). 9482 * 9483 * And we must not remove the extent map from the fs_info->mapping_tree 9484 * to prevent the same logical address range and physical device space 9485 * ranges from being reused for a new block group. This is because our 9486 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 9487 * completely transactionless, so while it is trimming a range the 9488 * currently running transaction might finish and a new one start, 9489 * allowing for new block groups to be created that can reuse the same 9490 * physical device locations unless we take this special care. 9491 */ 9492 remove_em = (atomic_read(&block_group->trimming) == 0); 9493 /* 9494 * Make sure a trimmer task always sees the em in the pinned_chunks list 9495 * if it sees block_group->removed == 1 (needs to lock block_group->lock 9496 * before checking block_group->removed). 9497 */ 9498 if (!remove_em) { 9499 /* 9500 * Our em might be in trans->transaction->pending_chunks which 9501 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 9502 * and so is the fs_info->pinned_chunks list. 9503 * 9504 * So at this point we must be holding the chunk_mutex to avoid 9505 * any races with chunk allocation (more specifically at 9506 * volumes.c:contains_pending_extent()), to ensure it always 9507 * sees the em, either in the pending_chunks list or in the 9508 * pinned_chunks list. 
9509 */ 9510 list_move_tail(&em->list, &root->fs_info->pinned_chunks); 9511 } 9512 spin_unlock(&block_group->lock); 9513 9514 if (remove_em) { 9515 struct extent_map_tree *em_tree; 9516 9517 em_tree = &root->fs_info->mapping_tree.map_tree; 9518 write_lock(&em_tree->lock); 9519 /* 9520 * The em might be in the pending_chunks list, so make sure the 9521 * chunk mutex is locked, since remove_extent_mapping() will 9522 * delete us from that list. 9523 */ 9524 remove_extent_mapping(em_tree, em); 9525 write_unlock(&em_tree->lock); 9526 /* once for the tree */ 9527 free_extent_map(em); 9528 } 9529 9530 unlock_chunks(root); 9531 9532 btrfs_put_block_group(block_group); 9533 btrfs_put_block_group(block_group); 9534 9535 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 9536 if (ret > 0) 9537 ret = -EIO; 9538 if (ret < 0) 9539 goto out; 9540 9541 ret = btrfs_del_item(trans, root, path); 9542 out: 9543 btrfs_free_path(path); 9544 return ret; 9545 } 9546 9547 /* 9548 * Process the unused_bgs list and remove any that don't have any allocated 9549 * space inside of them. 9550 */ 9551 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 9552 { 9553 struct btrfs_block_group_cache *block_group; 9554 struct btrfs_space_info *space_info; 9555 struct btrfs_root *root = fs_info->extent_root; 9556 struct btrfs_trans_handle *trans; 9557 int ret = 0; 9558 9559 if (!fs_info->open) 9560 return; 9561 9562 spin_lock(&fs_info->unused_bgs_lock); 9563 while (!list_empty(&fs_info->unused_bgs)) { 9564 u64 start, end; 9565 9566 block_group = list_first_entry(&fs_info->unused_bgs, 9567 struct btrfs_block_group_cache, 9568 bg_list); 9569 space_info = block_group->space_info; 9570 list_del_init(&block_group->bg_list); 9571 if (ret || btrfs_mixed_space_info(space_info)) { 9572 btrfs_put_block_group(block_group); 9573 continue; 9574 } 9575 spin_unlock(&fs_info->unused_bgs_lock); 9576 9577 /* Don't want to race with allocators so take the groups_sem */ 9578 down_write(&space_info->groups_sem); 9579 spin_lock(&block_group->lock); 9580 if (block_group->reserved || 9581 btrfs_block_group_used(&block_group->item) || 9582 block_group->ro) { 9583 /* 9584 * We want to bail if we made new allocations or have 9585 * outstanding allocations in this block group. We do 9586 * the ro check in case balance is currently acting on 9587 * this block group. 9588 */ 9589 spin_unlock(&block_group->lock); 9590 up_write(&space_info->groups_sem); 9591 goto next; 9592 } 9593 spin_unlock(&block_group->lock); 9594 9595 /* We don't want to force the issue, only flip if it's ok. */ 9596 ret = set_block_group_ro(block_group, 0); 9597 up_write(&space_info->groups_sem); 9598 if (ret < 0) { 9599 ret = 0; 9600 goto next; 9601 } 9602 9603 /* 9604 * Want to do this before we do anything else so we can recover 9605 * properly if we fail to join the transaction. 9606 */ 9607 /* 1 for btrfs_orphan_reserve_metadata() */ 9608 trans = btrfs_start_transaction(root, 1); 9609 if (IS_ERR(trans)) { 9610 btrfs_set_block_group_rw(root, block_group); 9611 ret = PTR_ERR(trans); 9612 goto next; 9613 } 9614 9615 /* 9616 * We could have pending pinned extents for this block group, 9617 * just delete them, we don't care about them anymore. 9618 */ 9619 start = block_group->key.objectid; 9620 end = start + block_group->key.offset - 1; 9621 /* 9622 * Hold the unused_bg_unpin_mutex lock to avoid racing with 9623 * btrfs_finish_extent_commit(). 
		 * another task might be running finish_extent_commit() for the
		 * previous transaction N - 1, and have seen a range belonging
		 * to the block group in freed_extents[] before we were able to
		 * clear the whole block group range from freed_extents[]. This
		 * means that task can look up the block group after we
		 * unpinned it from freed_extents[] and removed it, leading to
		 * a BUG_ON() at btrfs_unpin_extent_range().
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
					EXTENT_DIRTY, GFP_NOFS);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_set_block_group_rw(root, block_group);
			goto end_trans;
		}
		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
					EXTENT_DIRTY, GFP_NOFS);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_set_block_group_rw(root, block_group);
			goto end_trans;
		}
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		block_group->pinned = 0;

		/*
		 * btrfs_remove_chunk() will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, root,
					 block_group->key.objectid);
end_trans:
		btrfs_end_transaction(trans, root);
next:
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return 1;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, &space_info);
	}
out:
	return ret;
}

int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
	return unpin_extent_range(root, start, end, false);
}

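/*
 * A minimal usage sketch, compiled out via "#if 0", showing how a caller such
 * as the FITRIM ioctl path might drive btrfs_trim_fs() below. The helper name
 * example_trim_whole_fs() and the 4096 byte minlen are illustrative
 * assumptions, not part of the btrfs code; struct fstrim_range comes from
 * include/uapi/linux/fs.h.
 */
#if 0
static int example_trim_whole_fs(struct btrfs_root *root)
{
	struct fstrim_range range = {
		.start = 0,
		.len = btrfs_super_total_bytes(root->fs_info->super_copy),
		.minlen = 4096,	/* skip free extents smaller than this */
	};
	int ret;

	/* Trim every block group intersecting [start, start + len). */
	ret = btrfs_trim_fs(root, &range);
	if (ret == 0)
		pr_info("btrfs: trimmed %llu bytes\n",
			(unsigned long long)range.len);
	return ret;
}
#endif
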
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret = 0;

	/*
	 * Try to trim all FS space; our first block group may start from a
	 * non-zero offset.
	 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
				ret = wait_block_group_cache_done(cache);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	range->len = trimmed;
	return ret;
}

/*
 * btrfs_{start,end}_write_no_snapshoting() are similar to
 * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted
 * and then flushing it to disk only after the snapshot has been created, and
 * to prevent operations while a snapshot is being created that would make it
 * inconsistent (writes followed by expanding truncates, for example).
 */
void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we wake up waiters.
	 */
	smp_mb();
	if (waitqueue_active(&root->subv_writers->wait))
		wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshoted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshoted)) {
		btrfs_end_write_no_snapshoting(root);
		return 0;
	}
	return 1;
}
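
/*
 * A minimal usage sketch, compiled out via "#if 0", showing the intended
 * pairing of btrfs_start_write_no_snapshoting() and
 * btrfs_end_write_no_snapshoting() around a nocow write. The helpers
 * example_nocow_write() and do_nocow_write_out() are hypothetical stand-ins
 * for a real nocow write path, not part of the btrfs code.
 */
#if 0
static int example_nocow_write(struct btrfs_root *root)
{
	int ret;

	/*
	 * Returns 0 when a snapshot is pending; the caller should then fall
	 * back to the COW write path instead of writing through nocow.
	 */
	if (!btrfs_start_write_no_snapshoting(root))
		return -EAGAIN;

	ret = do_nocow_write_out(root);	/* hypothetical write-out helper */

	/* Drop the writer count and wake up any pending snapshot creator. */
	btrfs_end_write_no_snapshoting(root);
	return ret;
}
#endif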