1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/pagemap.h> 20 #include <linux/writeback.h> 21 #include <linux/blkdev.h> 22 #include <linux/sort.h> 23 #include <linux/rcupdate.h> 24 #include <linux/kthread.h> 25 #include <linux/slab.h> 26 #include <linux/ratelimit.h> 27 #include <linux/percpu_counter.h> 28 #include "hash.h" 29 #include "tree-log.h" 30 #include "disk-io.h" 31 #include "print-tree.h" 32 #include "volumes.h" 33 #include "raid56.h" 34 #include "locking.h" 35 #include "free-space-cache.h" 36 #include "math.h" 37 #include "sysfs.h" 38 #include "qgroup.h" 39 40 #undef SCRAMBLE_DELAYED_REFS 41 42 /* 43 * control flags for do_chunk_alloc's force field 44 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 45 * if we really need one. 46 * 47 * CHUNK_ALLOC_LIMITED means to only try and allocate one 48 * if we have very few chunks already allocated. This is 49 * used as part of the clustering code to help make sure 50 * we have a good pool of storage to cluster in, without 51 * filling the FS with empty chunks 52 * 53 * CHUNK_ALLOC_FORCE means it must try to allocate one 54 * 55 */ 56 enum { 57 CHUNK_ALLOC_NO_FORCE = 0, 58 CHUNK_ALLOC_LIMITED = 1, 59 CHUNK_ALLOC_FORCE = 2, 60 }; 61 62 /* 63 * Control how reservations are dealt with. 64 * 65 * RESERVE_FREE - freeing a reservation. 
66 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for 67 * ENOSPC accounting 68 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update 69 * bytes_may_use as the ENOSPC accounting is done elsewhere 70 */ 71 enum { 72 RESERVE_FREE = 0, 73 RESERVE_ALLOC = 1, 74 RESERVE_ALLOC_NO_ACCOUNT = 2, 75 }; 76 77 static int update_block_group(struct btrfs_root *root, 78 u64 bytenr, u64 num_bytes, int alloc); 79 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 80 struct btrfs_root *root, 81 u64 bytenr, u64 num_bytes, u64 parent, 82 u64 root_objectid, u64 owner_objectid, 83 u64 owner_offset, int refs_to_drop, 84 struct btrfs_delayed_extent_op *extra_op, 85 int no_quota); 86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 87 struct extent_buffer *leaf, 88 struct btrfs_extent_item *ei); 89 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 90 struct btrfs_root *root, 91 u64 parent, u64 root_objectid, 92 u64 flags, u64 owner, u64 offset, 93 struct btrfs_key *ins, int ref_mod); 94 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 95 struct btrfs_root *root, 96 u64 parent, u64 root_objectid, 97 u64 flags, struct btrfs_disk_key *key, 98 int level, struct btrfs_key *ins, 99 int no_quota); 100 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 101 struct btrfs_root *extent_root, u64 flags, 102 int force); 103 static int find_next_key(struct btrfs_path *path, int level, 104 struct btrfs_key *key); 105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 106 int dump_block_groups); 107 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 108 u64 num_bytes, int reserve); 109 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 110 u64 num_bytes); 111 int btrfs_pin_extent(struct btrfs_root *root, 112 u64 bytenr, u64 num_bytes, int reserved); 113 114 static noinline int 115 block_group_cache_done(struct btrfs_block_group_cache *cache) 116 { 117 smp_mb(); 118 return cache->cached == BTRFS_CACHE_FINISHED || 119 cache->cached == BTRFS_CACHE_ERROR; 120 } 121 122 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 123 { 124 return (cache->flags & bits) == bits; 125 } 126 127 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 128 { 129 atomic_inc(&cache->count); 130 } 131 132 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 133 { 134 if (atomic_dec_and_test(&cache->count)) { 135 WARN_ON(cache->pinned > 0); 136 WARN_ON(cache->reserved > 0); 137 kfree(cache->free_space_ctl); 138 kfree(cache); 139 } 140 } 141 142 /* 143 * this adds the block group to the fs_info rb tree for the block group 144 * cache 145 */ 146 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 147 struct btrfs_block_group_cache *block_group) 148 { 149 struct rb_node **p; 150 struct rb_node *parent = NULL; 151 struct btrfs_block_group_cache *cache; 152 153 spin_lock(&info->block_group_cache_lock); 154 p = &info->block_group_cache_tree.rb_node; 155 156 while (*p) { 157 parent = *p; 158 cache = rb_entry(parent, struct btrfs_block_group_cache, 159 cache_node); 160 if (block_group->key.objectid < cache->key.objectid) { 161 p = &(*p)->rb_left; 162 } else if (block_group->key.objectid > cache->key.objectid) { 163 p = &(*p)->rb_right; 164 } else { 165 spin_unlock(&info->block_group_cache_lock); 166 return -EEXIST; 167 } 168 } 169 170 rb_link_node(&block_group->cache_node, parent, p); 171 
rb_insert_color(&block_group->cache_node, 172 &info->block_group_cache_tree); 173 174 if (info->first_logical_byte > block_group->key.objectid) 175 info->first_logical_byte = block_group->key.objectid; 176 177 spin_unlock(&info->block_group_cache_lock); 178 179 return 0; 180 } 181 182 /* 183 * This will return the block group at or after bytenr if contains is 0, else 184 * it will return the block group that contains the bytenr 185 */ 186 static struct btrfs_block_group_cache * 187 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 188 int contains) 189 { 190 struct btrfs_block_group_cache *cache, *ret = NULL; 191 struct rb_node *n; 192 u64 end, start; 193 194 spin_lock(&info->block_group_cache_lock); 195 n = info->block_group_cache_tree.rb_node; 196 197 while (n) { 198 cache = rb_entry(n, struct btrfs_block_group_cache, 199 cache_node); 200 end = cache->key.objectid + cache->key.offset - 1; 201 start = cache->key.objectid; 202 203 if (bytenr < start) { 204 if (!contains && (!ret || start < ret->key.objectid)) 205 ret = cache; 206 n = n->rb_left; 207 } else if (bytenr > start) { 208 if (contains && bytenr <= end) { 209 ret = cache; 210 break; 211 } 212 n = n->rb_right; 213 } else { 214 ret = cache; 215 break; 216 } 217 } 218 if (ret) { 219 btrfs_get_block_group(ret); 220 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 221 info->first_logical_byte = ret->key.objectid; 222 } 223 spin_unlock(&info->block_group_cache_lock); 224 225 return ret; 226 } 227 228 static int add_excluded_extent(struct btrfs_root *root, 229 u64 start, u64 num_bytes) 230 { 231 u64 end = start + num_bytes - 1; 232 set_extent_bits(&root->fs_info->freed_extents[0], 233 start, end, EXTENT_UPTODATE, GFP_NOFS); 234 set_extent_bits(&root->fs_info->freed_extents[1], 235 start, end, EXTENT_UPTODATE, GFP_NOFS); 236 return 0; 237 } 238 239 static void free_excluded_extents(struct btrfs_root *root, 240 struct btrfs_block_group_cache *cache) 241 { 242 u64 start, end; 243 244 start = cache->key.objectid; 245 end = start + cache->key.offset - 1; 246 247 clear_extent_bits(&root->fs_info->freed_extents[0], 248 start, end, EXTENT_UPTODATE, GFP_NOFS); 249 clear_extent_bits(&root->fs_info->freed_extents[1], 250 start, end, EXTENT_UPTODATE, GFP_NOFS); 251 } 252 253 static int exclude_super_stripes(struct btrfs_root *root, 254 struct btrfs_block_group_cache *cache) 255 { 256 u64 bytenr; 257 u64 *logical; 258 int stripe_len; 259 int i, nr, ret; 260 261 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 262 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 263 cache->bytes_super += stripe_len; 264 ret = add_excluded_extent(root, cache->key.objectid, 265 stripe_len); 266 if (ret) 267 return ret; 268 } 269 270 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 271 bytenr = btrfs_sb_offset(i); 272 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 273 cache->key.objectid, bytenr, 274 0, &logical, &nr, &stripe_len); 275 if (ret) 276 return ret; 277 278 while (nr--) { 279 u64 start, len; 280 281 if (logical[nr] > cache->key.objectid + 282 cache->key.offset) 283 continue; 284 285 if (logical[nr] + stripe_len <= cache->key.objectid) 286 continue; 287 288 start = logical[nr]; 289 if (start < cache->key.objectid) { 290 start = cache->key.objectid; 291 len = (logical[nr] + stripe_len) - start; 292 } else { 293 len = min_t(u64, stripe_len, 294 cache->key.objectid + 295 cache->key.offset - start); 296 } 297 298 cache->bytes_super += len; 299 ret = add_excluded_extent(root, start, len); 300 if (ret) { 301 
kfree(logical); 302 return ret; 303 } 304 } 305 306 kfree(logical); 307 } 308 return 0; 309 } 310 311 static struct btrfs_caching_control * 312 get_caching_control(struct btrfs_block_group_cache *cache) 313 { 314 struct btrfs_caching_control *ctl; 315 316 spin_lock(&cache->lock); 317 if (cache->cached != BTRFS_CACHE_STARTED) { 318 spin_unlock(&cache->lock); 319 return NULL; 320 } 321 322 /* We're loading it the fast way, so we don't have a caching_ctl. */ 323 if (!cache->caching_ctl) { 324 spin_unlock(&cache->lock); 325 return NULL; 326 } 327 328 ctl = cache->caching_ctl; 329 atomic_inc(&ctl->count); 330 spin_unlock(&cache->lock); 331 return ctl; 332 } 333 334 static void put_caching_control(struct btrfs_caching_control *ctl) 335 { 336 if (atomic_dec_and_test(&ctl->count)) 337 kfree(ctl); 338 } 339 340 /* 341 * this is only called by cache_block_group, since we could have freed extents 342 * we need to check the pinned_extents for any extents that can't be used yet 343 * since their free space will be released as soon as the transaction commits. 344 */ 345 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 346 struct btrfs_fs_info *info, u64 start, u64 end) 347 { 348 u64 extent_start, extent_end, size, total_added = 0; 349 int ret; 350 351 while (start < end) { 352 ret = find_first_extent_bit(info->pinned_extents, start, 353 &extent_start, &extent_end, 354 EXTENT_DIRTY | EXTENT_UPTODATE, 355 NULL); 356 if (ret) 357 break; 358 359 if (extent_start <= start) { 360 start = extent_end + 1; 361 } else if (extent_start > start && extent_start < end) { 362 size = extent_start - start; 363 total_added += size; 364 ret = btrfs_add_free_space(block_group, start, 365 size); 366 BUG_ON(ret); /* -ENOMEM or logic error */ 367 start = extent_end + 1; 368 } else { 369 break; 370 } 371 } 372 373 if (start < end) { 374 size = end - start; 375 total_added += size; 376 ret = btrfs_add_free_space(block_group, start, size); 377 BUG_ON(ret); /* -ENOMEM or logic error */ 378 } 379 380 return total_added; 381 } 382 383 static noinline void caching_thread(struct btrfs_work *work) 384 { 385 struct btrfs_block_group_cache *block_group; 386 struct btrfs_fs_info *fs_info; 387 struct btrfs_caching_control *caching_ctl; 388 struct btrfs_root *extent_root; 389 struct btrfs_path *path; 390 struct extent_buffer *leaf; 391 struct btrfs_key key; 392 u64 total_found = 0; 393 u64 last = 0; 394 u32 nritems; 395 int ret = -ENOMEM; 396 397 caching_ctl = container_of(work, struct btrfs_caching_control, work); 398 block_group = caching_ctl->block_group; 399 fs_info = block_group->fs_info; 400 extent_root = fs_info->extent_root; 401 402 path = btrfs_alloc_path(); 403 if (!path) 404 goto out; 405 406 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 407 408 /* 409 * We don't want to deadlock with somebody trying to allocate a new 410 * extent for the extent root while also trying to search the extent 411 * root to add free space. 
So we skip locking and search the commit 412 * root, since its read-only 413 */ 414 path->skip_locking = 1; 415 path->search_commit_root = 1; 416 path->reada = 1; 417 418 key.objectid = last; 419 key.offset = 0; 420 key.type = BTRFS_EXTENT_ITEM_KEY; 421 again: 422 mutex_lock(&caching_ctl->mutex); 423 /* need to make sure the commit_root doesn't disappear */ 424 down_read(&fs_info->commit_root_sem); 425 426 next: 427 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 428 if (ret < 0) 429 goto err; 430 431 leaf = path->nodes[0]; 432 nritems = btrfs_header_nritems(leaf); 433 434 while (1) { 435 if (btrfs_fs_closing(fs_info) > 1) { 436 last = (u64)-1; 437 break; 438 } 439 440 if (path->slots[0] < nritems) { 441 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 442 } else { 443 ret = find_next_key(path, 0, &key); 444 if (ret) 445 break; 446 447 if (need_resched() || 448 rwsem_is_contended(&fs_info->commit_root_sem)) { 449 caching_ctl->progress = last; 450 btrfs_release_path(path); 451 up_read(&fs_info->commit_root_sem); 452 mutex_unlock(&caching_ctl->mutex); 453 cond_resched(); 454 goto again; 455 } 456 457 ret = btrfs_next_leaf(extent_root, path); 458 if (ret < 0) 459 goto err; 460 if (ret) 461 break; 462 leaf = path->nodes[0]; 463 nritems = btrfs_header_nritems(leaf); 464 continue; 465 } 466 467 if (key.objectid < last) { 468 key.objectid = last; 469 key.offset = 0; 470 key.type = BTRFS_EXTENT_ITEM_KEY; 471 472 caching_ctl->progress = last; 473 btrfs_release_path(path); 474 goto next; 475 } 476 477 if (key.objectid < block_group->key.objectid) { 478 path->slots[0]++; 479 continue; 480 } 481 482 if (key.objectid >= block_group->key.objectid + 483 block_group->key.offset) 484 break; 485 486 if (key.type == BTRFS_EXTENT_ITEM_KEY || 487 key.type == BTRFS_METADATA_ITEM_KEY) { 488 total_found += add_new_free_space(block_group, 489 fs_info, last, 490 key.objectid); 491 if (key.type == BTRFS_METADATA_ITEM_KEY) 492 last = key.objectid + 493 fs_info->tree_root->leafsize; 494 else 495 last = key.objectid + key.offset; 496 497 if (total_found > (1024 * 1024 * 2)) { 498 total_found = 0; 499 wake_up(&caching_ctl->wait); 500 } 501 } 502 path->slots[0]++; 503 } 504 ret = 0; 505 506 total_found += add_new_free_space(block_group, fs_info, last, 507 block_group->key.objectid + 508 block_group->key.offset); 509 caching_ctl->progress = (u64)-1; 510 511 spin_lock(&block_group->lock); 512 block_group->caching_ctl = NULL; 513 block_group->cached = BTRFS_CACHE_FINISHED; 514 spin_unlock(&block_group->lock); 515 516 err: 517 btrfs_free_path(path); 518 up_read(&fs_info->commit_root_sem); 519 520 free_excluded_extents(extent_root, block_group); 521 522 mutex_unlock(&caching_ctl->mutex); 523 out: 524 if (ret) { 525 spin_lock(&block_group->lock); 526 block_group->caching_ctl = NULL; 527 block_group->cached = BTRFS_CACHE_ERROR; 528 spin_unlock(&block_group->lock); 529 } 530 wake_up(&caching_ctl->wait); 531 532 put_caching_control(caching_ctl); 533 btrfs_put_block_group(block_group); 534 } 535 536 static int cache_block_group(struct btrfs_block_group_cache *cache, 537 int load_cache_only) 538 { 539 DEFINE_WAIT(wait); 540 struct btrfs_fs_info *fs_info = cache->fs_info; 541 struct btrfs_caching_control *caching_ctl; 542 int ret = 0; 543 544 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 545 if (!caching_ctl) 546 return -ENOMEM; 547 548 INIT_LIST_HEAD(&caching_ctl->list); 549 mutex_init(&caching_ctl->mutex); 550 init_waitqueue_head(&caching_ctl->wait); 551 caching_ctl->block_group = cache; 552 
caching_ctl->progress = cache->key.objectid; 553 atomic_set(&caching_ctl->count, 1); 554 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); 555 556 spin_lock(&cache->lock); 557 /* 558 * This should be a rare occasion, but this could happen I think in the 559 * case where one thread starts to load the space cache info, and then 560 * some other thread starts a transaction commit which tries to do an 561 * allocation while the other thread is still loading the space cache 562 * info. The previous loop should have kept us from choosing this block 563 * group, but if we've moved to the state where we will wait on caching 564 * block groups we need to first check if we're doing a fast load here, 565 * so we can wait for it to finish, otherwise we could end up allocating 566 * from a block group who's cache gets evicted for one reason or 567 * another. 568 */ 569 while (cache->cached == BTRFS_CACHE_FAST) { 570 struct btrfs_caching_control *ctl; 571 572 ctl = cache->caching_ctl; 573 atomic_inc(&ctl->count); 574 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 575 spin_unlock(&cache->lock); 576 577 schedule(); 578 579 finish_wait(&ctl->wait, &wait); 580 put_caching_control(ctl); 581 spin_lock(&cache->lock); 582 } 583 584 if (cache->cached != BTRFS_CACHE_NO) { 585 spin_unlock(&cache->lock); 586 kfree(caching_ctl); 587 return 0; 588 } 589 WARN_ON(cache->caching_ctl); 590 cache->caching_ctl = caching_ctl; 591 cache->cached = BTRFS_CACHE_FAST; 592 spin_unlock(&cache->lock); 593 594 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 595 ret = load_free_space_cache(fs_info, cache); 596 597 spin_lock(&cache->lock); 598 if (ret == 1) { 599 cache->caching_ctl = NULL; 600 cache->cached = BTRFS_CACHE_FINISHED; 601 cache->last_byte_to_unpin = (u64)-1; 602 } else { 603 if (load_cache_only) { 604 cache->caching_ctl = NULL; 605 cache->cached = BTRFS_CACHE_NO; 606 } else { 607 cache->cached = BTRFS_CACHE_STARTED; 608 } 609 } 610 spin_unlock(&cache->lock); 611 wake_up(&caching_ctl->wait); 612 if (ret == 1) { 613 put_caching_control(caching_ctl); 614 free_excluded_extents(fs_info->extent_root, cache); 615 return 0; 616 } 617 } else { 618 /* 619 * We are not going to do the fast caching, set cached to the 620 * appropriate value and wakeup any waiters. 
621 */ 622 spin_lock(&cache->lock); 623 if (load_cache_only) { 624 cache->caching_ctl = NULL; 625 cache->cached = BTRFS_CACHE_NO; 626 } else { 627 cache->cached = BTRFS_CACHE_STARTED; 628 } 629 spin_unlock(&cache->lock); 630 wake_up(&caching_ctl->wait); 631 } 632 633 if (load_cache_only) { 634 put_caching_control(caching_ctl); 635 return 0; 636 } 637 638 down_write(&fs_info->commit_root_sem); 639 atomic_inc(&caching_ctl->count); 640 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 641 up_write(&fs_info->commit_root_sem); 642 643 btrfs_get_block_group(cache); 644 645 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 646 647 return ret; 648 } 649 650 /* 651 * return the block group that starts at or after bytenr 652 */ 653 static struct btrfs_block_group_cache * 654 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 655 { 656 struct btrfs_block_group_cache *cache; 657 658 cache = block_group_cache_tree_search(info, bytenr, 0); 659 660 return cache; 661 } 662 663 /* 664 * return the block group that contains the given bytenr 665 */ 666 struct btrfs_block_group_cache *btrfs_lookup_block_group( 667 struct btrfs_fs_info *info, 668 u64 bytenr) 669 { 670 struct btrfs_block_group_cache *cache; 671 672 cache = block_group_cache_tree_search(info, bytenr, 1); 673 674 return cache; 675 } 676 677 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 678 u64 flags) 679 { 680 struct list_head *head = &info->space_info; 681 struct btrfs_space_info *found; 682 683 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 684 685 rcu_read_lock(); 686 list_for_each_entry_rcu(found, head, list) { 687 if (found->flags & flags) { 688 rcu_read_unlock(); 689 return found; 690 } 691 } 692 rcu_read_unlock(); 693 return NULL; 694 } 695 696 /* 697 * after adding space to the filesystem, we need to clear the full flags 698 * on all the space infos. 699 */ 700 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 701 { 702 struct list_head *head = &info->space_info; 703 struct btrfs_space_info *found; 704 705 rcu_read_lock(); 706 list_for_each_entry_rcu(found, head, list) 707 found->full = 0; 708 rcu_read_unlock(); 709 } 710 711 /* simple helper to search for an existing extent at a given offset */ 712 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) 713 { 714 int ret; 715 struct btrfs_key key; 716 struct btrfs_path *path; 717 718 path = btrfs_alloc_path(); 719 if (!path) 720 return -ENOMEM; 721 722 key.objectid = start; 723 key.offset = len; 724 key.type = BTRFS_EXTENT_ITEM_KEY; 725 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 726 0, 0); 727 if (ret > 0) { 728 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 729 if (key.objectid == start && 730 key.type == BTRFS_METADATA_ITEM_KEY) 731 ret = 0; 732 } 733 btrfs_free_path(path); 734 return ret; 735 } 736 737 /* 738 * helper function to lookup reference count and flags of a tree block. 739 * 740 * the head node for delayed ref is used to store the sum of all the 741 * reference count modifications queued up in the rbtree. the head 742 * node may also store the extent flags to set. This way you can check 743 * to see what the reference count and extent flags would be if all of 744 * the delayed refs are not processed. 
745 */ 746 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 747 struct btrfs_root *root, u64 bytenr, 748 u64 offset, int metadata, u64 *refs, u64 *flags) 749 { 750 struct btrfs_delayed_ref_head *head; 751 struct btrfs_delayed_ref_root *delayed_refs; 752 struct btrfs_path *path; 753 struct btrfs_extent_item *ei; 754 struct extent_buffer *leaf; 755 struct btrfs_key key; 756 u32 item_size; 757 u64 num_refs; 758 u64 extent_flags; 759 int ret; 760 761 /* 762 * If we don't have skinny metadata, don't bother doing anything 763 * different 764 */ 765 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { 766 offset = root->leafsize; 767 metadata = 0; 768 } 769 770 path = btrfs_alloc_path(); 771 if (!path) 772 return -ENOMEM; 773 774 if (!trans) { 775 path->skip_locking = 1; 776 path->search_commit_root = 1; 777 } 778 779 search_again: 780 key.objectid = bytenr; 781 key.offset = offset; 782 if (metadata) 783 key.type = BTRFS_METADATA_ITEM_KEY; 784 else 785 key.type = BTRFS_EXTENT_ITEM_KEY; 786 787 again: 788 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 789 &key, path, 0, 0); 790 if (ret < 0) 791 goto out_free; 792 793 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 794 if (path->slots[0]) { 795 path->slots[0]--; 796 btrfs_item_key_to_cpu(path->nodes[0], &key, 797 path->slots[0]); 798 if (key.objectid == bytenr && 799 key.type == BTRFS_EXTENT_ITEM_KEY && 800 key.offset == root->leafsize) 801 ret = 0; 802 } 803 if (ret) { 804 key.objectid = bytenr; 805 key.type = BTRFS_EXTENT_ITEM_KEY; 806 key.offset = root->leafsize; 807 btrfs_release_path(path); 808 goto again; 809 } 810 } 811 812 if (ret == 0) { 813 leaf = path->nodes[0]; 814 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 815 if (item_size >= sizeof(*ei)) { 816 ei = btrfs_item_ptr(leaf, path->slots[0], 817 struct btrfs_extent_item); 818 num_refs = btrfs_extent_refs(leaf, ei); 819 extent_flags = btrfs_extent_flags(leaf, ei); 820 } else { 821 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 822 struct btrfs_extent_item_v0 *ei0; 823 BUG_ON(item_size != sizeof(*ei0)); 824 ei0 = btrfs_item_ptr(leaf, path->slots[0], 825 struct btrfs_extent_item_v0); 826 num_refs = btrfs_extent_refs_v0(leaf, ei0); 827 /* FIXME: this isn't correct for data */ 828 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 829 #else 830 BUG(); 831 #endif 832 } 833 BUG_ON(num_refs == 0); 834 } else { 835 num_refs = 0; 836 extent_flags = 0; 837 ret = 0; 838 } 839 840 if (!trans) 841 goto out; 842 843 delayed_refs = &trans->transaction->delayed_refs; 844 spin_lock(&delayed_refs->lock); 845 head = btrfs_find_delayed_ref_head(trans, bytenr); 846 if (head) { 847 if (!mutex_trylock(&head->mutex)) { 848 atomic_inc(&head->node.refs); 849 spin_unlock(&delayed_refs->lock); 850 851 btrfs_release_path(path); 852 853 /* 854 * Mutex was contended, block until it's released and try 855 * again 856 */ 857 mutex_lock(&head->mutex); 858 mutex_unlock(&head->mutex); 859 btrfs_put_delayed_ref(&head->node); 860 goto search_again; 861 } 862 spin_lock(&head->lock); 863 if (head->extent_op && head->extent_op->update_flags) 864 extent_flags |= head->extent_op->flags_to_set; 865 else 866 BUG_ON(num_refs == 0); 867 868 num_refs += head->node.ref_mod; 869 spin_unlock(&head->lock); 870 mutex_unlock(&head->mutex); 871 } 872 spin_unlock(&delayed_refs->lock); 873 out: 874 WARN_ON(num_refs == 0); 875 if (refs) 876 *refs = num_refs; 877 if (flags) 878 *flags = extent_flags; 879 out_free: 880 btrfs_free_path(path); 881 return ret; 882 } 883 884 /* 885 * Back reference 
rules. Back refs have three main goals: 886 * 887 * 1) differentiate between all holders of references to an extent so that 888 * when a reference is dropped we can make sure it was a valid reference 889 * before freeing the extent. 890 * 891 * 2) Provide enough information to quickly find the holders of an extent 892 * if we notice a given block is corrupted or bad. 893 * 894 * 3) Make it easy to migrate blocks for FS shrinking or storage pool 895 * maintenance. This is actually the same as #2, but with a slightly 896 * different use case. 897 * 898 * There are two kinds of back refs. The implicit back refs is optimized 899 * for pointers in non-shared tree blocks. For a given pointer in a block, 900 * back refs of this kind provide information about the block's owner tree 901 * and the pointer's key. These information allow us to find the block by 902 * b-tree searching. The full back refs is for pointers in tree blocks not 903 * referenced by their owner trees. The location of tree block is recorded 904 * in the back refs. Actually the full back refs is generic, and can be 905 * used in all cases the implicit back refs is used. The major shortcoming 906 * of the full back refs is its overhead. Every time a tree block gets 907 * COWed, we have to update back refs entry for all pointers in it. 908 * 909 * For a newly allocated tree block, we use implicit back refs for 910 * pointers in it. This means most tree related operations only involve 911 * implicit back refs. For a tree block created in old transaction, the 912 * only way to drop a reference to it is COW it. So we can detect the 913 * event that tree block loses its owner tree's reference and do the 914 * back refs conversion. 915 * 916 * When a tree block is COW'd through a tree, there are four cases: 917 * 918 * The reference count of the block is one and the tree is the block's 919 * owner tree. Nothing to do in this case. 920 * 921 * The reference count of the block is one and the tree is not the 922 * block's owner tree. In this case, full back refs is used for pointers 923 * in the block. Remove these full back refs, add implicit back refs for 924 * every pointers in the new block. 925 * 926 * The reference count of the block is greater than one and the tree is 927 * the block's owner tree. In this case, implicit back refs is used for 928 * pointers in the block. Add full back refs for every pointers in the 929 * block, increase lower level extents' reference counts. The original 930 * implicit back refs are entailed to the new block. 931 * 932 * The reference count of the block is greater than one and the tree is 933 * not the block's owner tree. Add implicit back refs for every pointer in 934 * the new block, increase lower level extents' reference count. 935 * 936 * Back Reference Key composing: 937 * 938 * The key objectid corresponds to the first byte in the extent, 939 * The key type is used to differentiate between types of back refs. 940 * There are different meanings of the key offset for different types 941 * of back refs. 
942 * 943 * File extents can be referenced by: 944 * 945 * - multiple snapshots, subvolumes, or different generations in one subvol 946 * - different files inside a single subvolume 947 * - different offsets inside a file (bookend extents in file.c) 948 * 949 * The extent ref structure for the implicit back refs has fields for: 950 * 951 * - Objectid of the subvolume root 952 * - objectid of the file holding the reference 953 * - original offset in the file 954 * - how many bookend extents 955 * 956 * The key offset for the implicit back refs is hash of the first 957 * three fields. 958 * 959 * The extent ref structure for the full back refs has field for: 960 * 961 * - number of pointers in the tree leaf 962 * 963 * The key offset for the implicit back refs is the first byte of 964 * the tree leaf 965 * 966 * When a file extent is allocated, The implicit back refs is used. 967 * the fields are filled in: 968 * 969 * (root_key.objectid, inode objectid, offset in file, 1) 970 * 971 * When a file extent is removed file truncation, we find the 972 * corresponding implicit back refs and check the following fields: 973 * 974 * (btrfs_header_owner(leaf), inode objectid, offset in file) 975 * 976 * Btree extents can be referenced by: 977 * 978 * - Different subvolumes 979 * 980 * Both the implicit back refs and the full back refs for tree blocks 981 * only consist of key. The key offset for the implicit back refs is 982 * objectid of block's owner tree. The key offset for the full back refs 983 * is the first byte of parent block. 984 * 985 * When implicit back refs is used, information about the lowest key and 986 * level of the tree block are required. These information are stored in 987 * tree block info structure. 988 */ 989 990 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 991 static int convert_extent_item_v0(struct btrfs_trans_handle *trans, 992 struct btrfs_root *root, 993 struct btrfs_path *path, 994 u64 owner, u32 extra_size) 995 { 996 struct btrfs_extent_item *item; 997 struct btrfs_extent_item_v0 *ei0; 998 struct btrfs_extent_ref_v0 *ref0; 999 struct btrfs_tree_block_info *bi; 1000 struct extent_buffer *leaf; 1001 struct btrfs_key key; 1002 struct btrfs_key found_key; 1003 u32 new_size = sizeof(*item); 1004 u64 refs; 1005 int ret; 1006 1007 leaf = path->nodes[0]; 1008 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); 1009 1010 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1011 ei0 = btrfs_item_ptr(leaf, path->slots[0], 1012 struct btrfs_extent_item_v0); 1013 refs = btrfs_extent_refs_v0(leaf, ei0); 1014 1015 if (owner == (u64)-1) { 1016 while (1) { 1017 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1018 ret = btrfs_next_leaf(root, path); 1019 if (ret < 0) 1020 return ret; 1021 BUG_ON(ret > 0); /* Corruption */ 1022 leaf = path->nodes[0]; 1023 } 1024 btrfs_item_key_to_cpu(leaf, &found_key, 1025 path->slots[0]); 1026 BUG_ON(key.objectid != found_key.objectid); 1027 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { 1028 path->slots[0]++; 1029 continue; 1030 } 1031 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1032 struct btrfs_extent_ref_v0); 1033 owner = btrfs_ref_objectid_v0(leaf, ref0); 1034 break; 1035 } 1036 } 1037 btrfs_release_path(path); 1038 1039 if (owner < BTRFS_FIRST_FREE_OBJECTID) 1040 new_size += sizeof(*bi); 1041 1042 new_size -= sizeof(*ei0); 1043 ret = btrfs_search_slot(trans, root, &key, path, 1044 new_size + extra_size, 1); 1045 if (ret < 0) 1046 return ret; 1047 BUG_ON(ret); /* Corruption */ 1048 1049 btrfs_extend_item(root, path, new_size); 1050 1051 leaf 
= path->nodes[0]; 1052 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1053 btrfs_set_extent_refs(leaf, item, refs); 1054 /* FIXME: get real generation */ 1055 btrfs_set_extent_generation(leaf, item, 0); 1056 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1057 btrfs_set_extent_flags(leaf, item, 1058 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1059 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1060 bi = (struct btrfs_tree_block_info *)(item + 1); 1061 /* FIXME: get first key of the block */ 1062 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 1063 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1064 } else { 1065 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1066 } 1067 btrfs_mark_buffer_dirty(leaf); 1068 return 0; 1069 } 1070 #endif 1071 1072 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1073 { 1074 u32 high_crc = ~(u32)0; 1075 u32 low_crc = ~(u32)0; 1076 __le64 lenum; 1077 1078 lenum = cpu_to_le64(root_objectid); 1079 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1080 lenum = cpu_to_le64(owner); 1081 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1082 lenum = cpu_to_le64(offset); 1083 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1084 1085 return ((u64)high_crc << 31) ^ (u64)low_crc; 1086 } 1087 1088 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1089 struct btrfs_extent_data_ref *ref) 1090 { 1091 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1092 btrfs_extent_data_ref_objectid(leaf, ref), 1093 btrfs_extent_data_ref_offset(leaf, ref)); 1094 } 1095 1096 static int match_extent_data_ref(struct extent_buffer *leaf, 1097 struct btrfs_extent_data_ref *ref, 1098 u64 root_objectid, u64 owner, u64 offset) 1099 { 1100 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1101 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1102 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1103 return 0; 1104 return 1; 1105 } 1106 1107 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1108 struct btrfs_root *root, 1109 struct btrfs_path *path, 1110 u64 bytenr, u64 parent, 1111 u64 root_objectid, 1112 u64 owner, u64 offset) 1113 { 1114 struct btrfs_key key; 1115 struct btrfs_extent_data_ref *ref; 1116 struct extent_buffer *leaf; 1117 u32 nritems; 1118 int ret; 1119 int recow; 1120 int err = -ENOENT; 1121 1122 key.objectid = bytenr; 1123 if (parent) { 1124 key.type = BTRFS_SHARED_DATA_REF_KEY; 1125 key.offset = parent; 1126 } else { 1127 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1128 key.offset = hash_extent_data_ref(root_objectid, 1129 owner, offset); 1130 } 1131 again: 1132 recow = 0; 1133 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1134 if (ret < 0) { 1135 err = ret; 1136 goto fail; 1137 } 1138 1139 if (parent) { 1140 if (!ret) 1141 return 0; 1142 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1143 key.type = BTRFS_EXTENT_REF_V0_KEY; 1144 btrfs_release_path(path); 1145 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1146 if (ret < 0) { 1147 err = ret; 1148 goto fail; 1149 } 1150 if (!ret) 1151 return 0; 1152 #endif 1153 goto fail; 1154 } 1155 1156 leaf = path->nodes[0]; 1157 nritems = btrfs_header_nritems(leaf); 1158 while (1) { 1159 if (path->slots[0] >= nritems) { 1160 ret = btrfs_next_leaf(root, path); 1161 if (ret < 0) 1162 err = ret; 1163 if (ret) 1164 goto fail; 1165 1166 leaf = path->nodes[0]; 1167 nritems = btrfs_header_nritems(leaf); 1168 recow = 1; 1169 } 1170 1171 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1172 if 
(key.objectid != bytenr || 1173 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1174 goto fail; 1175 1176 ref = btrfs_item_ptr(leaf, path->slots[0], 1177 struct btrfs_extent_data_ref); 1178 1179 if (match_extent_data_ref(leaf, ref, root_objectid, 1180 owner, offset)) { 1181 if (recow) { 1182 btrfs_release_path(path); 1183 goto again; 1184 } 1185 err = 0; 1186 break; 1187 } 1188 path->slots[0]++; 1189 } 1190 fail: 1191 return err; 1192 } 1193 1194 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1195 struct btrfs_root *root, 1196 struct btrfs_path *path, 1197 u64 bytenr, u64 parent, 1198 u64 root_objectid, u64 owner, 1199 u64 offset, int refs_to_add) 1200 { 1201 struct btrfs_key key; 1202 struct extent_buffer *leaf; 1203 u32 size; 1204 u32 num_refs; 1205 int ret; 1206 1207 key.objectid = bytenr; 1208 if (parent) { 1209 key.type = BTRFS_SHARED_DATA_REF_KEY; 1210 key.offset = parent; 1211 size = sizeof(struct btrfs_shared_data_ref); 1212 } else { 1213 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1214 key.offset = hash_extent_data_ref(root_objectid, 1215 owner, offset); 1216 size = sizeof(struct btrfs_extent_data_ref); 1217 } 1218 1219 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1220 if (ret && ret != -EEXIST) 1221 goto fail; 1222 1223 leaf = path->nodes[0]; 1224 if (parent) { 1225 struct btrfs_shared_data_ref *ref; 1226 ref = btrfs_item_ptr(leaf, path->slots[0], 1227 struct btrfs_shared_data_ref); 1228 if (ret == 0) { 1229 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1230 } else { 1231 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1232 num_refs += refs_to_add; 1233 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1234 } 1235 } else { 1236 struct btrfs_extent_data_ref *ref; 1237 while (ret == -EEXIST) { 1238 ref = btrfs_item_ptr(leaf, path->slots[0], 1239 struct btrfs_extent_data_ref); 1240 if (match_extent_data_ref(leaf, ref, root_objectid, 1241 owner, offset)) 1242 break; 1243 btrfs_release_path(path); 1244 key.offset++; 1245 ret = btrfs_insert_empty_item(trans, root, path, &key, 1246 size); 1247 if (ret && ret != -EEXIST) 1248 goto fail; 1249 1250 leaf = path->nodes[0]; 1251 } 1252 ref = btrfs_item_ptr(leaf, path->slots[0], 1253 struct btrfs_extent_data_ref); 1254 if (ret == 0) { 1255 btrfs_set_extent_data_ref_root(leaf, ref, 1256 root_objectid); 1257 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1258 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1259 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1260 } else { 1261 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1262 num_refs += refs_to_add; 1263 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1264 } 1265 } 1266 btrfs_mark_buffer_dirty(leaf); 1267 ret = 0; 1268 fail: 1269 btrfs_release_path(path); 1270 return ret; 1271 } 1272 1273 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1274 struct btrfs_root *root, 1275 struct btrfs_path *path, 1276 int refs_to_drop, int *last_ref) 1277 { 1278 struct btrfs_key key; 1279 struct btrfs_extent_data_ref *ref1 = NULL; 1280 struct btrfs_shared_data_ref *ref2 = NULL; 1281 struct extent_buffer *leaf; 1282 u32 num_refs = 0; 1283 int ret = 0; 1284 1285 leaf = path->nodes[0]; 1286 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1287 1288 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1289 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1290 struct btrfs_extent_data_ref); 1291 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1292 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1293 ref2 = 
btrfs_item_ptr(leaf, path->slots[0], 1294 struct btrfs_shared_data_ref); 1295 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1296 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1297 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1298 struct btrfs_extent_ref_v0 *ref0; 1299 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1300 struct btrfs_extent_ref_v0); 1301 num_refs = btrfs_ref_count_v0(leaf, ref0); 1302 #endif 1303 } else { 1304 BUG(); 1305 } 1306 1307 BUG_ON(num_refs < refs_to_drop); 1308 num_refs -= refs_to_drop; 1309 1310 if (num_refs == 0) { 1311 ret = btrfs_del_item(trans, root, path); 1312 *last_ref = 1; 1313 } else { 1314 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1315 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1316 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1317 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1318 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1319 else { 1320 struct btrfs_extent_ref_v0 *ref0; 1321 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1322 struct btrfs_extent_ref_v0); 1323 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1324 } 1325 #endif 1326 btrfs_mark_buffer_dirty(leaf); 1327 } 1328 return ret; 1329 } 1330 1331 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1332 struct btrfs_path *path, 1333 struct btrfs_extent_inline_ref *iref) 1334 { 1335 struct btrfs_key key; 1336 struct extent_buffer *leaf; 1337 struct btrfs_extent_data_ref *ref1; 1338 struct btrfs_shared_data_ref *ref2; 1339 u32 num_refs = 0; 1340 1341 leaf = path->nodes[0]; 1342 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1343 if (iref) { 1344 if (btrfs_extent_inline_ref_type(leaf, iref) == 1345 BTRFS_EXTENT_DATA_REF_KEY) { 1346 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1347 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1348 } else { 1349 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1350 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1351 } 1352 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1353 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1354 struct btrfs_extent_data_ref); 1355 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1356 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1357 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1358 struct btrfs_shared_data_ref); 1359 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1360 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1361 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1362 struct btrfs_extent_ref_v0 *ref0; 1363 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1364 struct btrfs_extent_ref_v0); 1365 num_refs = btrfs_ref_count_v0(leaf, ref0); 1366 #endif 1367 } else { 1368 WARN_ON(1); 1369 } 1370 return num_refs; 1371 } 1372 1373 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1374 struct btrfs_root *root, 1375 struct btrfs_path *path, 1376 u64 bytenr, u64 parent, 1377 u64 root_objectid) 1378 { 1379 struct btrfs_key key; 1380 int ret; 1381 1382 key.objectid = bytenr; 1383 if (parent) { 1384 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1385 key.offset = parent; 1386 } else { 1387 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1388 key.offset = root_objectid; 1389 } 1390 1391 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1392 if (ret > 0) 1393 ret = -ENOENT; 1394 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1395 if (ret == -ENOENT && parent) { 1396 btrfs_release_path(path); 1397 key.type = BTRFS_EXTENT_REF_V0_KEY; 1398 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1399 if (ret > 0) 1400 ret = -ENOENT; 1401 } 1402 #endif 1403 return ret; 1404 } 1405 1406 static noinline 
int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1407 struct btrfs_root *root, 1408 struct btrfs_path *path, 1409 u64 bytenr, u64 parent, 1410 u64 root_objectid) 1411 { 1412 struct btrfs_key key; 1413 int ret; 1414 1415 key.objectid = bytenr; 1416 if (parent) { 1417 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1418 key.offset = parent; 1419 } else { 1420 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1421 key.offset = root_objectid; 1422 } 1423 1424 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1425 btrfs_release_path(path); 1426 return ret; 1427 } 1428 1429 static inline int extent_ref_type(u64 parent, u64 owner) 1430 { 1431 int type; 1432 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1433 if (parent > 0) 1434 type = BTRFS_SHARED_BLOCK_REF_KEY; 1435 else 1436 type = BTRFS_TREE_BLOCK_REF_KEY; 1437 } else { 1438 if (parent > 0) 1439 type = BTRFS_SHARED_DATA_REF_KEY; 1440 else 1441 type = BTRFS_EXTENT_DATA_REF_KEY; 1442 } 1443 return type; 1444 } 1445 1446 static int find_next_key(struct btrfs_path *path, int level, 1447 struct btrfs_key *key) 1448 1449 { 1450 for (; level < BTRFS_MAX_LEVEL; level++) { 1451 if (!path->nodes[level]) 1452 break; 1453 if (path->slots[level] + 1 >= 1454 btrfs_header_nritems(path->nodes[level])) 1455 continue; 1456 if (level == 0) 1457 btrfs_item_key_to_cpu(path->nodes[level], key, 1458 path->slots[level] + 1); 1459 else 1460 btrfs_node_key_to_cpu(path->nodes[level], key, 1461 path->slots[level] + 1); 1462 return 0; 1463 } 1464 return 1; 1465 } 1466 1467 /* 1468 * look for inline back ref. if back ref is found, *ref_ret is set 1469 * to the address of inline back ref, and 0 is returned. 1470 * 1471 * if back ref isn't found, *ref_ret is set to the address where it 1472 * should be inserted, and -ENOENT is returned. 1473 * 1474 * if insert is true and there are too many inline back refs, the path 1475 * points to the extent item, and -EAGAIN is returned. 1476 * 1477 * NOTE: inline back refs are ordered in the same way that back ref 1478 * items in the tree are ordered. 1479 */ 1480 static noinline_for_stack 1481 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1482 struct btrfs_root *root, 1483 struct btrfs_path *path, 1484 struct btrfs_extent_inline_ref **ref_ret, 1485 u64 bytenr, u64 num_bytes, 1486 u64 parent, u64 root_objectid, 1487 u64 owner, u64 offset, int insert) 1488 { 1489 struct btrfs_key key; 1490 struct extent_buffer *leaf; 1491 struct btrfs_extent_item *ei; 1492 struct btrfs_extent_inline_ref *iref; 1493 u64 flags; 1494 u64 item_size; 1495 unsigned long ptr; 1496 unsigned long end; 1497 int extra_size; 1498 int type; 1499 int want; 1500 int ret; 1501 int err = 0; 1502 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 1503 SKINNY_METADATA); 1504 1505 key.objectid = bytenr; 1506 key.type = BTRFS_EXTENT_ITEM_KEY; 1507 key.offset = num_bytes; 1508 1509 want = extent_ref_type(parent, owner); 1510 if (insert) { 1511 extra_size = btrfs_extent_inline_ref_size(want); 1512 path->keep_locks = 1; 1513 } else 1514 extra_size = -1; 1515 1516 /* 1517 * Owner is our parent level, so we can just add one to get the level 1518 * for the block we are interested in. 
1519 */ 1520 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1521 key.type = BTRFS_METADATA_ITEM_KEY; 1522 key.offset = owner; 1523 } 1524 1525 again: 1526 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1527 if (ret < 0) { 1528 err = ret; 1529 goto out; 1530 } 1531 1532 /* 1533 * We may be a newly converted file system which still has the old fat 1534 * extent entries for metadata, so try and see if we have one of those. 1535 */ 1536 if (ret > 0 && skinny_metadata) { 1537 skinny_metadata = false; 1538 if (path->slots[0]) { 1539 path->slots[0]--; 1540 btrfs_item_key_to_cpu(path->nodes[0], &key, 1541 path->slots[0]); 1542 if (key.objectid == bytenr && 1543 key.type == BTRFS_EXTENT_ITEM_KEY && 1544 key.offset == num_bytes) 1545 ret = 0; 1546 } 1547 if (ret) { 1548 key.objectid = bytenr; 1549 key.type = BTRFS_EXTENT_ITEM_KEY; 1550 key.offset = num_bytes; 1551 btrfs_release_path(path); 1552 goto again; 1553 } 1554 } 1555 1556 if (ret && !insert) { 1557 err = -ENOENT; 1558 goto out; 1559 } else if (WARN_ON(ret)) { 1560 err = -EIO; 1561 goto out; 1562 } 1563 1564 leaf = path->nodes[0]; 1565 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1566 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1567 if (item_size < sizeof(*ei)) { 1568 if (!insert) { 1569 err = -ENOENT; 1570 goto out; 1571 } 1572 ret = convert_extent_item_v0(trans, root, path, owner, 1573 extra_size); 1574 if (ret < 0) { 1575 err = ret; 1576 goto out; 1577 } 1578 leaf = path->nodes[0]; 1579 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1580 } 1581 #endif 1582 BUG_ON(item_size < sizeof(*ei)); 1583 1584 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1585 flags = btrfs_extent_flags(leaf, ei); 1586 1587 ptr = (unsigned long)(ei + 1); 1588 end = (unsigned long)ei + item_size; 1589 1590 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1591 ptr += sizeof(struct btrfs_tree_block_info); 1592 BUG_ON(ptr > end); 1593 } 1594 1595 err = -ENOENT; 1596 while (1) { 1597 if (ptr >= end) { 1598 WARN_ON(ptr > end); 1599 break; 1600 } 1601 iref = (struct btrfs_extent_inline_ref *)ptr; 1602 type = btrfs_extent_inline_ref_type(leaf, iref); 1603 if (want < type) 1604 break; 1605 if (want > type) { 1606 ptr += btrfs_extent_inline_ref_size(type); 1607 continue; 1608 } 1609 1610 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1611 struct btrfs_extent_data_ref *dref; 1612 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1613 if (match_extent_data_ref(leaf, dref, root_objectid, 1614 owner, offset)) { 1615 err = 0; 1616 break; 1617 } 1618 if (hash_extent_data_ref_item(leaf, dref) < 1619 hash_extent_data_ref(root_objectid, owner, offset)) 1620 break; 1621 } else { 1622 u64 ref_offset; 1623 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1624 if (parent > 0) { 1625 if (parent == ref_offset) { 1626 err = 0; 1627 break; 1628 } 1629 if (ref_offset < parent) 1630 break; 1631 } else { 1632 if (root_objectid == ref_offset) { 1633 err = 0; 1634 break; 1635 } 1636 if (ref_offset < root_objectid) 1637 break; 1638 } 1639 } 1640 ptr += btrfs_extent_inline_ref_size(type); 1641 } 1642 if (err == -ENOENT && insert) { 1643 if (item_size + extra_size >= 1644 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1645 err = -EAGAIN; 1646 goto out; 1647 } 1648 /* 1649 * To add new inline back ref, we have to make sure 1650 * there is no corresponding back ref item. 
1651 * For simplicity, we just do not add new inline back 1652 * ref if there is any kind of item for this block 1653 */ 1654 if (find_next_key(path, 0, &key) == 0 && 1655 key.objectid == bytenr && 1656 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1657 err = -EAGAIN; 1658 goto out; 1659 } 1660 } 1661 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1662 out: 1663 if (insert) { 1664 path->keep_locks = 0; 1665 btrfs_unlock_up_safe(path, 1); 1666 } 1667 return err; 1668 } 1669 1670 /* 1671 * helper to add new inline back ref 1672 */ 1673 static noinline_for_stack 1674 void setup_inline_extent_backref(struct btrfs_root *root, 1675 struct btrfs_path *path, 1676 struct btrfs_extent_inline_ref *iref, 1677 u64 parent, u64 root_objectid, 1678 u64 owner, u64 offset, int refs_to_add, 1679 struct btrfs_delayed_extent_op *extent_op) 1680 { 1681 struct extent_buffer *leaf; 1682 struct btrfs_extent_item *ei; 1683 unsigned long ptr; 1684 unsigned long end; 1685 unsigned long item_offset; 1686 u64 refs; 1687 int size; 1688 int type; 1689 1690 leaf = path->nodes[0]; 1691 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1692 item_offset = (unsigned long)iref - (unsigned long)ei; 1693 1694 type = extent_ref_type(parent, owner); 1695 size = btrfs_extent_inline_ref_size(type); 1696 1697 btrfs_extend_item(root, path, size); 1698 1699 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1700 refs = btrfs_extent_refs(leaf, ei); 1701 refs += refs_to_add; 1702 btrfs_set_extent_refs(leaf, ei, refs); 1703 if (extent_op) 1704 __run_delayed_extent_op(extent_op, leaf, ei); 1705 1706 ptr = (unsigned long)ei + item_offset; 1707 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1708 if (ptr < end - size) 1709 memmove_extent_buffer(leaf, ptr + size, ptr, 1710 end - size - ptr); 1711 1712 iref = (struct btrfs_extent_inline_ref *)ptr; 1713 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1714 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1715 struct btrfs_extent_data_ref *dref; 1716 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1717 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1718 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1719 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1720 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1721 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1722 struct btrfs_shared_data_ref *sref; 1723 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1724 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1725 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1726 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1727 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1728 } else { 1729 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1730 } 1731 btrfs_mark_buffer_dirty(leaf); 1732 } 1733 1734 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1735 struct btrfs_root *root, 1736 struct btrfs_path *path, 1737 struct btrfs_extent_inline_ref **ref_ret, 1738 u64 bytenr, u64 num_bytes, u64 parent, 1739 u64 root_objectid, u64 owner, u64 offset) 1740 { 1741 int ret; 1742 1743 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1744 bytenr, num_bytes, parent, 1745 root_objectid, owner, offset, 0); 1746 if (ret != -ENOENT) 1747 return ret; 1748 1749 btrfs_release_path(path); 1750 *ref_ret = NULL; 1751 1752 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1753 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1754 root_objectid); 1755 } else { 1756 ret 
= lookup_extent_data_ref(trans, root, path, bytenr, parent, 1757 root_objectid, owner, offset); 1758 } 1759 return ret; 1760 } 1761 1762 /* 1763 * helper to update/remove inline back ref 1764 */ 1765 static noinline_for_stack 1766 void update_inline_extent_backref(struct btrfs_root *root, 1767 struct btrfs_path *path, 1768 struct btrfs_extent_inline_ref *iref, 1769 int refs_to_mod, 1770 struct btrfs_delayed_extent_op *extent_op, 1771 int *last_ref) 1772 { 1773 struct extent_buffer *leaf; 1774 struct btrfs_extent_item *ei; 1775 struct btrfs_extent_data_ref *dref = NULL; 1776 struct btrfs_shared_data_ref *sref = NULL; 1777 unsigned long ptr; 1778 unsigned long end; 1779 u32 item_size; 1780 int size; 1781 int type; 1782 u64 refs; 1783 1784 leaf = path->nodes[0]; 1785 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1786 refs = btrfs_extent_refs(leaf, ei); 1787 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1788 refs += refs_to_mod; 1789 btrfs_set_extent_refs(leaf, ei, refs); 1790 if (extent_op) 1791 __run_delayed_extent_op(extent_op, leaf, ei); 1792 1793 type = btrfs_extent_inline_ref_type(leaf, iref); 1794 1795 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1796 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1797 refs = btrfs_extent_data_ref_count(leaf, dref); 1798 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1799 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1800 refs = btrfs_shared_data_ref_count(leaf, sref); 1801 } else { 1802 refs = 1; 1803 BUG_ON(refs_to_mod != -1); 1804 } 1805 1806 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1807 refs += refs_to_mod; 1808 1809 if (refs > 0) { 1810 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1811 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1812 else 1813 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1814 } else { 1815 *last_ref = 1; 1816 size = btrfs_extent_inline_ref_size(type); 1817 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1818 ptr = (unsigned long)iref; 1819 end = (unsigned long)ei + item_size; 1820 if (ptr + size < end) 1821 memmove_extent_buffer(leaf, ptr, ptr + size, 1822 end - ptr - size); 1823 item_size -= size; 1824 btrfs_truncate_item(root, path, item_size, 1); 1825 } 1826 btrfs_mark_buffer_dirty(leaf); 1827 } 1828 1829 static noinline_for_stack 1830 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1831 struct btrfs_root *root, 1832 struct btrfs_path *path, 1833 u64 bytenr, u64 num_bytes, u64 parent, 1834 u64 root_objectid, u64 owner, 1835 u64 offset, int refs_to_add, 1836 struct btrfs_delayed_extent_op *extent_op) 1837 { 1838 struct btrfs_extent_inline_ref *iref; 1839 int ret; 1840 1841 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1842 bytenr, num_bytes, parent, 1843 root_objectid, owner, offset, 1); 1844 if (ret == 0) { 1845 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1846 update_inline_extent_backref(root, path, iref, 1847 refs_to_add, extent_op, NULL); 1848 } else if (ret == -ENOENT) { 1849 setup_inline_extent_backref(root, path, iref, parent, 1850 root_objectid, owner, offset, 1851 refs_to_add, extent_op); 1852 ret = 0; 1853 } 1854 return ret; 1855 } 1856 1857 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1858 struct btrfs_root *root, 1859 struct btrfs_path *path, 1860 u64 bytenr, u64 parent, u64 root_objectid, 1861 u64 owner, u64 offset, int refs_to_add) 1862 { 1863 int ret; 1864 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1865 BUG_ON(refs_to_add != 1); 1866 ret = insert_tree_block_ref(trans, root, path, bytenr, 1867 parent, 
root_objectid); 1868 } else { 1869 ret = insert_extent_data_ref(trans, root, path, bytenr, 1870 parent, root_objectid, 1871 owner, offset, refs_to_add); 1872 } 1873 return ret; 1874 } 1875 1876 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1877 struct btrfs_root *root, 1878 struct btrfs_path *path, 1879 struct btrfs_extent_inline_ref *iref, 1880 int refs_to_drop, int is_data, int *last_ref) 1881 { 1882 int ret = 0; 1883 1884 BUG_ON(!is_data && refs_to_drop != 1); 1885 if (iref) { 1886 update_inline_extent_backref(root, path, iref, 1887 -refs_to_drop, NULL, last_ref); 1888 } else if (is_data) { 1889 ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1890 last_ref); 1891 } else { 1892 *last_ref = 1; 1893 ret = btrfs_del_item(trans, root, path); 1894 } 1895 return ret; 1896 } 1897 1898 static int btrfs_issue_discard(struct block_device *bdev, 1899 u64 start, u64 len) 1900 { 1901 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); 1902 } 1903 1904 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1905 u64 num_bytes, u64 *actual_bytes) 1906 { 1907 int ret; 1908 u64 discarded_bytes = 0; 1909 struct btrfs_bio *bbio = NULL; 1910 1911 1912 /* Tell the block device(s) that the sectors can be discarded */ 1913 ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1914 bytenr, &num_bytes, &bbio, 0); 1915 /* Error condition is -ENOMEM */ 1916 if (!ret) { 1917 struct btrfs_bio_stripe *stripe = bbio->stripes; 1918 int i; 1919 1920 1921 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1922 if (!stripe->dev->can_discard) 1923 continue; 1924 1925 ret = btrfs_issue_discard(stripe->dev->bdev, 1926 stripe->physical, 1927 stripe->length); 1928 if (!ret) 1929 discarded_bytes += stripe->length; 1930 else if (ret != -EOPNOTSUPP) 1931 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 1932 1933 /* 1934 * Just in case we get back EOPNOTSUPP for some reason, 1935 * just ignore the return value so we don't screw up 1936 * people calling discard_extent. 
1937 */ 1938 ret = 0; 1939 } 1940 kfree(bbio); 1941 } 1942 1943 if (actual_bytes) 1944 *actual_bytes = discarded_bytes; 1945 1946 1947 if (ret == -EOPNOTSUPP) 1948 ret = 0; 1949 return ret; 1950 } 1951 1952 /* Can return -ENOMEM */ 1953 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1954 struct btrfs_root *root, 1955 u64 bytenr, u64 num_bytes, u64 parent, 1956 u64 root_objectid, u64 owner, u64 offset, 1957 int no_quota) 1958 { 1959 int ret; 1960 struct btrfs_fs_info *fs_info = root->fs_info; 1961 1962 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1963 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1964 1965 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1966 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1967 num_bytes, 1968 parent, root_objectid, (int)owner, 1969 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1970 } else { 1971 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1972 num_bytes, 1973 parent, root_objectid, owner, offset, 1974 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1975 } 1976 return ret; 1977 } 1978 1979 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1980 struct btrfs_root *root, 1981 u64 bytenr, u64 num_bytes, 1982 u64 parent, u64 root_objectid, 1983 u64 owner, u64 offset, int refs_to_add, 1984 int no_quota, 1985 struct btrfs_delayed_extent_op *extent_op) 1986 { 1987 struct btrfs_fs_info *fs_info = root->fs_info; 1988 struct btrfs_path *path; 1989 struct extent_buffer *leaf; 1990 struct btrfs_extent_item *item; 1991 struct btrfs_key key; 1992 u64 refs; 1993 int ret; 1994 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; 1995 1996 path = btrfs_alloc_path(); 1997 if (!path) 1998 return -ENOMEM; 1999 2000 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) 2001 no_quota = 1; 2002 2003 path->reada = 1; 2004 path->leave_spinning = 1; 2005 /* this will setup the path even if it fails to insert the back ref */ 2006 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, 2007 bytenr, num_bytes, parent, 2008 root_objectid, owner, offset, 2009 refs_to_add, extent_op); 2010 if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) 2011 goto out; 2012 /* 2013 * Ok we were able to insert an inline extent and it appears to be a new 2014 * reference, deal with the qgroup accounting. 2015 */ 2016 if (!ret && !no_quota) { 2017 ASSERT(root->fs_info->quota_enabled); 2018 leaf = path->nodes[0]; 2019 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2020 item = btrfs_item_ptr(leaf, path->slots[0], 2021 struct btrfs_extent_item); 2022 if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) 2023 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2024 btrfs_release_path(path); 2025 2026 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2027 bytenr, num_bytes, type, 0); 2028 goto out; 2029 } 2030 2031 /* 2032 * Ok we had -EAGAIN which means we didn't have space to insert and 2033 * inline extent ref, so just update the reference count and add a 2034 * normal backref. 
2035 */ 2036 leaf = path->nodes[0]; 2037 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2038 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2039 refs = btrfs_extent_refs(leaf, item); 2040 if (refs) 2041 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2042 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2043 if (extent_op) 2044 __run_delayed_extent_op(extent_op, leaf, item); 2045 2046 btrfs_mark_buffer_dirty(leaf); 2047 btrfs_release_path(path); 2048 2049 if (!no_quota) { 2050 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2051 bytenr, num_bytes, type, 0); 2052 if (ret) 2053 goto out; 2054 } 2055 2056 path->reada = 1; 2057 path->leave_spinning = 1; 2058 /* now insert the actual backref */ 2059 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2060 path, bytenr, parent, root_objectid, 2061 owner, offset, refs_to_add); 2062 if (ret) 2063 btrfs_abort_transaction(trans, root, ret); 2064 out: 2065 btrfs_free_path(path); 2066 return ret; 2067 } 2068 2069 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2070 struct btrfs_root *root, 2071 struct btrfs_delayed_ref_node *node, 2072 struct btrfs_delayed_extent_op *extent_op, 2073 int insert_reserved) 2074 { 2075 int ret = 0; 2076 struct btrfs_delayed_data_ref *ref; 2077 struct btrfs_key ins; 2078 u64 parent = 0; 2079 u64 ref_root = 0; 2080 u64 flags = 0; 2081 2082 ins.objectid = node->bytenr; 2083 ins.offset = node->num_bytes; 2084 ins.type = BTRFS_EXTENT_ITEM_KEY; 2085 2086 ref = btrfs_delayed_node_to_data_ref(node); 2087 trace_run_delayed_data_ref(node, ref, node->action); 2088 2089 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2090 parent = ref->parent; 2091 ref_root = ref->root; 2092 2093 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2094 if (extent_op) 2095 flags |= extent_op->flags_to_set; 2096 ret = alloc_reserved_file_extent(trans, root, 2097 parent, ref_root, flags, 2098 ref->objectid, ref->offset, 2099 &ins, node->ref_mod); 2100 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2101 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2102 node->num_bytes, parent, 2103 ref_root, ref->objectid, 2104 ref->offset, node->ref_mod, 2105 node->no_quota, extent_op); 2106 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2107 ret = __btrfs_free_extent(trans, root, node->bytenr, 2108 node->num_bytes, parent, 2109 ref_root, ref->objectid, 2110 ref->offset, node->ref_mod, 2111 extent_op, node->no_quota); 2112 } else { 2113 BUG(); 2114 } 2115 return ret; 2116 } 2117 2118 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2119 struct extent_buffer *leaf, 2120 struct btrfs_extent_item *ei) 2121 { 2122 u64 flags = btrfs_extent_flags(leaf, ei); 2123 if (extent_op->update_flags) { 2124 flags |= extent_op->flags_to_set; 2125 btrfs_set_extent_flags(leaf, ei, flags); 2126 } 2127 2128 if (extent_op->update_key) { 2129 struct btrfs_tree_block_info *bi; 2130 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2131 bi = (struct btrfs_tree_block_info *)(ei + 1); 2132 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2133 } 2134 } 2135 2136 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2137 struct btrfs_root *root, 2138 struct btrfs_delayed_ref_node *node, 2139 struct btrfs_delayed_extent_op *extent_op) 2140 { 2141 struct btrfs_key key; 2142 struct btrfs_path *path; 2143 struct btrfs_extent_item *ei; 2144 struct extent_buffer *leaf; 2145 u32 item_size; 2146 int ret; 2147 int err = 0; 2148 int metadata = !extent_op->is_data; 2149 
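	/*
	 * Write the flag/key update carried by @extent_op into the extent
	 * item for this ref head.  With the skinny-metadata feature the item
	 * is keyed (bytenr, METADATA_ITEM_KEY, level); if that lookup misses
	 * we retry below with the old (bytenr, EXTENT_ITEM_KEY, num_bytes)
	 * key.
	 */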
2150 if (trans->aborted) 2151 return 0; 2152 2153 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2154 metadata = 0; 2155 2156 path = btrfs_alloc_path(); 2157 if (!path) 2158 return -ENOMEM; 2159 2160 key.objectid = node->bytenr; 2161 2162 if (metadata) { 2163 key.type = BTRFS_METADATA_ITEM_KEY; 2164 key.offset = extent_op->level; 2165 } else { 2166 key.type = BTRFS_EXTENT_ITEM_KEY; 2167 key.offset = node->num_bytes; 2168 } 2169 2170 again: 2171 path->reada = 1; 2172 path->leave_spinning = 1; 2173 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2174 path, 0, 1); 2175 if (ret < 0) { 2176 err = ret; 2177 goto out; 2178 } 2179 if (ret > 0) { 2180 if (metadata) { 2181 if (path->slots[0] > 0) { 2182 path->slots[0]--; 2183 btrfs_item_key_to_cpu(path->nodes[0], &key, 2184 path->slots[0]); 2185 if (key.objectid == node->bytenr && 2186 key.type == BTRFS_EXTENT_ITEM_KEY && 2187 key.offset == node->num_bytes) 2188 ret = 0; 2189 } 2190 if (ret > 0) { 2191 btrfs_release_path(path); 2192 metadata = 0; 2193 2194 key.objectid = node->bytenr; 2195 key.offset = node->num_bytes; 2196 key.type = BTRFS_EXTENT_ITEM_KEY; 2197 goto again; 2198 } 2199 } else { 2200 err = -EIO; 2201 goto out; 2202 } 2203 } 2204 2205 leaf = path->nodes[0]; 2206 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2207 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2208 if (item_size < sizeof(*ei)) { 2209 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2210 path, (u64)-1, 0); 2211 if (ret < 0) { 2212 err = ret; 2213 goto out; 2214 } 2215 leaf = path->nodes[0]; 2216 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2217 } 2218 #endif 2219 BUG_ON(item_size < sizeof(*ei)); 2220 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2221 __run_delayed_extent_op(extent_op, leaf, ei); 2222 2223 btrfs_mark_buffer_dirty(leaf); 2224 out: 2225 btrfs_free_path(path); 2226 return err; 2227 } 2228 2229 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2230 struct btrfs_root *root, 2231 struct btrfs_delayed_ref_node *node, 2232 struct btrfs_delayed_extent_op *extent_op, 2233 int insert_reserved) 2234 { 2235 int ret = 0; 2236 struct btrfs_delayed_tree_ref *ref; 2237 struct btrfs_key ins; 2238 u64 parent = 0; 2239 u64 ref_root = 0; 2240 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2241 SKINNY_METADATA); 2242 2243 ref = btrfs_delayed_node_to_tree_ref(node); 2244 trace_run_delayed_tree_ref(node, ref, node->action); 2245 2246 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2247 parent = ref->parent; 2248 ref_root = ref->root; 2249 2250 ins.objectid = node->bytenr; 2251 if (skinny_metadata) { 2252 ins.offset = ref->level; 2253 ins.type = BTRFS_METADATA_ITEM_KEY; 2254 } else { 2255 ins.offset = node->num_bytes; 2256 ins.type = BTRFS_EXTENT_ITEM_KEY; 2257 } 2258 2259 BUG_ON(node->ref_mod != 1); 2260 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2261 BUG_ON(!extent_op || !extent_op->update_flags); 2262 ret = alloc_reserved_tree_block(trans, root, 2263 parent, ref_root, 2264 extent_op->flags_to_set, 2265 &extent_op->key, 2266 ref->level, &ins, 2267 node->no_quota); 2268 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2269 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2270 node->num_bytes, parent, ref_root, 2271 ref->level, 0, 1, node->no_quota, 2272 extent_op); 2273 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2274 ret = __btrfs_free_extent(trans, root, node->bytenr, 2275 node->num_bytes, parent, ref_root, 2276 ref->level, 0, 1, 
extent_op, 2277 node->no_quota); 2278 } else { 2279 BUG(); 2280 } 2281 return ret; 2282 } 2283 2284 /* helper function to actually process a single delayed ref entry */ 2285 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2286 struct btrfs_root *root, 2287 struct btrfs_delayed_ref_node *node, 2288 struct btrfs_delayed_extent_op *extent_op, 2289 int insert_reserved) 2290 { 2291 int ret = 0; 2292 2293 if (trans->aborted) { 2294 if (insert_reserved) 2295 btrfs_pin_extent(root, node->bytenr, 2296 node->num_bytes, 1); 2297 return 0; 2298 } 2299 2300 if (btrfs_delayed_ref_is_head(node)) { 2301 struct btrfs_delayed_ref_head *head; 2302 /* 2303 * we've hit the end of the chain and we were supposed 2304 * to insert this extent into the tree. But, it got 2305 * deleted before we ever needed to insert it, so all 2306 * we have to do is clean up the accounting 2307 */ 2308 BUG_ON(extent_op); 2309 head = btrfs_delayed_node_to_head(node); 2310 trace_run_delayed_ref_head(node, head, node->action); 2311 2312 if (insert_reserved) { 2313 btrfs_pin_extent(root, node->bytenr, 2314 node->num_bytes, 1); 2315 if (head->is_data) { 2316 ret = btrfs_del_csums(trans, root, 2317 node->bytenr, 2318 node->num_bytes); 2319 } 2320 } 2321 return ret; 2322 } 2323 2324 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2325 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2326 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2327 insert_reserved); 2328 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2329 node->type == BTRFS_SHARED_DATA_REF_KEY) 2330 ret = run_delayed_data_ref(trans, root, node, extent_op, 2331 insert_reserved); 2332 else 2333 BUG(); 2334 return ret; 2335 } 2336 2337 static noinline struct btrfs_delayed_ref_node * 2338 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2339 { 2340 struct rb_node *node; 2341 struct btrfs_delayed_ref_node *ref, *last = NULL; 2342 2343 /* 2344 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2345 * this prevents ref count from going down to zero when 2346 * there still are pending delayed refs. 2347 */ 2348 node = rb_first(&head->ref_root); 2349 while (node) { 2350 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2351 rb_node); 2352 if (ref->action == BTRFS_ADD_DELAYED_REF) 2353 return ref; 2354 else if (last == NULL) 2355 last = ref; 2356 node = rb_next(node); 2357 } 2358 return last; 2359 } 2360 2361 /* 2362 * Returns 0 on success or if called with an already aborted transaction. 2363 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
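 *
 * @nr bounds how many delayed refs are processed before this function
 * returns; the caller, btrfs_run_delayed_refs() below, picks either a
 * finite batch or effectively everything currently queued.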
2364 */ 2365 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2366 struct btrfs_root *root, 2367 unsigned long nr) 2368 { 2369 struct btrfs_delayed_ref_root *delayed_refs; 2370 struct btrfs_delayed_ref_node *ref; 2371 struct btrfs_delayed_ref_head *locked_ref = NULL; 2372 struct btrfs_delayed_extent_op *extent_op; 2373 struct btrfs_fs_info *fs_info = root->fs_info; 2374 ktime_t start = ktime_get(); 2375 int ret; 2376 unsigned long count = 0; 2377 unsigned long actual_count = 0; 2378 int must_insert_reserved = 0; 2379 2380 delayed_refs = &trans->transaction->delayed_refs; 2381 while (1) { 2382 if (!locked_ref) { 2383 if (count >= nr) 2384 break; 2385 2386 spin_lock(&delayed_refs->lock); 2387 locked_ref = btrfs_select_ref_head(trans); 2388 if (!locked_ref) { 2389 spin_unlock(&delayed_refs->lock); 2390 break; 2391 } 2392 2393 /* grab the lock that says we are going to process 2394 * all the refs for this head */ 2395 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2396 spin_unlock(&delayed_refs->lock); 2397 /* 2398 * we may have dropped the spin lock to get the head 2399 * mutex lock, and that might have given someone else 2400 * time to free the head. If that's true, it has been 2401 * removed from our list and we can move on. 2402 */ 2403 if (ret == -EAGAIN) { 2404 locked_ref = NULL; 2405 count++; 2406 continue; 2407 } 2408 } 2409 2410 /* 2411 * We need to try and merge add/drops of the same ref since we 2412 * can run into issues with relocate dropping the implicit ref 2413 * and then it being added back again before the drop can 2414 * finish. If we merged anything we need to re-loop so we can 2415 * get a good ref. 2416 */ 2417 spin_lock(&locked_ref->lock); 2418 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2419 locked_ref); 2420 2421 /* 2422 * locked_ref is the head node, so we have to go one 2423 * node back for any delayed ref updates 2424 */ 2425 ref = select_delayed_ref(locked_ref); 2426 2427 if (ref && ref->seq && 2428 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2429 spin_unlock(&locked_ref->lock); 2430 btrfs_delayed_ref_unlock(locked_ref); 2431 spin_lock(&delayed_refs->lock); 2432 locked_ref->processing = 0; 2433 delayed_refs->num_heads_ready++; 2434 spin_unlock(&delayed_refs->lock); 2435 locked_ref = NULL; 2436 cond_resched(); 2437 count++; 2438 continue; 2439 } 2440 2441 /* 2442 * record the must insert reserved flag before we 2443 * drop the spin lock. 2444 */ 2445 must_insert_reserved = locked_ref->must_insert_reserved; 2446 locked_ref->must_insert_reserved = 0; 2447 2448 extent_op = locked_ref->extent_op; 2449 locked_ref->extent_op = NULL; 2450 2451 if (!ref) { 2452 2453 2454 /* All delayed refs have been processed, Go ahead 2455 * and send the head node to run_one_delayed_ref, 2456 * so that any accounting fixes can happen 2457 */ 2458 ref = &locked_ref->node; 2459 2460 if (extent_op && must_insert_reserved) { 2461 btrfs_free_delayed_extent_op(extent_op); 2462 extent_op = NULL; 2463 } 2464 2465 if (extent_op) { 2466 spin_unlock(&locked_ref->lock); 2467 ret = run_delayed_extent_op(trans, root, 2468 ref, extent_op); 2469 btrfs_free_delayed_extent_op(extent_op); 2470 2471 if (ret) { 2472 /* 2473 * Need to reset must_insert_reserved if 2474 * there was an error so the abort stuff 2475 * can cleanup the reserved space 2476 * properly. 
2477 */ 2478 if (must_insert_reserved) 2479 locked_ref->must_insert_reserved = 1; 2480 locked_ref->processing = 0; 2481 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2482 btrfs_delayed_ref_unlock(locked_ref); 2483 return ret; 2484 } 2485 continue; 2486 } 2487 2488 /* 2489 * Need to drop our head ref lock and re-aqcuire the 2490 * delayed ref lock and then re-check to make sure 2491 * nobody got added. 2492 */ 2493 spin_unlock(&locked_ref->lock); 2494 spin_lock(&delayed_refs->lock); 2495 spin_lock(&locked_ref->lock); 2496 if (rb_first(&locked_ref->ref_root) || 2497 locked_ref->extent_op) { 2498 spin_unlock(&locked_ref->lock); 2499 spin_unlock(&delayed_refs->lock); 2500 continue; 2501 } 2502 ref->in_tree = 0; 2503 delayed_refs->num_heads--; 2504 rb_erase(&locked_ref->href_node, 2505 &delayed_refs->href_root); 2506 spin_unlock(&delayed_refs->lock); 2507 } else { 2508 actual_count++; 2509 ref->in_tree = 0; 2510 rb_erase(&ref->rb_node, &locked_ref->ref_root); 2511 } 2512 atomic_dec(&delayed_refs->num_entries); 2513 2514 if (!btrfs_delayed_ref_is_head(ref)) { 2515 /* 2516 * when we play the delayed ref, also correct the 2517 * ref_mod on head 2518 */ 2519 switch (ref->action) { 2520 case BTRFS_ADD_DELAYED_REF: 2521 case BTRFS_ADD_DELAYED_EXTENT: 2522 locked_ref->node.ref_mod -= ref->ref_mod; 2523 break; 2524 case BTRFS_DROP_DELAYED_REF: 2525 locked_ref->node.ref_mod += ref->ref_mod; 2526 break; 2527 default: 2528 WARN_ON(1); 2529 } 2530 } 2531 spin_unlock(&locked_ref->lock); 2532 2533 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2534 must_insert_reserved); 2535 2536 btrfs_free_delayed_extent_op(extent_op); 2537 if (ret) { 2538 locked_ref->processing = 0; 2539 btrfs_delayed_ref_unlock(locked_ref); 2540 btrfs_put_delayed_ref(ref); 2541 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2542 return ret; 2543 } 2544 2545 /* 2546 * If this node is a head, that means all the refs in this head 2547 * have been dealt with, and we will pick the next head to deal 2548 * with, so we must unlock the head and drop it from the cluster 2549 * list before we release it. 2550 */ 2551 if (btrfs_delayed_ref_is_head(ref)) { 2552 btrfs_delayed_ref_unlock(locked_ref); 2553 locked_ref = NULL; 2554 } 2555 btrfs_put_delayed_ref(ref); 2556 count++; 2557 cond_resched(); 2558 } 2559 2560 /* 2561 * We don't want to include ref heads since we can have empty ref heads 2562 * and those will drastically skew our runtime down since we just do 2563 * accounting, no actual extent tree updates. 2564 */ 2565 if (actual_count > 0) { 2566 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2567 u64 avg; 2568 2569 /* 2570 * We weigh the current average higher than our current runtime 2571 * to avoid large swings in the average. 2572 */ 2573 spin_lock(&delayed_refs->lock); 2574 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2575 avg = div64_u64(avg, 4); 2576 fs_info->avg_delayed_ref_runtime = avg; 2577 spin_unlock(&delayed_refs->lock); 2578 } 2579 return 0; 2580 } 2581 2582 #ifdef SCRAMBLE_DELAYED_REFS 2583 /* 2584 * Normally delayed refs get processed in ascending bytenr order. This 2585 * correlates in most cases to the order added. 
To expose dependencies on this 2586 * order, we start to process the tree in the middle instead of the beginning 2587 */ 2588 static u64 find_middle(struct rb_root *root) 2589 { 2590 struct rb_node *n = root->rb_node; 2591 struct btrfs_delayed_ref_node *entry; 2592 int alt = 1; 2593 u64 middle; 2594 u64 first = 0, last = 0; 2595 2596 n = rb_first(root); 2597 if (n) { 2598 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2599 first = entry->bytenr; 2600 } 2601 n = rb_last(root); 2602 if (n) { 2603 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2604 last = entry->bytenr; 2605 } 2606 n = root->rb_node; 2607 2608 while (n) { 2609 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2610 WARN_ON(!entry->in_tree); 2611 2612 middle = entry->bytenr; 2613 2614 if (alt) 2615 n = n->rb_left; 2616 else 2617 n = n->rb_right; 2618 2619 alt = 1 - alt; 2620 } 2621 return middle; 2622 } 2623 #endif 2624 2625 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2626 { 2627 u64 num_bytes; 2628 2629 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2630 sizeof(struct btrfs_extent_inline_ref)); 2631 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2632 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2633 2634 /* 2635 * We don't ever fill up leaves all the way so multiply by 2 just to be 2636 * closer to what we're really going to want to use. 2637 */ 2638 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2639 } 2640 2641 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2642 struct btrfs_root *root) 2643 { 2644 struct btrfs_block_rsv *global_rsv; 2645 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2646 u64 num_bytes; 2647 int ret = 0; 2648 2649 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2650 num_heads = heads_to_leaves(root, num_heads); 2651 if (num_heads > 1) 2652 num_bytes += (num_heads - 1) * root->leafsize; 2653 num_bytes <<= 1; 2654 global_rsv = &root->fs_info->global_block_rsv; 2655 2656 /* 2657 * If we can't allocate any more chunks let's make sure we have _lots_ of 2658 * wiggle room since running delayed refs can create more delayed refs.
2659 */ 2660 if (global_rsv->space_info->full) 2661 num_bytes <<= 1; 2662 2663 spin_lock(&global_rsv->lock); 2664 if (global_rsv->reserved <= num_bytes) 2665 ret = 1; 2666 spin_unlock(&global_rsv->lock); 2667 return ret; 2668 } 2669 2670 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2671 struct btrfs_root *root) 2672 { 2673 struct btrfs_fs_info *fs_info = root->fs_info; 2674 u64 num_entries = 2675 atomic_read(&trans->transaction->delayed_refs.num_entries); 2676 u64 avg_runtime; 2677 u64 val; 2678 2679 smp_mb(); 2680 avg_runtime = fs_info->avg_delayed_ref_runtime; 2681 val = num_entries * avg_runtime; 2682 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2683 return 1; 2684 if (val >= NSEC_PER_SEC / 2) 2685 return 2; 2686 2687 return btrfs_check_space_for_delayed_refs(trans, root); 2688 } 2689 2690 struct async_delayed_refs { 2691 struct btrfs_root *root; 2692 int count; 2693 int error; 2694 int sync; 2695 struct completion wait; 2696 struct btrfs_work work; 2697 }; 2698 2699 static void delayed_ref_async_start(struct btrfs_work *work) 2700 { 2701 struct async_delayed_refs *async; 2702 struct btrfs_trans_handle *trans; 2703 int ret; 2704 2705 async = container_of(work, struct async_delayed_refs, work); 2706 2707 trans = btrfs_join_transaction(async->root); 2708 if (IS_ERR(trans)) { 2709 async->error = PTR_ERR(trans); 2710 goto done; 2711 } 2712 2713 /* 2714 * trans->sync means that when we call end_transaciton, we won't 2715 * wait on delayed refs 2716 */ 2717 trans->sync = true; 2718 ret = btrfs_run_delayed_refs(trans, async->root, async->count); 2719 if (ret) 2720 async->error = ret; 2721 2722 ret = btrfs_end_transaction(trans, async->root); 2723 if (ret && !async->error) 2724 async->error = ret; 2725 done: 2726 if (async->sync) 2727 complete(&async->wait); 2728 else 2729 kfree(async); 2730 } 2731 2732 int btrfs_async_run_delayed_refs(struct btrfs_root *root, 2733 unsigned long count, int wait) 2734 { 2735 struct async_delayed_refs *async; 2736 int ret; 2737 2738 async = kmalloc(sizeof(*async), GFP_NOFS); 2739 if (!async) 2740 return -ENOMEM; 2741 2742 async->root = root->fs_info->tree_root; 2743 async->count = count; 2744 async->error = 0; 2745 if (wait) 2746 async->sync = 1; 2747 else 2748 async->sync = 0; 2749 init_completion(&async->wait); 2750 2751 btrfs_init_work(&async->work, delayed_ref_async_start, 2752 NULL, NULL); 2753 2754 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2755 2756 if (wait) { 2757 wait_for_completion(&async->wait); 2758 ret = async->error; 2759 kfree(async); 2760 return ret; 2761 } 2762 return 0; 2763 } 2764 2765 /* 2766 * this starts processing the delayed reference count updates and 2767 * extent insertions we have queued up so far. count can be 2768 * 0, which means to process everything in the tree at the start 2769 * of the run (but not newly added entries), or it can be some target 2770 * number you'd like to process. 
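 *
 * For example, the block group writeback code below drains the whole
 * queue at once with:
 *
 *	btrfs_run_delayed_refs(trans, root, (unsigned long)-1);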
2771 * 2772 * Returns 0 on success or if called with an aborted transaction 2773 * Returns <0 on error and aborts the transaction 2774 */ 2775 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2776 struct btrfs_root *root, unsigned long count) 2777 { 2778 struct rb_node *node; 2779 struct btrfs_delayed_ref_root *delayed_refs; 2780 struct btrfs_delayed_ref_head *head; 2781 int ret; 2782 int run_all = count == (unsigned long)-1; 2783 int run_most = 0; 2784 2785 /* We'll clean this up in btrfs_cleanup_transaction */ 2786 if (trans->aborted) 2787 return 0; 2788 2789 if (root == root->fs_info->extent_root) 2790 root = root->fs_info->tree_root; 2791 2792 delayed_refs = &trans->transaction->delayed_refs; 2793 if (count == 0) { 2794 count = atomic_read(&delayed_refs->num_entries) * 2; 2795 run_most = 1; 2796 } 2797 2798 again: 2799 #ifdef SCRAMBLE_DELAYED_REFS 2800 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2801 #endif 2802 ret = __btrfs_run_delayed_refs(trans, root, count); 2803 if (ret < 0) { 2804 btrfs_abort_transaction(trans, root, ret); 2805 return ret; 2806 } 2807 2808 if (run_all) { 2809 if (!list_empty(&trans->new_bgs)) 2810 btrfs_create_pending_block_groups(trans, root); 2811 2812 spin_lock(&delayed_refs->lock); 2813 node = rb_first(&delayed_refs->href_root); 2814 if (!node) { 2815 spin_unlock(&delayed_refs->lock); 2816 goto out; 2817 } 2818 count = (unsigned long)-1; 2819 2820 while (node) { 2821 head = rb_entry(node, struct btrfs_delayed_ref_head, 2822 href_node); 2823 if (btrfs_delayed_ref_is_head(&head->node)) { 2824 struct btrfs_delayed_ref_node *ref; 2825 2826 ref = &head->node; 2827 atomic_inc(&ref->refs); 2828 2829 spin_unlock(&delayed_refs->lock); 2830 /* 2831 * Mutex was contended, block until it's 2832 * released and try again 2833 */ 2834 mutex_lock(&head->mutex); 2835 mutex_unlock(&head->mutex); 2836 2837 btrfs_put_delayed_ref(ref); 2838 cond_resched(); 2839 goto again; 2840 } else { 2841 WARN_ON(1); 2842 } 2843 node = rb_next(node); 2844 } 2845 spin_unlock(&delayed_refs->lock); 2846 cond_resched(); 2847 goto again; 2848 } 2849 out: 2850 ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); 2851 if (ret) 2852 return ret; 2853 assert_qgroups_uptodate(trans); 2854 return 0; 2855 } 2856 2857 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2858 struct btrfs_root *root, 2859 u64 bytenr, u64 num_bytes, u64 flags, 2860 int level, int is_data) 2861 { 2862 struct btrfs_delayed_extent_op *extent_op; 2863 int ret; 2864 2865 extent_op = btrfs_alloc_delayed_extent_op(); 2866 if (!extent_op) 2867 return -ENOMEM; 2868 2869 extent_op->flags_to_set = flags; 2870 extent_op->update_flags = 1; 2871 extent_op->update_key = 0; 2872 extent_op->is_data = is_data ? 
1 : 0; 2873 extent_op->level = level; 2874 2875 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2876 num_bytes, extent_op); 2877 if (ret) 2878 btrfs_free_delayed_extent_op(extent_op); 2879 return ret; 2880 } 2881 2882 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2883 struct btrfs_root *root, 2884 struct btrfs_path *path, 2885 u64 objectid, u64 offset, u64 bytenr) 2886 { 2887 struct btrfs_delayed_ref_head *head; 2888 struct btrfs_delayed_ref_node *ref; 2889 struct btrfs_delayed_data_ref *data_ref; 2890 struct btrfs_delayed_ref_root *delayed_refs; 2891 struct rb_node *node; 2892 int ret = 0; 2893 2894 delayed_refs = &trans->transaction->delayed_refs; 2895 spin_lock(&delayed_refs->lock); 2896 head = btrfs_find_delayed_ref_head(trans, bytenr); 2897 if (!head) { 2898 spin_unlock(&delayed_refs->lock); 2899 return 0; 2900 } 2901 2902 if (!mutex_trylock(&head->mutex)) { 2903 atomic_inc(&head->node.refs); 2904 spin_unlock(&delayed_refs->lock); 2905 2906 btrfs_release_path(path); 2907 2908 /* 2909 * Mutex was contended, block until it's released and let 2910 * caller try again 2911 */ 2912 mutex_lock(&head->mutex); 2913 mutex_unlock(&head->mutex); 2914 btrfs_put_delayed_ref(&head->node); 2915 return -EAGAIN; 2916 } 2917 spin_unlock(&delayed_refs->lock); 2918 2919 spin_lock(&head->lock); 2920 node = rb_first(&head->ref_root); 2921 while (node) { 2922 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2923 node = rb_next(node); 2924 2925 /* If it's a shared ref we know a cross reference exists */ 2926 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2927 ret = 1; 2928 break; 2929 } 2930 2931 data_ref = btrfs_delayed_node_to_data_ref(ref); 2932 2933 /* 2934 * If our ref doesn't match the one we're currently looking at 2935 * then we have a cross reference. 
2936 */ 2937 if (data_ref->root != root->root_key.objectid || 2938 data_ref->objectid != objectid || 2939 data_ref->offset != offset) { 2940 ret = 1; 2941 break; 2942 } 2943 } 2944 spin_unlock(&head->lock); 2945 mutex_unlock(&head->mutex); 2946 return ret; 2947 } 2948 2949 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2950 struct btrfs_root *root, 2951 struct btrfs_path *path, 2952 u64 objectid, u64 offset, u64 bytenr) 2953 { 2954 struct btrfs_root *extent_root = root->fs_info->extent_root; 2955 struct extent_buffer *leaf; 2956 struct btrfs_extent_data_ref *ref; 2957 struct btrfs_extent_inline_ref *iref; 2958 struct btrfs_extent_item *ei; 2959 struct btrfs_key key; 2960 u32 item_size; 2961 int ret; 2962 2963 key.objectid = bytenr; 2964 key.offset = (u64)-1; 2965 key.type = BTRFS_EXTENT_ITEM_KEY; 2966 2967 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2968 if (ret < 0) 2969 goto out; 2970 BUG_ON(ret == 0); /* Corruption */ 2971 2972 ret = -ENOENT; 2973 if (path->slots[0] == 0) 2974 goto out; 2975 2976 path->slots[0]--; 2977 leaf = path->nodes[0]; 2978 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2979 2980 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2981 goto out; 2982 2983 ret = 1; 2984 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2985 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2986 if (item_size < sizeof(*ei)) { 2987 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2988 goto out; 2989 } 2990 #endif 2991 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2992 2993 if (item_size != sizeof(*ei) + 2994 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2995 goto out; 2996 2997 if (btrfs_extent_generation(leaf, ei) <= 2998 btrfs_root_last_snapshot(&root->root_item)) 2999 goto out; 3000 3001 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3002 if (btrfs_extent_inline_ref_type(leaf, iref) != 3003 BTRFS_EXTENT_DATA_REF_KEY) 3004 goto out; 3005 3006 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3007 if (btrfs_extent_refs(leaf, ei) != 3008 btrfs_extent_data_ref_count(leaf, ref) || 3009 btrfs_extent_data_ref_root(leaf, ref) != 3010 root->root_key.objectid || 3011 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3012 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3013 goto out; 3014 3015 ret = 0; 3016 out: 3017 return ret; 3018 } 3019 3020 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3021 struct btrfs_root *root, 3022 u64 objectid, u64 offset, u64 bytenr) 3023 { 3024 struct btrfs_path *path; 3025 int ret; 3026 int ret2; 3027 3028 path = btrfs_alloc_path(); 3029 if (!path) 3030 return -ENOENT; 3031 3032 do { 3033 ret = check_committed_ref(trans, root, path, objectid, 3034 offset, bytenr); 3035 if (ret && ret != -ENOENT) 3036 goto out; 3037 3038 ret2 = check_delayed_ref(trans, root, path, objectid, 3039 offset, bytenr); 3040 } while (ret2 == -EAGAIN); 3041 3042 if (ret2 && ret2 != -ENOENT) { 3043 ret = ret2; 3044 goto out; 3045 } 3046 3047 if (ret != -ENOENT || ret2 != -ENOENT) 3048 ret = 0; 3049 out: 3050 btrfs_free_path(path); 3051 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3052 WARN_ON(ret > 0); 3053 return ret; 3054 } 3055 3056 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3057 struct btrfs_root *root, 3058 struct extent_buffer *buf, 3059 int full_backref, int inc, int no_quota) 3060 { 3061 u64 bytenr; 3062 u64 num_bytes; 3063 u64 parent; 3064 u64 ref_root; 3065 u32 nritems; 3066 struct btrfs_key key; 3067 struct 
btrfs_file_extent_item *fi; 3068 int i; 3069 int level; 3070 int ret = 0; 3071 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3072 u64, u64, u64, u64, u64, u64, int); 3073 3074 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3075 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) 3076 return 0; 3077 #endif 3078 ref_root = btrfs_header_owner(buf); 3079 nritems = btrfs_header_nritems(buf); 3080 level = btrfs_header_level(buf); 3081 3082 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3083 return 0; 3084 3085 if (inc) 3086 process_func = btrfs_inc_extent_ref; 3087 else 3088 process_func = btrfs_free_extent; 3089 3090 if (full_backref) 3091 parent = buf->start; 3092 else 3093 parent = 0; 3094 3095 for (i = 0; i < nritems; i++) { 3096 if (level == 0) { 3097 btrfs_item_key_to_cpu(buf, &key, i); 3098 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3099 continue; 3100 fi = btrfs_item_ptr(buf, i, 3101 struct btrfs_file_extent_item); 3102 if (btrfs_file_extent_type(buf, fi) == 3103 BTRFS_FILE_EXTENT_INLINE) 3104 continue; 3105 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3106 if (bytenr == 0) 3107 continue; 3108 3109 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3110 key.offset -= btrfs_file_extent_offset(buf, fi); 3111 ret = process_func(trans, root, bytenr, num_bytes, 3112 parent, ref_root, key.objectid, 3113 key.offset, no_quota); 3114 if (ret) 3115 goto fail; 3116 } else { 3117 bytenr = btrfs_node_blockptr(buf, i); 3118 num_bytes = btrfs_level_size(root, level - 1); 3119 ret = process_func(trans, root, bytenr, num_bytes, 3120 parent, ref_root, level - 1, 0, 3121 no_quota); 3122 if (ret) 3123 goto fail; 3124 } 3125 } 3126 return 0; 3127 fail: 3128 return ret; 3129 } 3130 3131 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3132 struct extent_buffer *buf, int full_backref, int no_quota) 3133 { 3134 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); 3135 } 3136 3137 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3138 struct extent_buffer *buf, int full_backref, int no_quota) 3139 { 3140 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); 3141 } 3142 3143 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3144 struct btrfs_root *root, 3145 struct btrfs_path *path, 3146 struct btrfs_block_group_cache *cache) 3147 { 3148 int ret; 3149 struct btrfs_root *extent_root = root->fs_info->extent_root; 3150 unsigned long bi; 3151 struct extent_buffer *leaf; 3152 3153 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3154 if (ret < 0) 3155 goto fail; 3156 BUG_ON(ret); /* Corruption */ 3157 3158 leaf = path->nodes[0]; 3159 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3160 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3161 btrfs_mark_buffer_dirty(leaf); 3162 btrfs_release_path(path); 3163 fail: 3164 if (ret) { 3165 btrfs_abort_transaction(trans, root, ret); 3166 return ret; 3167 } 3168 return 0; 3169 3170 } 3171 3172 static struct btrfs_block_group_cache * 3173 next_block_group(struct btrfs_root *root, 3174 struct btrfs_block_group_cache *cache) 3175 { 3176 struct rb_node *node; 3177 spin_lock(&root->fs_info->block_group_cache_lock); 3178 node = rb_next(&cache->cache_node); 3179 btrfs_put_block_group(cache); 3180 if (node) { 3181 cache = rb_entry(node, struct btrfs_block_group_cache, 3182 cache_node); 3183 btrfs_get_block_group(cache); 3184 } else 3185 cache = NULL; 3186 
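	/*
	 * The reference on the group we were handed was dropped above;
	 * whatever is returned (or NULL at the end of the tree) carries its
	 * own reference for the caller.
	 */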
spin_unlock(&root->fs_info->block_group_cache_lock); 3187 return cache; 3188 } 3189 3190 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3191 struct btrfs_trans_handle *trans, 3192 struct btrfs_path *path) 3193 { 3194 struct btrfs_root *root = block_group->fs_info->tree_root; 3195 struct inode *inode = NULL; 3196 u64 alloc_hint = 0; 3197 int dcs = BTRFS_DC_ERROR; 3198 int num_pages = 0; 3199 int retries = 0; 3200 int ret = 0; 3201 3202 /* 3203 * If this block group is smaller than 100 megs don't bother caching the 3204 * block group. 3205 */ 3206 if (block_group->key.offset < (100 * 1024 * 1024)) { 3207 spin_lock(&block_group->lock); 3208 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3209 spin_unlock(&block_group->lock); 3210 return 0; 3211 } 3212 3213 again: 3214 inode = lookup_free_space_inode(root, block_group, path); 3215 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3216 ret = PTR_ERR(inode); 3217 btrfs_release_path(path); 3218 goto out; 3219 } 3220 3221 if (IS_ERR(inode)) { 3222 BUG_ON(retries); 3223 retries++; 3224 3225 if (block_group->ro) 3226 goto out_free; 3227 3228 ret = create_free_space_inode(root, trans, block_group, path); 3229 if (ret) 3230 goto out_free; 3231 goto again; 3232 } 3233 3234 /* We've already setup this transaction, go ahead and exit */ 3235 if (block_group->cache_generation == trans->transid && 3236 i_size_read(inode)) { 3237 dcs = BTRFS_DC_SETUP; 3238 goto out_put; 3239 } 3240 3241 /* 3242 * We want to set the generation to 0, that way if anything goes wrong 3243 * from here on out we know not to trust this cache when we load up next 3244 * time. 3245 */ 3246 BTRFS_I(inode)->generation = 0; 3247 ret = btrfs_update_inode(trans, root, inode); 3248 WARN_ON(ret); 3249 3250 if (i_size_read(inode) > 0) { 3251 ret = btrfs_check_trunc_cache_free_space(root, 3252 &root->fs_info->global_block_rsv); 3253 if (ret) 3254 goto out_put; 3255 3256 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3257 if (ret) 3258 goto out_put; 3259 } 3260 3261 spin_lock(&block_group->lock); 3262 if (block_group->cached != BTRFS_CACHE_FINISHED || 3263 !btrfs_test_opt(root, SPACE_CACHE)) { 3264 /* 3265 * don't bother trying to write stuff out _if_ 3266 * a) we're not cached, 3267 * b) we're with nospace_cache mount option. 3268 */ 3269 dcs = BTRFS_DC_WRITTEN; 3270 spin_unlock(&block_group->lock); 3271 goto out_put; 3272 } 3273 spin_unlock(&block_group->lock); 3274 3275 /* 3276 * Try to preallocate enough space based on how big the block group is. 3277 * Keep in mind this has to include any pinned space which could end up 3278 * taking up quite a bit since it's not folded into the other space 3279 * cache. 
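 *
 * Rough numbers for illustration, assuming 4K pages: a 1GiB block group
 * works out to 4 * 16 = 64 pages below, i.e. 256KiB preallocated for the
 * free space cache, and any group smaller than 256MiB still gets the
 * 16-page (64KiB) minimum.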
3280 */ 3281 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3282 if (!num_pages) 3283 num_pages = 1; 3284 3285 num_pages *= 16; 3286 num_pages *= PAGE_CACHE_SIZE; 3287 3288 ret = btrfs_check_data_free_space(inode, num_pages); 3289 if (ret) 3290 goto out_put; 3291 3292 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3293 num_pages, num_pages, 3294 &alloc_hint); 3295 if (!ret) 3296 dcs = BTRFS_DC_SETUP; 3297 btrfs_free_reserved_data_space(inode, num_pages); 3298 3299 out_put: 3300 iput(inode); 3301 out_free: 3302 btrfs_release_path(path); 3303 out: 3304 spin_lock(&block_group->lock); 3305 if (!ret && dcs == BTRFS_DC_SETUP) 3306 block_group->cache_generation = trans->transid; 3307 block_group->disk_cache_state = dcs; 3308 spin_unlock(&block_group->lock); 3309 3310 return ret; 3311 } 3312 3313 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3314 struct btrfs_root *root) 3315 { 3316 struct btrfs_block_group_cache *cache; 3317 int err = 0; 3318 struct btrfs_path *path; 3319 u64 last = 0; 3320 3321 path = btrfs_alloc_path(); 3322 if (!path) 3323 return -ENOMEM; 3324 3325 again: 3326 while (1) { 3327 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3328 while (cache) { 3329 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3330 break; 3331 cache = next_block_group(root, cache); 3332 } 3333 if (!cache) { 3334 if (last == 0) 3335 break; 3336 last = 0; 3337 continue; 3338 } 3339 err = cache_save_setup(cache, trans, path); 3340 last = cache->key.objectid + cache->key.offset; 3341 btrfs_put_block_group(cache); 3342 } 3343 3344 while (1) { 3345 if (last == 0) { 3346 err = btrfs_run_delayed_refs(trans, root, 3347 (unsigned long)-1); 3348 if (err) /* File system offline */ 3349 goto out; 3350 } 3351 3352 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3353 while (cache) { 3354 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 3355 btrfs_put_block_group(cache); 3356 goto again; 3357 } 3358 3359 if (cache->dirty) 3360 break; 3361 cache = next_block_group(root, cache); 3362 } 3363 if (!cache) { 3364 if (last == 0) 3365 break; 3366 last = 0; 3367 continue; 3368 } 3369 3370 if (cache->disk_cache_state == BTRFS_DC_SETUP) 3371 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 3372 cache->dirty = 0; 3373 last = cache->key.objectid + cache->key.offset; 3374 3375 err = write_one_cache_group(trans, root, path, cache); 3376 btrfs_put_block_group(cache); 3377 if (err) /* File system offline */ 3378 goto out; 3379 } 3380 3381 while (1) { 3382 /* 3383 * I don't think this is needed since we're just marking our 3384 * preallocated extent as written, but just in case it can't 3385 * hurt. 3386 */ 3387 if (last == 0) { 3388 err = btrfs_run_delayed_refs(trans, root, 3389 (unsigned long)-1); 3390 if (err) /* File system offline */ 3391 goto out; 3392 } 3393 3394 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3395 while (cache) { 3396 /* 3397 * Really this shouldn't happen, but it could if we 3398 * couldn't write the entire preallocated extent and 3399 * splitting the extent resulted in a new block. 
3400 */ 3401 if (cache->dirty) { 3402 btrfs_put_block_group(cache); 3403 goto again; 3404 } 3405 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3406 break; 3407 cache = next_block_group(root, cache); 3408 } 3409 if (!cache) { 3410 if (last == 0) 3411 break; 3412 last = 0; 3413 continue; 3414 } 3415 3416 err = btrfs_write_out_cache(root, trans, cache, path); 3417 3418 /* 3419 * If we didn't have an error then the cache state is still 3420 * NEED_WRITE, so we can set it to WRITTEN. 3421 */ 3422 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3423 cache->disk_cache_state = BTRFS_DC_WRITTEN; 3424 last = cache->key.objectid + cache->key.offset; 3425 btrfs_put_block_group(cache); 3426 } 3427 out: 3428 3429 btrfs_free_path(path); 3430 return err; 3431 } 3432 3433 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3434 { 3435 struct btrfs_block_group_cache *block_group; 3436 int readonly = 0; 3437 3438 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3439 if (!block_group || block_group->ro) 3440 readonly = 1; 3441 if (block_group) 3442 btrfs_put_block_group(block_group); 3443 return readonly; 3444 } 3445 3446 static const char *alloc_name(u64 flags) 3447 { 3448 switch (flags) { 3449 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3450 return "mixed"; 3451 case BTRFS_BLOCK_GROUP_METADATA: 3452 return "metadata"; 3453 case BTRFS_BLOCK_GROUP_DATA: 3454 return "data"; 3455 case BTRFS_BLOCK_GROUP_SYSTEM: 3456 return "system"; 3457 default: 3458 WARN_ON(1); 3459 return "invalid-combination"; 3460 }; 3461 } 3462 3463 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3464 u64 total_bytes, u64 bytes_used, 3465 struct btrfs_space_info **space_info) 3466 { 3467 struct btrfs_space_info *found; 3468 int i; 3469 int factor; 3470 int ret; 3471 3472 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3473 BTRFS_BLOCK_GROUP_RAID10)) 3474 factor = 2; 3475 else 3476 factor = 1; 3477 3478 found = __find_space_info(info, flags); 3479 if (found) { 3480 spin_lock(&found->lock); 3481 found->total_bytes += total_bytes; 3482 found->disk_total += total_bytes * factor; 3483 found->bytes_used += bytes_used; 3484 found->disk_used += bytes_used * factor; 3485 found->full = 0; 3486 spin_unlock(&found->lock); 3487 *space_info = found; 3488 return 0; 3489 } 3490 found = kzalloc(sizeof(*found), GFP_NOFS); 3491 if (!found) 3492 return -ENOMEM; 3493 3494 ret = percpu_counter_init(&found->total_bytes_pinned, 0); 3495 if (ret) { 3496 kfree(found); 3497 return ret; 3498 } 3499 3500 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3501 INIT_LIST_HEAD(&found->block_groups[i]); 3502 init_rwsem(&found->groups_sem); 3503 spin_lock_init(&found->lock); 3504 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3505 found->total_bytes = total_bytes; 3506 found->disk_total = total_bytes * factor; 3507 found->bytes_used = bytes_used; 3508 found->disk_used = bytes_used * factor; 3509 found->bytes_pinned = 0; 3510 found->bytes_reserved = 0; 3511 found->bytes_readonly = 0; 3512 found->bytes_may_use = 0; 3513 found->full = 0; 3514 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3515 found->chunk_alloc = 0; 3516 found->flush = 0; 3517 init_waitqueue_head(&found->wait); 3518 3519 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3520 info->space_info_kobj, "%s", 3521 alloc_name(found->flags)); 3522 if (ret) { 3523 kfree(found); 3524 return ret; 3525 } 3526 3527 *space_info = found; 3528 list_add_rcu(&found->list, &info->space_info); 3529 if (flags & BTRFS_BLOCK_GROUP_DATA) 
3530 info->data_sinfo = found; 3531 3532 return ret; 3533 } 3534 3535 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3536 { 3537 u64 extra_flags = chunk_to_extended(flags) & 3538 BTRFS_EXTENDED_PROFILE_MASK; 3539 3540 write_seqlock(&fs_info->profiles_lock); 3541 if (flags & BTRFS_BLOCK_GROUP_DATA) 3542 fs_info->avail_data_alloc_bits |= extra_flags; 3543 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3544 fs_info->avail_metadata_alloc_bits |= extra_flags; 3545 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3546 fs_info->avail_system_alloc_bits |= extra_flags; 3547 write_sequnlock(&fs_info->profiles_lock); 3548 } 3549 3550 /* 3551 * returns target flags in extended format or 0 if restripe for this 3552 * chunk_type is not in progress 3553 * 3554 * should be called with either volume_mutex or balance_lock held 3555 */ 3556 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3557 { 3558 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3559 u64 target = 0; 3560 3561 if (!bctl) 3562 return 0; 3563 3564 if (flags & BTRFS_BLOCK_GROUP_DATA && 3565 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3566 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3567 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3568 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3569 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3570 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3571 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3572 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3573 } 3574 3575 return target; 3576 } 3577 3578 /* 3579 * @flags: available profiles in extended format (see ctree.h) 3580 * 3581 * Returns reduced profile in chunk format. If profile changing is in 3582 * progress (either running or paused) picks the target profile (if it's 3583 * already available), otherwise falls back to plain reducing. 3584 */ 3585 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3586 { 3587 /* 3588 * we add in the count of missing devices because we want 3589 * to make sure that any RAID levels on a degraded FS 3590 * continue to be honored. 
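 *
 * e.g. a two-device raid1 filesystem mounted degraded with one device
 * missing still counts two devices here, so the raid1 bit survives the
 * num_devices checks below.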
3591 */ 3592 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3593 root->fs_info->fs_devices->missing_devices; 3594 u64 target; 3595 u64 tmp; 3596 3597 /* 3598 * see if restripe for this chunk_type is in progress, if so 3599 * try to reduce to the target profile 3600 */ 3601 spin_lock(&root->fs_info->balance_lock); 3602 target = get_restripe_target(root->fs_info, flags); 3603 if (target) { 3604 /* pick target profile only if it's already available */ 3605 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3606 spin_unlock(&root->fs_info->balance_lock); 3607 return extended_to_chunk(target); 3608 } 3609 } 3610 spin_unlock(&root->fs_info->balance_lock); 3611 3612 /* First, mask out the RAID levels which aren't possible */ 3613 if (num_devices == 1) 3614 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3615 BTRFS_BLOCK_GROUP_RAID5); 3616 if (num_devices < 3) 3617 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3618 if (num_devices < 4) 3619 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3620 3621 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3622 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3623 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3624 flags &= ~tmp; 3625 3626 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3627 tmp = BTRFS_BLOCK_GROUP_RAID6; 3628 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3629 tmp = BTRFS_BLOCK_GROUP_RAID5; 3630 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3631 tmp = BTRFS_BLOCK_GROUP_RAID10; 3632 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3633 tmp = BTRFS_BLOCK_GROUP_RAID1; 3634 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3635 tmp = BTRFS_BLOCK_GROUP_RAID0; 3636 3637 return extended_to_chunk(flags | tmp); 3638 } 3639 3640 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 3641 { 3642 unsigned seq; 3643 u64 flags; 3644 3645 do { 3646 flags = orig_flags; 3647 seq = read_seqbegin(&root->fs_info->profiles_lock); 3648 3649 if (flags & BTRFS_BLOCK_GROUP_DATA) 3650 flags |= root->fs_info->avail_data_alloc_bits; 3651 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3652 flags |= root->fs_info->avail_system_alloc_bits; 3653 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3654 flags |= root->fs_info->avail_metadata_alloc_bits; 3655 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3656 3657 return btrfs_reduce_alloc_profile(root, flags); 3658 } 3659 3660 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3661 { 3662 u64 flags; 3663 u64 ret; 3664 3665 if (data) 3666 flags = BTRFS_BLOCK_GROUP_DATA; 3667 else if (root == root->fs_info->chunk_root) 3668 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3669 else 3670 flags = BTRFS_BLOCK_GROUP_METADATA; 3671 3672 ret = get_alloc_profile(root, flags); 3673 return ret; 3674 } 3675 3676 /* 3677 * This will check the space that the inode allocates from to make sure we have 3678 * enough space for bytes. 
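 *
 * The reservation is dropped again with btrfs_free_reserved_data_space();
 * see cache_save_setup() above for a caller that wraps a preallocation in
 * this pair.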
3679 */ 3680 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3681 { 3682 struct btrfs_space_info *data_sinfo; 3683 struct btrfs_root *root = BTRFS_I(inode)->root; 3684 struct btrfs_fs_info *fs_info = root->fs_info; 3685 u64 used; 3686 int ret = 0, committed = 0, alloc_chunk = 1; 3687 3688 /* make sure bytes are sectorsize aligned */ 3689 bytes = ALIGN(bytes, root->sectorsize); 3690 3691 if (btrfs_is_free_space_inode(inode)) { 3692 committed = 1; 3693 ASSERT(current->journal_info); 3694 } 3695 3696 data_sinfo = fs_info->data_sinfo; 3697 if (!data_sinfo) 3698 goto alloc; 3699 3700 again: 3701 /* make sure we have enough space to handle the data first */ 3702 spin_lock(&data_sinfo->lock); 3703 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3704 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3705 data_sinfo->bytes_may_use; 3706 3707 if (used + bytes > data_sinfo->total_bytes) { 3708 struct btrfs_trans_handle *trans; 3709 3710 /* 3711 * if we don't have enough free bytes in this space then we need 3712 * to alloc a new chunk. 3713 */ 3714 if (!data_sinfo->full && alloc_chunk) { 3715 u64 alloc_target; 3716 3717 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3718 spin_unlock(&data_sinfo->lock); 3719 alloc: 3720 alloc_target = btrfs_get_alloc_profile(root, 1); 3721 /* 3722 * It is ugly that we don't call nolock join 3723 * transaction for the free space inode case here. 3724 * But it is safe because we only do the data space 3725 * reservation for the free space cache in the 3726 * transaction context, the common join transaction 3727 * just increase the counter of the current transaction 3728 * handler, doesn't try to acquire the trans_lock of 3729 * the fs. 3730 */ 3731 trans = btrfs_join_transaction(root); 3732 if (IS_ERR(trans)) 3733 return PTR_ERR(trans); 3734 3735 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3736 alloc_target, 3737 CHUNK_ALLOC_NO_FORCE); 3738 btrfs_end_transaction(trans, root); 3739 if (ret < 0) { 3740 if (ret != -ENOSPC) 3741 return ret; 3742 else 3743 goto commit_trans; 3744 } 3745 3746 if (!data_sinfo) 3747 data_sinfo = fs_info->data_sinfo; 3748 3749 goto again; 3750 } 3751 3752 /* 3753 * If we don't have enough pinned space to deal with this 3754 * allocation don't bother committing the transaction. 3755 */ 3756 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3757 bytes) < 0) 3758 committed = 1; 3759 spin_unlock(&data_sinfo->lock); 3760 3761 /* commit the current transaction and try again */ 3762 commit_trans: 3763 if (!committed && 3764 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3765 committed = 1; 3766 3767 trans = btrfs_join_transaction(root); 3768 if (IS_ERR(trans)) 3769 return PTR_ERR(trans); 3770 ret = btrfs_commit_transaction(trans, root); 3771 if (ret) 3772 return ret; 3773 goto again; 3774 } 3775 3776 trace_btrfs_space_reservation(root->fs_info, 3777 "space_info:enospc", 3778 data_sinfo->flags, bytes, 1); 3779 return -ENOSPC; 3780 } 3781 data_sinfo->bytes_may_use += bytes; 3782 trace_btrfs_space_reservation(root->fs_info, "space_info", 3783 data_sinfo->flags, bytes, 1); 3784 spin_unlock(&data_sinfo->lock); 3785 3786 return 0; 3787 } 3788 3789 /* 3790 * Called if we need to clear a data reservation for this inode. 
3791 */ 3792 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3793 { 3794 struct btrfs_root *root = BTRFS_I(inode)->root; 3795 struct btrfs_space_info *data_sinfo; 3796 3797 /* make sure bytes are sectorsize aligned */ 3798 bytes = ALIGN(bytes, root->sectorsize); 3799 3800 data_sinfo = root->fs_info->data_sinfo; 3801 spin_lock(&data_sinfo->lock); 3802 WARN_ON(data_sinfo->bytes_may_use < bytes); 3803 data_sinfo->bytes_may_use -= bytes; 3804 trace_btrfs_space_reservation(root->fs_info, "space_info", 3805 data_sinfo->flags, bytes, 0); 3806 spin_unlock(&data_sinfo->lock); 3807 } 3808 3809 static void force_metadata_allocation(struct btrfs_fs_info *info) 3810 { 3811 struct list_head *head = &info->space_info; 3812 struct btrfs_space_info *found; 3813 3814 rcu_read_lock(); 3815 list_for_each_entry_rcu(found, head, list) { 3816 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3817 found->force_alloc = CHUNK_ALLOC_FORCE; 3818 } 3819 rcu_read_unlock(); 3820 } 3821 3822 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3823 { 3824 return (global->size << 1); 3825 } 3826 3827 static int should_alloc_chunk(struct btrfs_root *root, 3828 struct btrfs_space_info *sinfo, int force) 3829 { 3830 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3831 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3832 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3833 u64 thresh; 3834 3835 if (force == CHUNK_ALLOC_FORCE) 3836 return 1; 3837 3838 /* 3839 * We need to take into account the global rsv because for all intents 3840 * and purposes it's used space. Don't worry about locking the 3841 * global_rsv, it doesn't change except when the transaction commits. 3842 */ 3843 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3844 num_allocated += calc_global_rsv_need_space(global_rsv); 3845 3846 /* 3847 * in limited mode, we want to have some free space up to 3848 * about 1% of the FS size. 
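 *
 * As a rough example, on a 1TiB filesystem the threshold below comes out
 * to about 10GiB, and it never drops under 64MiB on small filesystems.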
3849 */ 3850 if (force == CHUNK_ALLOC_LIMITED) { 3851 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3852 thresh = max_t(u64, 64 * 1024 * 1024, 3853 div_factor_fine(thresh, 1)); 3854 3855 if (num_bytes - num_allocated < thresh) 3856 return 1; 3857 } 3858 3859 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3860 return 0; 3861 return 1; 3862 } 3863 3864 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3865 { 3866 u64 num_dev; 3867 3868 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3869 BTRFS_BLOCK_GROUP_RAID0 | 3870 BTRFS_BLOCK_GROUP_RAID5 | 3871 BTRFS_BLOCK_GROUP_RAID6)) 3872 num_dev = root->fs_info->fs_devices->rw_devices; 3873 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3874 num_dev = 2; 3875 else 3876 num_dev = 1; /* DUP or single */ 3877 3878 /* metadata for updaing devices and chunk tree */ 3879 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3880 } 3881 3882 static void check_system_chunk(struct btrfs_trans_handle *trans, 3883 struct btrfs_root *root, u64 type) 3884 { 3885 struct btrfs_space_info *info; 3886 u64 left; 3887 u64 thresh; 3888 3889 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3890 spin_lock(&info->lock); 3891 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3892 info->bytes_reserved - info->bytes_readonly; 3893 spin_unlock(&info->lock); 3894 3895 thresh = get_system_chunk_thresh(root, type); 3896 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3897 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3898 left, thresh, type); 3899 dump_space_info(info, 0, 0); 3900 } 3901 3902 if (left < thresh) { 3903 u64 flags; 3904 3905 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3906 btrfs_alloc_chunk(trans, root, flags); 3907 } 3908 } 3909 3910 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3911 struct btrfs_root *extent_root, u64 flags, int force) 3912 { 3913 struct btrfs_space_info *space_info; 3914 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3915 int wait_for_alloc = 0; 3916 int ret = 0; 3917 3918 /* Don't re-enter if we're already allocating a chunk */ 3919 if (trans->allocating_chunk) 3920 return -ENOSPC; 3921 3922 space_info = __find_space_info(extent_root->fs_info, flags); 3923 if (!space_info) { 3924 ret = update_space_info(extent_root->fs_info, flags, 3925 0, 0, &space_info); 3926 BUG_ON(ret); /* -ENOMEM */ 3927 } 3928 BUG_ON(!space_info); /* Logic error */ 3929 3930 again: 3931 spin_lock(&space_info->lock); 3932 if (force < space_info->force_alloc) 3933 force = space_info->force_alloc; 3934 if (space_info->full) { 3935 if (should_alloc_chunk(extent_root, space_info, force)) 3936 ret = -ENOSPC; 3937 else 3938 ret = 0; 3939 spin_unlock(&space_info->lock); 3940 return ret; 3941 } 3942 3943 if (!should_alloc_chunk(extent_root, space_info, force)) { 3944 spin_unlock(&space_info->lock); 3945 return 0; 3946 } else if (space_info->chunk_alloc) { 3947 wait_for_alloc = 1; 3948 } else { 3949 space_info->chunk_alloc = 1; 3950 } 3951 3952 spin_unlock(&space_info->lock); 3953 3954 mutex_lock(&fs_info->chunk_mutex); 3955 3956 /* 3957 * The chunk_mutex is held throughout the entirety of a chunk 3958 * allocation, so once we've acquired the chunk_mutex we know that the 3959 * other guy is done and we need to recheck and see if we should 3960 * allocate. 
3961 */ 3962 if (wait_for_alloc) { 3963 mutex_unlock(&fs_info->chunk_mutex); 3964 wait_for_alloc = 0; 3965 goto again; 3966 } 3967 3968 trans->allocating_chunk = true; 3969 3970 /* 3971 * If we have mixed data/metadata chunks we want to make sure we keep 3972 * allocating mixed chunks instead of individual chunks. 3973 */ 3974 if (btrfs_mixed_space_info(space_info)) 3975 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3976 3977 /* 3978 * if we're doing a data chunk, go ahead and make sure that 3979 * we keep a reasonable number of metadata chunks allocated in the 3980 * FS as well. 3981 */ 3982 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3983 fs_info->data_chunk_allocations++; 3984 if (!(fs_info->data_chunk_allocations % 3985 fs_info->metadata_ratio)) 3986 force_metadata_allocation(fs_info); 3987 } 3988 3989 /* 3990 * Check if we have enough space in SYSTEM chunk because we may need 3991 * to update devices. 3992 */ 3993 check_system_chunk(trans, extent_root, flags); 3994 3995 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3996 trans->allocating_chunk = false; 3997 3998 spin_lock(&space_info->lock); 3999 if (ret < 0 && ret != -ENOSPC) 4000 goto out; 4001 if (ret) 4002 space_info->full = 1; 4003 else 4004 ret = 1; 4005 4006 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4007 out: 4008 space_info->chunk_alloc = 0; 4009 spin_unlock(&space_info->lock); 4010 mutex_unlock(&fs_info->chunk_mutex); 4011 return ret; 4012 } 4013 4014 static int can_overcommit(struct btrfs_root *root, 4015 struct btrfs_space_info *space_info, u64 bytes, 4016 enum btrfs_reserve_flush_enum flush) 4017 { 4018 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4019 u64 profile = btrfs_get_alloc_profile(root, 0); 4020 u64 space_size; 4021 u64 avail; 4022 u64 used; 4023 4024 used = space_info->bytes_used + space_info->bytes_reserved + 4025 space_info->bytes_pinned + space_info->bytes_readonly; 4026 4027 /* 4028 * We only want to allow over committing if we have lots of actual space 4029 * free, but if we don't have enough space to handle the global reserve 4030 * space then we could end up having a real enospc problem when trying 4031 * to allocate a chunk or some other such important allocation. 4032 */ 4033 spin_lock(&global_rsv->lock); 4034 space_size = calc_global_rsv_need_space(global_rsv); 4035 spin_unlock(&global_rsv->lock); 4036 if (used + space_size >= space_info->total_bytes) 4037 return 0; 4038 4039 used += space_info->bytes_may_use; 4040 4041 spin_lock(&root->fs_info->free_chunk_lock); 4042 avail = root->fs_info->free_chunk_space; 4043 spin_unlock(&root->fs_info->free_chunk_lock); 4044 4045 /* 4046 * If we have dup, raid1 or raid10 then only half of the free 4047 * space is actually useable. For raid56, the space info used 4048 * doesn't include the parity drive, so we don't have to 4049 * change the math 4050 */ 4051 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4052 BTRFS_BLOCK_GROUP_RAID1 | 4053 BTRFS_BLOCK_GROUP_RAID10)) 4054 avail >>= 1; 4055 4056 /* 4057 * If we aren't flushing all things, let us overcommit up to 4058 * 1/2th of the space. If we can flush, don't let us overcommit 4059 * too much, let it overcommit up to 1/8 of the space. 
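 * (this corresponds to the avail >>= 1 vs. avail >>= 3 adjustment below)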
4060 */ 4061 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4062 avail >>= 3; 4063 else 4064 avail >>= 1; 4065 4066 if (used + bytes < space_info->total_bytes + avail) 4067 return 1; 4068 return 0; 4069 } 4070 4071 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4072 unsigned long nr_pages, int nr_items) 4073 { 4074 struct super_block *sb = root->fs_info->sb; 4075 4076 if (down_read_trylock(&sb->s_umount)) { 4077 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4078 up_read(&sb->s_umount); 4079 } else { 4080 /* 4081 * We needn't worry the filesystem going from r/w to r/o though 4082 * we don't acquire ->s_umount mutex, because the filesystem 4083 * should guarantee the delalloc inodes list be empty after 4084 * the filesystem is readonly(all dirty pages are written to 4085 * the disk). 4086 */ 4087 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4088 if (!current->journal_info) 4089 btrfs_wait_ordered_roots(root->fs_info, nr_items); 4090 } 4091 } 4092 4093 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4094 { 4095 u64 bytes; 4096 int nr; 4097 4098 bytes = btrfs_calc_trans_metadata_size(root, 1); 4099 nr = (int)div64_u64(to_reclaim, bytes); 4100 if (!nr) 4101 nr = 1; 4102 return nr; 4103 } 4104 4105 #define EXTENT_SIZE_PER_ITEM (256 * 1024) 4106 4107 /* 4108 * shrink metadata reservation for delalloc 4109 */ 4110 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4111 bool wait_ordered) 4112 { 4113 struct btrfs_block_rsv *block_rsv; 4114 struct btrfs_space_info *space_info; 4115 struct btrfs_trans_handle *trans; 4116 u64 delalloc_bytes; 4117 u64 max_reclaim; 4118 long time_left; 4119 unsigned long nr_pages; 4120 int loops; 4121 int items; 4122 enum btrfs_reserve_flush_enum flush; 4123 4124 /* Calc the number of the pages we need flush for space reservation */ 4125 items = calc_reclaim_items_nr(root, to_reclaim); 4126 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4127 4128 trans = (struct btrfs_trans_handle *)current->journal_info; 4129 block_rsv = &root->fs_info->delalloc_block_rsv; 4130 space_info = block_rsv->space_info; 4131 4132 delalloc_bytes = percpu_counter_sum_positive( 4133 &root->fs_info->delalloc_bytes); 4134 if (delalloc_bytes == 0) { 4135 if (trans) 4136 return; 4137 if (wait_ordered) 4138 btrfs_wait_ordered_roots(root->fs_info, items); 4139 return; 4140 } 4141 4142 loops = 0; 4143 while (delalloc_bytes && loops < 3) { 4144 max_reclaim = min(delalloc_bytes, to_reclaim); 4145 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4146 btrfs_writeback_inodes_sb_nr(root, nr_pages, items); 4147 /* 4148 * We need to wait for the async pages to actually start before 4149 * we do anything. 
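		 * (in-flight async delalloc pages are tracked in
		 * fs_info->async_delalloc_pages, which the wait_event() below
		 * checks)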
 */
		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
		if (!max_reclaim)
			goto skip_async;

		if (max_reclaim <= nr_pages)
			max_reclaim = 0;
		else
			max_reclaim -= nr_pages;

		wait_event(root->fs_info->async_submit_wait,
			   atomic_read(&root->fs_info->async_delalloc_pages) <=
			   (int)max_reclaim);
skip_async:
		if (!trans)
			flush = BTRFS_RESERVE_FLUSH_ALL;
		else
			flush = BTRFS_RESERVE_NO_FLUSH;
		spin_lock(&space_info->lock);
		if (can_overcommit(root, space_info, orig, flush)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(root->fs_info, items);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
					&root->fs_info->delalloc_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @root - the root we're allocating for
 * @space_info - the space_info we are trying to make the reservation in
 * @bytes - the number of bytes we want to reserve
 * @force - force the commit
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_root *root,
				  struct btrfs_space_info *space_info,
				  u64 bytes, int force)
{
	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
	struct btrfs_trans_handle *trans;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	if (force)
		goto commit;

	/* See if there is enough pinned space to make this reservation */
	if (percpu_counter_compare(&space_info->total_bytes_pinned,
				   bytes) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
4220 */ 4221 if (space_info != delayed_rsv->space_info) 4222 return -ENOSPC; 4223 4224 spin_lock(&delayed_rsv->lock); 4225 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4226 bytes - delayed_rsv->size) >= 0) { 4227 spin_unlock(&delayed_rsv->lock); 4228 return -ENOSPC; 4229 } 4230 spin_unlock(&delayed_rsv->lock); 4231 4232 commit: 4233 trans = btrfs_join_transaction(root); 4234 if (IS_ERR(trans)) 4235 return -ENOSPC; 4236 4237 return btrfs_commit_transaction(trans, root); 4238 } 4239 4240 enum flush_state { 4241 FLUSH_DELAYED_ITEMS_NR = 1, 4242 FLUSH_DELAYED_ITEMS = 2, 4243 FLUSH_DELALLOC = 3, 4244 FLUSH_DELALLOC_WAIT = 4, 4245 ALLOC_CHUNK = 5, 4246 COMMIT_TRANS = 6, 4247 }; 4248 4249 static int flush_space(struct btrfs_root *root, 4250 struct btrfs_space_info *space_info, u64 num_bytes, 4251 u64 orig_bytes, int state) 4252 { 4253 struct btrfs_trans_handle *trans; 4254 int nr; 4255 int ret = 0; 4256 4257 switch (state) { 4258 case FLUSH_DELAYED_ITEMS_NR: 4259 case FLUSH_DELAYED_ITEMS: 4260 if (state == FLUSH_DELAYED_ITEMS_NR) 4261 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4262 else 4263 nr = -1; 4264 4265 trans = btrfs_join_transaction(root); 4266 if (IS_ERR(trans)) { 4267 ret = PTR_ERR(trans); 4268 break; 4269 } 4270 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4271 btrfs_end_transaction(trans, root); 4272 break; 4273 case FLUSH_DELALLOC: 4274 case FLUSH_DELALLOC_WAIT: 4275 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4276 state == FLUSH_DELALLOC_WAIT); 4277 break; 4278 case ALLOC_CHUNK: 4279 trans = btrfs_join_transaction(root); 4280 if (IS_ERR(trans)) { 4281 ret = PTR_ERR(trans); 4282 break; 4283 } 4284 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4285 btrfs_get_alloc_profile(root, 0), 4286 CHUNK_ALLOC_NO_FORCE); 4287 btrfs_end_transaction(trans, root); 4288 if (ret == -ENOSPC) 4289 ret = 0; 4290 break; 4291 case COMMIT_TRANS: 4292 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4293 break; 4294 default: 4295 ret = -ENOSPC; 4296 break; 4297 } 4298 4299 return ret; 4300 } 4301 4302 static inline u64 4303 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4304 struct btrfs_space_info *space_info) 4305 { 4306 u64 used; 4307 u64 expected; 4308 u64 to_reclaim; 4309 4310 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, 4311 16 * 1024 * 1024); 4312 spin_lock(&space_info->lock); 4313 if (can_overcommit(root, space_info, to_reclaim, 4314 BTRFS_RESERVE_FLUSH_ALL)) { 4315 to_reclaim = 0; 4316 goto out; 4317 } 4318 4319 used = space_info->bytes_used + space_info->bytes_reserved + 4320 space_info->bytes_pinned + space_info->bytes_readonly + 4321 space_info->bytes_may_use; 4322 if (can_overcommit(root, space_info, 1024 * 1024, 4323 BTRFS_RESERVE_FLUSH_ALL)) 4324 expected = div_factor_fine(space_info->total_bytes, 95); 4325 else 4326 expected = div_factor_fine(space_info->total_bytes, 90); 4327 4328 if (used > expected) 4329 to_reclaim = used - expected; 4330 else 4331 to_reclaim = 0; 4332 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4333 space_info->bytes_reserved); 4334 out: 4335 spin_unlock(&space_info->lock); 4336 4337 return to_reclaim; 4338 } 4339 4340 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4341 struct btrfs_fs_info *fs_info, u64 used) 4342 { 4343 return (used >= div_factor_fine(space_info->total_bytes, 98) && 4344 !btrfs_fs_closing(fs_info) && 4345 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4346 } 4347 4348 static int btrfs_need_do_async_reclaim(struct 
btrfs_space_info *space_info,
				       struct btrfs_fs_info *fs_info)
{
	u64 used;

	spin_lock(&space_info->lock);
	used = space_info->bytes_used + space_info->bytes_reserved +
	       space_info->bytes_pinned + space_info->bytes_readonly +
	       space_info->bytes_may_use;
	if (need_do_async_reclaim(space_info, fs_info, used)) {
		spin_unlock(&space_info->lock);
		return 1;
	}
	spin_unlock(&space_info->lock);

	return 0;
}

static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
						      space_info);
	if (!to_reclaim)
		return;

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info->fs_root, space_info, to_reclaim,
			    to_reclaim, flush_state);
		flush_state++;
		if (!btrfs_need_do_async_reclaim(space_info, fs_info))
			return;
	} while (flush_state <= COMMIT_TRANS);

	if (btrfs_need_do_async_reclaim(space_info, fs_info))
		queue_work(system_unbound_wq, work);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
static int reserve_metadata_bytes(struct btrfs_root *root,
				  struct btrfs_block_rsv *block_rsv,
				  u64 orig_bytes,
				  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_space_info *space_info = block_rsv->space_info;
	u64 used;
	u64 num_bytes = orig_bytes;
	int flush_state = FLUSH_DELAYED_ITEMS_NR;
	int ret = 0;
	bool flushing = false;

again:
	ret = 0;
	spin_lock(&space_info->lock);
	/*
	 * We only want to wait if somebody other than us is flushing and we
	 * are actually allowed to flush all things.
	 */
	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
	       space_info->flush) {
		spin_unlock(&space_info->lock);
		/*
		 * If we have a trans handle we can't wait because the flusher
		 * may have to commit the transaction, which would mean we
		 * would deadlock since we are waiting for the flusher to
		 * finish, but hold the current transaction open.
		 */
		if (current->journal_info)
			return -EAGAIN;
		ret = wait_event_killable(space_info->wait, !space_info->flush);
		/* Must have been killed, return */
		if (ret)
			return -EINTR;

		spin_lock(&space_info->lock);
	}

	ret = -ENOSPC;
	used = space_info->bytes_used + space_info->bytes_reserved +
	       space_info->bytes_pinned + space_info->bytes_readonly +
	       space_info->bytes_may_use;

	/*
	 * The idea here is that if we've not already over-reserved the space
	 * then we can go ahead and save our reservation first and then start
	 * flushing if we need to.  Otherwise if we've already overcommitted
	 * let's start flushing stuff first and then come back and try to make
	 * our reservation.
	 */
	if (used <= space_info->total_bytes) {
		if (used + orig_bytes <= space_info->total_bytes) {
			space_info->bytes_may_use += orig_bytes;
			trace_btrfs_space_reservation(root->fs_info,
				"space_info", space_info->flags, orig_bytes, 1);
			ret = 0;
		} else {
			/*
			 * Ok, set num_bytes to orig_bytes since we aren't
			 * overcommitted, this way we only try and reclaim what
			 * we need.
			 */
			num_bytes = orig_bytes;
		}
	} else {
		/*
		 * Ok, we're overcommitted, set num_bytes to the overcommitted
		 * amount plus the amount of bytes that we need for this
		 * reservation.
		 */
		num_bytes = used - space_info->total_bytes +
			    (orig_bytes * 2);
	}

	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
		space_info->bytes_may_use += orig_bytes;
		trace_btrfs_space_reservation(root->fs_info, "space_info",
					      space_info->flags, orig_bytes,
					      1);
		ret = 0;
	}

	/*
	 * Couldn't make our reservation, save our place so while we're trying
	 * to reclaim space we can actually use it instead of somebody else
	 * stealing it from us.
	 *
	 * We make the other tasks wait for the flush only when we can flush
	 * all things.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		flushing = true;
		space_info->flush = 1;
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		if (need_do_async_reclaim(space_info, root->fs_info, used) &&
		    !work_busy(&root->fs_info->async_reclaim_work))
			queue_work(system_unbound_wq,
				   &root->fs_info->async_reclaim_work);
	}
	spin_unlock(&space_info->lock);

	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		goto out;

	ret = flush_space(root, space_info, num_bytes, orig_bytes,
			  flush_state);
	flush_state++;

	/*
	 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
	 * would happen.  So skip delalloc flush.
4525 */ 4526 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4527 (flush_state == FLUSH_DELALLOC || 4528 flush_state == FLUSH_DELALLOC_WAIT)) 4529 flush_state = ALLOC_CHUNK; 4530 4531 if (!ret) 4532 goto again; 4533 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4534 flush_state < COMMIT_TRANS) 4535 goto again; 4536 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4537 flush_state <= COMMIT_TRANS) 4538 goto again; 4539 4540 out: 4541 if (ret == -ENOSPC && 4542 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4543 struct btrfs_block_rsv *global_rsv = 4544 &root->fs_info->global_block_rsv; 4545 4546 if (block_rsv != global_rsv && 4547 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4548 ret = 0; 4549 } 4550 if (ret == -ENOSPC) 4551 trace_btrfs_space_reservation(root->fs_info, 4552 "space_info:enospc", 4553 space_info->flags, orig_bytes, 1); 4554 if (flushing) { 4555 spin_lock(&space_info->lock); 4556 space_info->flush = 0; 4557 wake_up_all(&space_info->wait); 4558 spin_unlock(&space_info->lock); 4559 } 4560 return ret; 4561 } 4562 4563 static struct btrfs_block_rsv *get_block_rsv( 4564 const struct btrfs_trans_handle *trans, 4565 const struct btrfs_root *root) 4566 { 4567 struct btrfs_block_rsv *block_rsv = NULL; 4568 4569 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4570 block_rsv = trans->block_rsv; 4571 4572 if (root == root->fs_info->csum_root && trans->adding_csums) 4573 block_rsv = trans->block_rsv; 4574 4575 if (root == root->fs_info->uuid_root) 4576 block_rsv = trans->block_rsv; 4577 4578 if (!block_rsv) 4579 block_rsv = root->block_rsv; 4580 4581 if (!block_rsv) 4582 block_rsv = &root->fs_info->empty_block_rsv; 4583 4584 return block_rsv; 4585 } 4586 4587 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4588 u64 num_bytes) 4589 { 4590 int ret = -ENOSPC; 4591 spin_lock(&block_rsv->lock); 4592 if (block_rsv->reserved >= num_bytes) { 4593 block_rsv->reserved -= num_bytes; 4594 if (block_rsv->reserved < block_rsv->size) 4595 block_rsv->full = 0; 4596 ret = 0; 4597 } 4598 spin_unlock(&block_rsv->lock); 4599 return ret; 4600 } 4601 4602 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4603 u64 num_bytes, int update_size) 4604 { 4605 spin_lock(&block_rsv->lock); 4606 block_rsv->reserved += num_bytes; 4607 if (update_size) 4608 block_rsv->size += num_bytes; 4609 else if (block_rsv->reserved >= block_rsv->size) 4610 block_rsv->full = 1; 4611 spin_unlock(&block_rsv->lock); 4612 } 4613 4614 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 4615 struct btrfs_block_rsv *dest, u64 num_bytes, 4616 int min_factor) 4617 { 4618 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4619 u64 min_bytes; 4620 4621 if (global_rsv->space_info != dest->space_info) 4622 return -ENOSPC; 4623 4624 spin_lock(&global_rsv->lock); 4625 min_bytes = div_factor(global_rsv->size, min_factor); 4626 if (global_rsv->reserved < min_bytes + num_bytes) { 4627 spin_unlock(&global_rsv->lock); 4628 return -ENOSPC; 4629 } 4630 global_rsv->reserved -= num_bytes; 4631 if (global_rsv->reserved < global_rsv->size) 4632 global_rsv->full = 0; 4633 spin_unlock(&global_rsv->lock); 4634 4635 block_rsv_add_bytes(dest, num_bytes, 1); 4636 return 0; 4637 } 4638 4639 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4640 struct btrfs_block_rsv *block_rsv, 4641 struct btrfs_block_rsv *dest, u64 num_bytes) 4642 { 4643 struct btrfs_space_info *space_info = block_rsv->space_info; 4644 4645 spin_lock(&block_rsv->lock); 4646 if (num_bytes == (u64)-1) 4647 num_bytes 
= block_rsv->size; 4648 block_rsv->size -= num_bytes; 4649 if (block_rsv->reserved >= block_rsv->size) { 4650 num_bytes = block_rsv->reserved - block_rsv->size; 4651 block_rsv->reserved = block_rsv->size; 4652 block_rsv->full = 1; 4653 } else { 4654 num_bytes = 0; 4655 } 4656 spin_unlock(&block_rsv->lock); 4657 4658 if (num_bytes > 0) { 4659 if (dest) { 4660 spin_lock(&dest->lock); 4661 if (!dest->full) { 4662 u64 bytes_to_add; 4663 4664 bytes_to_add = dest->size - dest->reserved; 4665 bytes_to_add = min(num_bytes, bytes_to_add); 4666 dest->reserved += bytes_to_add; 4667 if (dest->reserved >= dest->size) 4668 dest->full = 1; 4669 num_bytes -= bytes_to_add; 4670 } 4671 spin_unlock(&dest->lock); 4672 } 4673 if (num_bytes) { 4674 spin_lock(&space_info->lock); 4675 space_info->bytes_may_use -= num_bytes; 4676 trace_btrfs_space_reservation(fs_info, "space_info", 4677 space_info->flags, num_bytes, 0); 4678 spin_unlock(&space_info->lock); 4679 } 4680 } 4681 } 4682 4683 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4684 struct btrfs_block_rsv *dst, u64 num_bytes) 4685 { 4686 int ret; 4687 4688 ret = block_rsv_use_bytes(src, num_bytes); 4689 if (ret) 4690 return ret; 4691 4692 block_rsv_add_bytes(dst, num_bytes, 1); 4693 return 0; 4694 } 4695 4696 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4697 { 4698 memset(rsv, 0, sizeof(*rsv)); 4699 spin_lock_init(&rsv->lock); 4700 rsv->type = type; 4701 } 4702 4703 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4704 unsigned short type) 4705 { 4706 struct btrfs_block_rsv *block_rsv; 4707 struct btrfs_fs_info *fs_info = root->fs_info; 4708 4709 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4710 if (!block_rsv) 4711 return NULL; 4712 4713 btrfs_init_block_rsv(block_rsv, type); 4714 block_rsv->space_info = __find_space_info(fs_info, 4715 BTRFS_BLOCK_GROUP_METADATA); 4716 return block_rsv; 4717 } 4718 4719 void btrfs_free_block_rsv(struct btrfs_root *root, 4720 struct btrfs_block_rsv *rsv) 4721 { 4722 if (!rsv) 4723 return; 4724 btrfs_block_rsv_release(root, rsv, (u64)-1); 4725 kfree(rsv); 4726 } 4727 4728 int btrfs_block_rsv_add(struct btrfs_root *root, 4729 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4730 enum btrfs_reserve_flush_enum flush) 4731 { 4732 int ret; 4733 4734 if (num_bytes == 0) 4735 return 0; 4736 4737 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4738 if (!ret) { 4739 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4740 return 0; 4741 } 4742 4743 return ret; 4744 } 4745 4746 int btrfs_block_rsv_check(struct btrfs_root *root, 4747 struct btrfs_block_rsv *block_rsv, int min_factor) 4748 { 4749 u64 num_bytes = 0; 4750 int ret = -ENOSPC; 4751 4752 if (!block_rsv) 4753 return 0; 4754 4755 spin_lock(&block_rsv->lock); 4756 num_bytes = div_factor(block_rsv->size, min_factor); 4757 if (block_rsv->reserved >= num_bytes) 4758 ret = 0; 4759 spin_unlock(&block_rsv->lock); 4760 4761 return ret; 4762 } 4763 4764 int btrfs_block_rsv_refill(struct btrfs_root *root, 4765 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4766 enum btrfs_reserve_flush_enum flush) 4767 { 4768 u64 num_bytes = 0; 4769 int ret = -ENOSPC; 4770 4771 if (!block_rsv) 4772 return 0; 4773 4774 spin_lock(&block_rsv->lock); 4775 num_bytes = min_reserved; 4776 if (block_rsv->reserved >= num_bytes) 4777 ret = 0; 4778 else 4779 num_bytes -= block_rsv->reserved; 4780 spin_unlock(&block_rsv->lock); 4781 4782 if (!ret) 4783 return 0; 4784 4785 ret = reserve_metadata_bytes(root, block_rsv, 
num_bytes, flush); 4786 if (!ret) { 4787 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4788 return 0; 4789 } 4790 4791 return ret; 4792 } 4793 4794 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4795 struct btrfs_block_rsv *dst_rsv, 4796 u64 num_bytes) 4797 { 4798 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4799 } 4800 4801 void btrfs_block_rsv_release(struct btrfs_root *root, 4802 struct btrfs_block_rsv *block_rsv, 4803 u64 num_bytes) 4804 { 4805 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4806 if (global_rsv == block_rsv || 4807 block_rsv->space_info != global_rsv->space_info) 4808 global_rsv = NULL; 4809 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4810 num_bytes); 4811 } 4812 4813 /* 4814 * helper to calculate size of global block reservation. 4815 * the desired value is sum of space used by extent tree, 4816 * checksum tree and root tree 4817 */ 4818 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4819 { 4820 struct btrfs_space_info *sinfo; 4821 u64 num_bytes; 4822 u64 meta_used; 4823 u64 data_used; 4824 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4825 4826 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4827 spin_lock(&sinfo->lock); 4828 data_used = sinfo->bytes_used; 4829 spin_unlock(&sinfo->lock); 4830 4831 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4832 spin_lock(&sinfo->lock); 4833 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4834 data_used = 0; 4835 meta_used = sinfo->bytes_used; 4836 spin_unlock(&sinfo->lock); 4837 4838 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4839 csum_size * 2; 4840 num_bytes += div64_u64(data_used + meta_used, 50); 4841 4842 if (num_bytes * 3 > meta_used) 4843 num_bytes = div64_u64(meta_used, 3); 4844 4845 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4846 } 4847 4848 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4849 { 4850 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4851 struct btrfs_space_info *sinfo = block_rsv->space_info; 4852 u64 num_bytes; 4853 4854 num_bytes = calc_global_metadata_size(fs_info); 4855 4856 spin_lock(&sinfo->lock); 4857 spin_lock(&block_rsv->lock); 4858 4859 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4860 4861 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4862 sinfo->bytes_reserved + sinfo->bytes_readonly + 4863 sinfo->bytes_may_use; 4864 4865 if (sinfo->total_bytes > num_bytes) { 4866 num_bytes = sinfo->total_bytes - num_bytes; 4867 block_rsv->reserved += num_bytes; 4868 sinfo->bytes_may_use += num_bytes; 4869 trace_btrfs_space_reservation(fs_info, "space_info", 4870 sinfo->flags, num_bytes, 1); 4871 } 4872 4873 if (block_rsv->reserved >= block_rsv->size) { 4874 num_bytes = block_rsv->reserved - block_rsv->size; 4875 sinfo->bytes_may_use -= num_bytes; 4876 trace_btrfs_space_reservation(fs_info, "space_info", 4877 sinfo->flags, num_bytes, 0); 4878 block_rsv->reserved = block_rsv->size; 4879 block_rsv->full = 1; 4880 } 4881 4882 spin_unlock(&block_rsv->lock); 4883 spin_unlock(&sinfo->lock); 4884 } 4885 4886 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4887 { 4888 struct btrfs_space_info *space_info; 4889 4890 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4891 fs_info->chunk_block_rsv.space_info = space_info; 4892 4893 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4894 fs_info->global_block_rsv.space_info = space_info; 4895 
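	/*
	 * The delalloc, trans, empty and delayed reserves below all account
	 * against the same metadata space_info; only the chunk reserve uses
	 * the SYSTEM space_info set up above.
	 */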
	fs_info->delalloc_block_rsv.space_info = space_info;
	fs_info->trans_block_rsv.space_info = space_info;
	fs_info->empty_block_rsv.space_info = space_info;
	fs_info->delayed_block_rsv.space_info = space_info;

	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
	if (fs_info->quota_root)
		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;

	update_global_block_rsv(fs_info);
}

static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
				(u64)-1);
	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
	WARN_ON(fs_info->trans_block_rsv.size > 0);
	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
	WARN_ON(fs_info->chunk_block_rsv.size > 0);
	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
	WARN_ON(fs_info->delayed_block_rsv.size > 0);
	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}

void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	if (!trans->block_rsv)
		return;

	if (!trans->bytes_reserved)
		return;

	trace_btrfs_space_reservation(root->fs_info, "transaction",
				      trans->transid, trans->bytes_reserved, 0);
	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
	trans->bytes_reserved = 0;
}

/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
				  struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;

	/*
	 * We need to hold space in order to delete our orphan item once we've
	 * added it, so this takes the reservation so we can release it later
	 * when we are truly done with the orphan item.
	 */
	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	trace_btrfs_space_reservation(root->fs_info, "orphan",
				      btrfs_ino(inode), num_bytes, 1);
	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}

void btrfs_orphan_release_metadata(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	trace_btrfs_space_reservation(root->fs_info, "orphan",
				      btrfs_ino(inode), num_bytes, 0);
	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}

/*
 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
 * root: the root of the parent directory
 * rsv: block reservation
 * items: the number of items that we need to reserve
 * qgroup_reserved: used to return the reserved size in qgroup
 * use_global_rsv: allow falling back to the global block reserve if the
 *                 normal reservation fails
 *
 * This function is used to reserve the space for snapshot/subvolume
 * creation and deletion.  These operations are different from the
 * common file/directory operations: they change two fs/file trees
 * and the root tree, so the number of items that the qgroup reserves
 * is different from the free space reservation.  So we can not use
 * the space reservation mechanism in start_transaction().
 */
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
				     struct btrfs_block_rsv *rsv,
				     int items,
				     u64 *qgroup_reserved,
				     bool use_global_rsv)
{
	u64 num_bytes;
	int ret;
	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;

	if (root->fs_info->quota_enabled) {
		/* One for parent inode, two for dir entries */
		num_bytes = 3 * root->leafsize;
		ret = btrfs_qgroup_reserve(root, num_bytes);
		if (ret)
			return ret;
	} else {
		num_bytes = 0;
	}

	*qgroup_reserved = num_bytes;

	num_bytes = btrfs_calc_trans_metadata_size(root, items);
	rsv->space_info = __find_space_info(root->fs_info,
					    BTRFS_BLOCK_GROUP_METADATA);
	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
				  BTRFS_RESERVE_FLUSH_ALL);

	if (ret == -ENOSPC && use_global_rsv)
		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);

	if (ret) {
		if (*qgroup_reserved)
			btrfs_qgroup_free(root, *qgroup_reserved);
	}

	return ret;
}

void btrfs_subvolume_release_metadata(struct btrfs_root *root,
				      struct btrfs_block_rsv *rsv,
				      u64 qgroup_reserved)
{
	btrfs_block_rsv_release(root, rsv, (u64)-1);
	if (qgroup_reserved)
		btrfs_qgroup_free(root, qgroup_reserved);
}

/**
 * drop_outstanding_extent - drop an outstanding extent
 * @inode: the inode we're dropping the extent for
 *
 * This is called when we are freeing up an outstanding extent, either called
 * after an error or after an extent is written.  This will return the number
 * of reserved extents that need to be freed.  This must be called with
 * BTRFS_I(inode)->lock held.
 */
static unsigned drop_outstanding_extent(struct inode *inode)
{
	unsigned drop_inode_space = 0;
	unsigned dropped_extents = 0;

	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
	BTRFS_I(inode)->outstanding_extents--;

	if (BTRFS_I(inode)->outstanding_extents == 0 &&
	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
			       &BTRFS_I(inode)->runtime_flags))
		drop_inode_space = 1;

	/*
	 * If we have at least as many outstanding extents as we have
	 * reserved then we need to leave the reserved extents count alone.
	 */
	if (BTRFS_I(inode)->outstanding_extents >=
	    BTRFS_I(inode)->reserved_extents)
		return drop_inode_space;

	dropped_extents = BTRFS_I(inode)->reserved_extents -
		BTRFS_I(inode)->outstanding_extents;
	BTRFS_I(inode)->reserved_extents -= dropped_extents;
	return dropped_extents + drop_inode_space;
}

/**
 * calc_csum_metadata_size - return the amount of metadata space that must be
 *			     reserved/free'd for the given bytes.
 * @inode: the inode we're manipulating
 * @num_bytes: the number of bytes in question
 * @reserve: 1 if we are reserving space, 0 if we are freeing space
 *
 * This adjusts the number of csum_bytes in the inode and then returns the
 * correct amount of metadata that must either be reserved or freed.  We
 * calculate how many checksums we can fit into one leaf and then divide the
 * number of bytes that will need to be checksummed by this value to figure
 * out how many checksums will be required.
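 *
 * (For example, with a 4K sectorsize, 1MB of csum_bytes is 256 checksums;
 * the result only changes when the checksum count crosses a multiple of
 * the per-leaf capacity, i.e. when another csum leaf would be needed.)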
If we are adding bytes then the number 5078 * may go up and we will return the number of additional bytes that must be 5079 * reserved. If it is going down we will return the number of bytes that must 5080 * be freed. 5081 * 5082 * This must be called with BTRFS_I(inode)->lock held. 5083 */ 5084 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5085 int reserve) 5086 { 5087 struct btrfs_root *root = BTRFS_I(inode)->root; 5088 u64 csum_size; 5089 int num_csums_per_leaf; 5090 int num_csums; 5091 int old_csums; 5092 5093 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5094 BTRFS_I(inode)->csum_bytes == 0) 5095 return 0; 5096 5097 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5098 if (reserve) 5099 BTRFS_I(inode)->csum_bytes += num_bytes; 5100 else 5101 BTRFS_I(inode)->csum_bytes -= num_bytes; 5102 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 5103 num_csums_per_leaf = (int)div64_u64(csum_size, 5104 sizeof(struct btrfs_csum_item) + 5105 sizeof(struct btrfs_disk_key)); 5106 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5107 num_csums = num_csums + num_csums_per_leaf - 1; 5108 num_csums = num_csums / num_csums_per_leaf; 5109 5110 old_csums = old_csums + num_csums_per_leaf - 1; 5111 old_csums = old_csums / num_csums_per_leaf; 5112 5113 /* No change, no need to reserve more */ 5114 if (old_csums == num_csums) 5115 return 0; 5116 5117 if (reserve) 5118 return btrfs_calc_trans_metadata_size(root, 5119 num_csums - old_csums); 5120 5121 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 5122 } 5123 5124 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5125 { 5126 struct btrfs_root *root = BTRFS_I(inode)->root; 5127 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 5128 u64 to_reserve = 0; 5129 u64 csum_bytes; 5130 unsigned nr_extents = 0; 5131 int extra_reserve = 0; 5132 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5133 int ret = 0; 5134 bool delalloc_lock = true; 5135 u64 to_free = 0; 5136 unsigned dropped; 5137 5138 /* If we are a free space inode we need to not flush since we will be in 5139 * the middle of a transaction commit. We also don't need the delalloc 5140 * mutex since we won't race with anybody. We need this mostly to make 5141 * lockdep shut its filthy mouth. 5142 */ 5143 if (btrfs_is_free_space_inode(inode)) { 5144 flush = BTRFS_RESERVE_NO_FLUSH; 5145 delalloc_lock = false; 5146 } 5147 5148 if (flush != BTRFS_RESERVE_NO_FLUSH && 5149 btrfs_transaction_in_commit(root->fs_info)) 5150 schedule_timeout(1); 5151 5152 if (delalloc_lock) 5153 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5154 5155 num_bytes = ALIGN(num_bytes, root->sectorsize); 5156 5157 spin_lock(&BTRFS_I(inode)->lock); 5158 BTRFS_I(inode)->outstanding_extents++; 5159 5160 if (BTRFS_I(inode)->outstanding_extents > 5161 BTRFS_I(inode)->reserved_extents) 5162 nr_extents = BTRFS_I(inode)->outstanding_extents - 5163 BTRFS_I(inode)->reserved_extents; 5164 5165 /* 5166 * Add an item to reserve for updating the inode when we complete the 5167 * delalloc io. 
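	 * (whether that extra item has already been reserved is tracked by the
	 * BTRFS_INODE_DELALLOC_META_RESERVED runtime flag tested below)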
	 */
	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
		      &BTRFS_I(inode)->runtime_flags)) {
		nr_extents++;
		extra_reserve = 1;
	}

	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
	csum_bytes = BTRFS_I(inode)->csum_bytes;
	spin_unlock(&BTRFS_I(inode)->lock);

	if (root->fs_info->quota_enabled) {
		ret = btrfs_qgroup_reserve(root, num_bytes +
					   nr_extents * root->leafsize);
		if (ret)
			goto out_fail;
	}

	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
	if (unlikely(ret)) {
		if (root->fs_info->quota_enabled)
			btrfs_qgroup_free(root, num_bytes +
					  nr_extents * root->leafsize);
		goto out_fail;
	}

	spin_lock(&BTRFS_I(inode)->lock);
	if (extra_reserve) {
		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
			&BTRFS_I(inode)->runtime_flags);
		nr_extents--;
	}
	BTRFS_I(inode)->reserved_extents += nr_extents;
	spin_unlock(&BTRFS_I(inode)->lock);

	if (delalloc_lock)
		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);

	if (to_reserve)
		trace_btrfs_space_reservation(root->fs_info, "delalloc",
					      btrfs_ino(inode), to_reserve, 1);
	block_rsv_add_bytes(block_rsv, to_reserve, 1);

	return 0;

out_fail:
	spin_lock(&BTRFS_I(inode)->lock);
	dropped = drop_outstanding_extent(inode);
	/*
	 * If the inode's csum_bytes is the same as the original
	 * csum_bytes then we know we haven't raced with any free()ers
	 * so we can just reduce our inode's csum bytes and carry on.
	 */
	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
		calc_csum_metadata_size(inode, num_bytes, 0);
	} else {
		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
		u64 bytes;

		/*
		 * This is tricky, but first we need to figure out how much we
		 * free'd from any free()ers that occurred during this
		 * reservation, so we reset ->csum_bytes to the csum_bytes
		 * before we dropped our lock, and then call the free for the
		 * number of bytes that were freed while we were trying our
		 * reservation.
		 */
		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
		BTRFS_I(inode)->csum_bytes = csum_bytes;
		to_free = calc_csum_metadata_size(inode, bytes, 0);

		/*
		 * Now we need to see how much we would have freed had we not
		 * been making this reservation and our ->csum_bytes were not
		 * artificially inflated.
		 */
		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
		bytes = csum_bytes - orig_csum_bytes;
		bytes = calc_csum_metadata_size(inode, bytes, 0);

		/*
		 * Now reset ->csum_bytes to what it should be.  If bytes is
		 * more than to_free then we would have free'd more space had
		 * we not had an artificially high ->csum_bytes, so we need to
		 * free the remainder.  If bytes is the same or less then we
		 * don't need to do anything, the other free()ers did the
		 * correct thing.
5257 */ 5258 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 5259 if (bytes > to_free) 5260 to_free = bytes - to_free; 5261 else 5262 to_free = 0; 5263 } 5264 spin_unlock(&BTRFS_I(inode)->lock); 5265 if (dropped) 5266 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5267 5268 if (to_free) { 5269 btrfs_block_rsv_release(root, block_rsv, to_free); 5270 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5271 btrfs_ino(inode), to_free, 0); 5272 } 5273 if (delalloc_lock) 5274 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5275 return ret; 5276 } 5277 5278 /** 5279 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5280 * @inode: the inode to release the reservation for 5281 * @num_bytes: the number of bytes we're releasing 5282 * 5283 * This will release the metadata reservation for an inode. This can be called 5284 * once we complete IO for a given set of bytes to release their metadata 5285 * reservations. 5286 */ 5287 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 5288 { 5289 struct btrfs_root *root = BTRFS_I(inode)->root; 5290 u64 to_free = 0; 5291 unsigned dropped; 5292 5293 num_bytes = ALIGN(num_bytes, root->sectorsize); 5294 spin_lock(&BTRFS_I(inode)->lock); 5295 dropped = drop_outstanding_extent(inode); 5296 5297 if (num_bytes) 5298 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5299 spin_unlock(&BTRFS_I(inode)->lock); 5300 if (dropped > 0) 5301 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5302 5303 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5304 btrfs_ino(inode), to_free, 0); 5305 if (root->fs_info->quota_enabled) { 5306 btrfs_qgroup_free(root, num_bytes + 5307 dropped * root->leafsize); 5308 } 5309 5310 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5311 to_free); 5312 } 5313 5314 /** 5315 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 5316 * @inode: inode we're writing to 5317 * @num_bytes: the number of bytes we want to allocate 5318 * 5319 * This will do the following things 5320 * 5321 * o reserve space in the data space info for num_bytes 5322 * o reserve space in the metadata space info based on number of outstanding 5323 * extents and how much csums will be needed 5324 * o add to the inodes ->delalloc_bytes 5325 * o add it to the fs_info's delalloc inodes list. 5326 * 5327 * This will return 0 for success and -ENOSPC if there is no space left. 5328 */ 5329 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 5330 { 5331 int ret; 5332 5333 ret = btrfs_check_data_free_space(inode, num_bytes); 5334 if (ret) 5335 return ret; 5336 5337 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 5338 if (ret) { 5339 btrfs_free_reserved_data_space(inode, num_bytes); 5340 return ret; 5341 } 5342 5343 return 0; 5344 } 5345 5346 /** 5347 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5348 * @inode: inode we're releasing space for 5349 * @num_bytes: the number of bytes we want to free up 5350 * 5351 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5352 * called in the case that we don't need the metadata AND data reservations 5353 * anymore. So if there is an error or we insert an inline extent. 5354 * 5355 * This function will release the metadata space that was not used and will 5356 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5357 * list if there are no delalloc bytes left. 
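 *
 * A minimal usage sketch (do_buffered_write() is a hypothetical helper, the
 * point is the reserve/release pairing on the error path):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	ret = do_buffered_write(inode, num_bytes);
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, num_bytes);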
5358 */ 5359 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5360 { 5361 btrfs_delalloc_release_metadata(inode, num_bytes); 5362 btrfs_free_reserved_data_space(inode, num_bytes); 5363 } 5364 5365 static int update_block_group(struct btrfs_root *root, 5366 u64 bytenr, u64 num_bytes, int alloc) 5367 { 5368 struct btrfs_block_group_cache *cache = NULL; 5369 struct btrfs_fs_info *info = root->fs_info; 5370 u64 total = num_bytes; 5371 u64 old_val; 5372 u64 byte_in_group; 5373 int factor; 5374 5375 /* block accounting for super block */ 5376 spin_lock(&info->delalloc_root_lock); 5377 old_val = btrfs_super_bytes_used(info->super_copy); 5378 if (alloc) 5379 old_val += num_bytes; 5380 else 5381 old_val -= num_bytes; 5382 btrfs_set_super_bytes_used(info->super_copy, old_val); 5383 spin_unlock(&info->delalloc_root_lock); 5384 5385 while (total) { 5386 cache = btrfs_lookup_block_group(info, bytenr); 5387 if (!cache) 5388 return -ENOENT; 5389 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5390 BTRFS_BLOCK_GROUP_RAID1 | 5391 BTRFS_BLOCK_GROUP_RAID10)) 5392 factor = 2; 5393 else 5394 factor = 1; 5395 /* 5396 * If this block group has free space cache written out, we 5397 * need to make sure to load it if we are removing space. This 5398 * is because we need the unpinning stage to actually add the 5399 * space back to the block group, otherwise we will leak space. 5400 */ 5401 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5402 cache_block_group(cache, 1); 5403 5404 byte_in_group = bytenr - cache->key.objectid; 5405 WARN_ON(byte_in_group > cache->key.offset); 5406 5407 spin_lock(&cache->space_info->lock); 5408 spin_lock(&cache->lock); 5409 5410 if (btrfs_test_opt(root, SPACE_CACHE) && 5411 cache->disk_cache_state < BTRFS_DC_CLEAR) 5412 cache->disk_cache_state = BTRFS_DC_CLEAR; 5413 5414 cache->dirty = 1; 5415 old_val = btrfs_block_group_used(&cache->item); 5416 num_bytes = min(total, cache->key.offset - byte_in_group); 5417 if (alloc) { 5418 old_val += num_bytes; 5419 btrfs_set_block_group_used(&cache->item, old_val); 5420 cache->reserved -= num_bytes; 5421 cache->space_info->bytes_reserved -= num_bytes; 5422 cache->space_info->bytes_used += num_bytes; 5423 cache->space_info->disk_used += num_bytes * factor; 5424 spin_unlock(&cache->lock); 5425 spin_unlock(&cache->space_info->lock); 5426 } else { 5427 old_val -= num_bytes; 5428 btrfs_set_block_group_used(&cache->item, old_val); 5429 cache->pinned += num_bytes; 5430 cache->space_info->bytes_pinned += num_bytes; 5431 cache->space_info->bytes_used -= num_bytes; 5432 cache->space_info->disk_used -= num_bytes * factor; 5433 spin_unlock(&cache->lock); 5434 spin_unlock(&cache->space_info->lock); 5435 5436 set_extent_dirty(info->pinned_extents, 5437 bytenr, bytenr + num_bytes - 1, 5438 GFP_NOFS | __GFP_NOFAIL); 5439 } 5440 btrfs_put_block_group(cache); 5441 total -= num_bytes; 5442 bytenr += num_bytes; 5443 } 5444 return 0; 5445 } 5446 5447 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5448 { 5449 struct btrfs_block_group_cache *cache; 5450 u64 bytenr; 5451 5452 spin_lock(&root->fs_info->block_group_cache_lock); 5453 bytenr = root->fs_info->first_logical_byte; 5454 spin_unlock(&root->fs_info->block_group_cache_lock); 5455 5456 if (bytenr < (u64)-1) 5457 return bytenr; 5458 5459 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5460 if (!cache) 5461 return 0; 5462 5463 bytenr = cache->key.objectid; 5464 btrfs_put_block_group(cache); 5465 5466 return bytenr; 5467 } 5468 5469 static int 
pin_down_extent(struct btrfs_root *root, 5470 struct btrfs_block_group_cache *cache, 5471 u64 bytenr, u64 num_bytes, int reserved) 5472 { 5473 spin_lock(&cache->space_info->lock); 5474 spin_lock(&cache->lock); 5475 cache->pinned += num_bytes; 5476 cache->space_info->bytes_pinned += num_bytes; 5477 if (reserved) { 5478 cache->reserved -= num_bytes; 5479 cache->space_info->bytes_reserved -= num_bytes; 5480 } 5481 spin_unlock(&cache->lock); 5482 spin_unlock(&cache->space_info->lock); 5483 5484 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5485 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5486 if (reserved) 5487 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); 5488 return 0; 5489 } 5490 5491 /* 5492 * this function must be called within transaction 5493 */ 5494 int btrfs_pin_extent(struct btrfs_root *root, 5495 u64 bytenr, u64 num_bytes, int reserved) 5496 { 5497 struct btrfs_block_group_cache *cache; 5498 5499 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5500 BUG_ON(!cache); /* Logic error */ 5501 5502 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5503 5504 btrfs_put_block_group(cache); 5505 return 0; 5506 } 5507 5508 /* 5509 * this function must be called within transaction 5510 */ 5511 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5512 u64 bytenr, u64 num_bytes) 5513 { 5514 struct btrfs_block_group_cache *cache; 5515 int ret; 5516 5517 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5518 if (!cache) 5519 return -EINVAL; 5520 5521 /* 5522 * pull in the free space cache (if any) so that our pin 5523 * removes the free space from the cache. We have load_only set 5524 * to one because the slow code to read in the free extents does check 5525 * the pinned extents. 5526 */ 5527 cache_block_group(cache, 1); 5528 5529 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5530 5531 /* remove us from the free space cache (if we're there at all) */ 5532 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5533 btrfs_put_block_group(cache); 5534 return ret; 5535 } 5536 5537 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 5538 { 5539 int ret; 5540 struct btrfs_block_group_cache *block_group; 5541 struct btrfs_caching_control *caching_ctl; 5542 5543 block_group = btrfs_lookup_block_group(root->fs_info, start); 5544 if (!block_group) 5545 return -EINVAL; 5546 5547 cache_block_group(block_group, 0); 5548 caching_ctl = get_caching_control(block_group); 5549 5550 if (!caching_ctl) { 5551 /* Logic error */ 5552 BUG_ON(!block_group_cache_done(block_group)); 5553 ret = btrfs_remove_free_space(block_group, start, num_bytes); 5554 } else { 5555 mutex_lock(&caching_ctl->mutex); 5556 5557 if (start >= caching_ctl->progress) { 5558 ret = add_excluded_extent(root, start, num_bytes); 5559 } else if (start + num_bytes <= caching_ctl->progress) { 5560 ret = btrfs_remove_free_space(block_group, 5561 start, num_bytes); 5562 } else { 5563 num_bytes = caching_ctl->progress - start; 5564 ret = btrfs_remove_free_space(block_group, 5565 start, num_bytes); 5566 if (ret) 5567 goto out_lock; 5568 5569 num_bytes = (start + num_bytes) - 5570 caching_ctl->progress; 5571 start = caching_ctl->progress; 5572 ret = add_excluded_extent(root, start, num_bytes); 5573 } 5574 out_lock: 5575 mutex_unlock(&caching_ctl->mutex); 5576 put_caching_control(caching_ctl); 5577 } 5578 btrfs_put_block_group(block_group); 5579 return ret; 5580 } 5581 5582 int btrfs_exclude_logged_extents(struct btrfs_root *log, 5583 struct extent_buffer *eb) 
5584 { 5585 struct btrfs_file_extent_item *item; 5586 struct btrfs_key key; 5587 int found_type; 5588 int i; 5589 5590 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 5591 return 0; 5592 5593 for (i = 0; i < btrfs_header_nritems(eb); i++) { 5594 btrfs_item_key_to_cpu(eb, &key, i); 5595 if (key.type != BTRFS_EXTENT_DATA_KEY) 5596 continue; 5597 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 5598 found_type = btrfs_file_extent_type(eb, item); 5599 if (found_type == BTRFS_FILE_EXTENT_INLINE) 5600 continue; 5601 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 5602 continue; 5603 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 5604 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 5605 __exclude_logged_extent(log, key.objectid, key.offset); 5606 } 5607 5608 return 0; 5609 } 5610 5611 /** 5612 * btrfs_update_reserved_bytes - update the block_group and space info counters 5613 * @cache: The cache we are manipulating 5614 * @num_bytes: The number of bytes in question 5615 * @reserve: One of the reservation enums 5616 * 5617 * This is called by the allocator when it reserves space, or by somebody who is 5618 * freeing space that was never actually used on disk. For example if you 5619 * reserve some space for a new leaf in transaction A and before transaction A 5620 * commits you free that leaf, you call this with reserve set to 0 in order to 5621 * clear the reservation. 5622 * 5623 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5624 * ENOSPC accounting. For data we handle the reservation through clearing the 5625 * delalloc bits in the io_tree. We have to do this since we could end up 5626 * allocating less disk space for the amount of data we have reserved in the 5627 * case of compression. 5628 * 5629 * If this is a reservation and the block group has become read only we cannot 5630 * make the reservation and return -EAGAIN, otherwise this function always 5631 * succeeds. 
5632 */ 5633 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5634 u64 num_bytes, int reserve) 5635 { 5636 struct btrfs_space_info *space_info = cache->space_info; 5637 int ret = 0; 5638 5639 spin_lock(&space_info->lock); 5640 spin_lock(&cache->lock); 5641 if (reserve != RESERVE_FREE) { 5642 if (cache->ro) { 5643 ret = -EAGAIN; 5644 } else { 5645 cache->reserved += num_bytes; 5646 space_info->bytes_reserved += num_bytes; 5647 if (reserve == RESERVE_ALLOC) { 5648 trace_btrfs_space_reservation(cache->fs_info, 5649 "space_info", space_info->flags, 5650 num_bytes, 0); 5651 space_info->bytes_may_use -= num_bytes; 5652 } 5653 } 5654 } else { 5655 if (cache->ro) 5656 space_info->bytes_readonly += num_bytes; 5657 cache->reserved -= num_bytes; 5658 space_info->bytes_reserved -= num_bytes; 5659 } 5660 spin_unlock(&cache->lock); 5661 spin_unlock(&space_info->lock); 5662 return ret; 5663 } 5664 5665 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5666 struct btrfs_root *root) 5667 { 5668 struct btrfs_fs_info *fs_info = root->fs_info; 5669 struct btrfs_caching_control *next; 5670 struct btrfs_caching_control *caching_ctl; 5671 struct btrfs_block_group_cache *cache; 5672 struct btrfs_space_info *space_info; 5673 5674 down_write(&fs_info->commit_root_sem); 5675 5676 list_for_each_entry_safe(caching_ctl, next, 5677 &fs_info->caching_block_groups, list) { 5678 cache = caching_ctl->block_group; 5679 if (block_group_cache_done(cache)) { 5680 cache->last_byte_to_unpin = (u64)-1; 5681 list_del_init(&caching_ctl->list); 5682 put_caching_control(caching_ctl); 5683 } else { 5684 cache->last_byte_to_unpin = caching_ctl->progress; 5685 } 5686 } 5687 5688 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5689 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5690 else 5691 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5692 5693 up_write(&fs_info->commit_root_sem); 5694 5695 list_for_each_entry_rcu(space_info, &fs_info->space_info, list) 5696 percpu_counter_set(&space_info->total_bytes_pinned, 0); 5697 5698 update_global_block_rsv(fs_info); 5699 } 5700 5701 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5702 { 5703 struct btrfs_fs_info *fs_info = root->fs_info; 5704 struct btrfs_block_group_cache *cache = NULL; 5705 struct btrfs_space_info *space_info; 5706 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5707 u64 len; 5708 bool readonly; 5709 5710 while (start <= end) { 5711 readonly = false; 5712 if (!cache || 5713 start >= cache->key.objectid + cache->key.offset) { 5714 if (cache) 5715 btrfs_put_block_group(cache); 5716 cache = btrfs_lookup_block_group(fs_info, start); 5717 BUG_ON(!cache); /* Logic error */ 5718 } 5719 5720 len = cache->key.objectid + cache->key.offset - start; 5721 len = min(len, end + 1 - start); 5722 5723 if (start < cache->last_byte_to_unpin) { 5724 len = min(len, cache->last_byte_to_unpin - start); 5725 btrfs_add_free_space(cache, start, len); 5726 } 5727 5728 start += len; 5729 space_info = cache->space_info; 5730 5731 spin_lock(&space_info->lock); 5732 spin_lock(&cache->lock); 5733 cache->pinned -= len; 5734 space_info->bytes_pinned -= len; 5735 if (cache->ro) { 5736 space_info->bytes_readonly += len; 5737 readonly = true; 5738 } 5739 spin_unlock(&cache->lock); 5740 if (!readonly && global_rsv->space_info == space_info) { 5741 spin_lock(&global_rsv->lock); 5742 if (!global_rsv->full) { 5743 len = min(len, global_rsv->size - 5744 global_rsv->reserved); 5745 global_rsv->reserved += 
len; 5746 space_info->bytes_may_use += len; 5747 if (global_rsv->reserved >= global_rsv->size) 5748 global_rsv->full = 1; 5749 } 5750 spin_unlock(&global_rsv->lock); 5751 } 5752 spin_unlock(&space_info->lock); 5753 } 5754 5755 if (cache) 5756 btrfs_put_block_group(cache); 5757 return 0; 5758 } 5759 5760 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5761 struct btrfs_root *root) 5762 { 5763 struct btrfs_fs_info *fs_info = root->fs_info; 5764 struct extent_io_tree *unpin; 5765 u64 start; 5766 u64 end; 5767 int ret; 5768 5769 if (trans->aborted) 5770 return 0; 5771 5772 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5773 unpin = &fs_info->freed_extents[1]; 5774 else 5775 unpin = &fs_info->freed_extents[0]; 5776 5777 while (1) { 5778 ret = find_first_extent_bit(unpin, 0, &start, &end, 5779 EXTENT_DIRTY, NULL); 5780 if (ret) 5781 break; 5782 5783 if (btrfs_test_opt(root, DISCARD)) 5784 ret = btrfs_discard_extent(root, start, 5785 end + 1 - start, NULL); 5786 5787 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5788 unpin_extent_range(root, start, end); 5789 cond_resched(); 5790 } 5791 5792 return 0; 5793 } 5794 5795 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 5796 u64 owner, u64 root_objectid) 5797 { 5798 struct btrfs_space_info *space_info; 5799 u64 flags; 5800 5801 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5802 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 5803 flags = BTRFS_BLOCK_GROUP_SYSTEM; 5804 else 5805 flags = BTRFS_BLOCK_GROUP_METADATA; 5806 } else { 5807 flags = BTRFS_BLOCK_GROUP_DATA; 5808 } 5809 5810 space_info = __find_space_info(fs_info, flags); 5811 BUG_ON(!space_info); /* Logic bug */ 5812 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 5813 } 5814 5815 5816 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5817 struct btrfs_root *root, 5818 u64 bytenr, u64 num_bytes, u64 parent, 5819 u64 root_objectid, u64 owner_objectid, 5820 u64 owner_offset, int refs_to_drop, 5821 struct btrfs_delayed_extent_op *extent_op, 5822 int no_quota) 5823 { 5824 struct btrfs_key key; 5825 struct btrfs_path *path; 5826 struct btrfs_fs_info *info = root->fs_info; 5827 struct btrfs_root *extent_root = info->extent_root; 5828 struct extent_buffer *leaf; 5829 struct btrfs_extent_item *ei; 5830 struct btrfs_extent_inline_ref *iref; 5831 int ret; 5832 int is_data; 5833 int extent_slot = 0; 5834 int found_extent = 0; 5835 int num_to_del = 1; 5836 u32 item_size; 5837 u64 refs; 5838 int last_ref = 0; 5839 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; 5840 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5841 SKINNY_METADATA); 5842 5843 if (!info->quota_enabled || !is_fstree(root_objectid)) 5844 no_quota = 1; 5845 5846 path = btrfs_alloc_path(); 5847 if (!path) 5848 return -ENOMEM; 5849 5850 path->reada = 1; 5851 path->leave_spinning = 1; 5852 5853 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 5854 BUG_ON(!is_data && refs_to_drop != 1); 5855 5856 if (is_data) 5857 skinny_metadata = 0; 5858 5859 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5860 bytenr, num_bytes, parent, 5861 root_objectid, owner_objectid, 5862 owner_offset); 5863 if (ret == 0) { 5864 extent_slot = path->slots[0]; 5865 while (extent_slot >= 0) { 5866 btrfs_item_key_to_cpu(path->nodes[0], &key, 5867 extent_slot); 5868 if (key.objectid != bytenr) 5869 break; 5870 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5871 key.offset == num_bytes) { 5872 found_extent = 1; 5873 break; 5874 } 5875 if (key.type == 
BTRFS_METADATA_ITEM_KEY && 5876 key.offset == owner_objectid) { 5877 found_extent = 1; 5878 break; 5879 } 5880 if (path->slots[0] - extent_slot > 5) 5881 break; 5882 extent_slot--; 5883 } 5884 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5885 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5886 if (found_extent && item_size < sizeof(*ei)) 5887 found_extent = 0; 5888 #endif 5889 if (!found_extent) { 5890 BUG_ON(iref); 5891 ret = remove_extent_backref(trans, extent_root, path, 5892 NULL, refs_to_drop, 5893 is_data, &last_ref); 5894 if (ret) { 5895 btrfs_abort_transaction(trans, extent_root, ret); 5896 goto out; 5897 } 5898 btrfs_release_path(path); 5899 path->leave_spinning = 1; 5900 5901 key.objectid = bytenr; 5902 key.type = BTRFS_EXTENT_ITEM_KEY; 5903 key.offset = num_bytes; 5904 5905 if (!is_data && skinny_metadata) { 5906 key.type = BTRFS_METADATA_ITEM_KEY; 5907 key.offset = owner_objectid; 5908 } 5909 5910 ret = btrfs_search_slot(trans, extent_root, 5911 &key, path, -1, 1); 5912 if (ret > 0 && skinny_metadata && path->slots[0]) { 5913 /* 5914 * Couldn't find our skinny metadata item, 5915 * see if we have ye olde extent item. 5916 */ 5917 path->slots[0]--; 5918 btrfs_item_key_to_cpu(path->nodes[0], &key, 5919 path->slots[0]); 5920 if (key.objectid == bytenr && 5921 key.type == BTRFS_EXTENT_ITEM_KEY && 5922 key.offset == num_bytes) 5923 ret = 0; 5924 } 5925 5926 if (ret > 0 && skinny_metadata) { 5927 skinny_metadata = false; 5928 key.objectid = bytenr; 5929 key.type = BTRFS_EXTENT_ITEM_KEY; 5930 key.offset = num_bytes; 5931 btrfs_release_path(path); 5932 ret = btrfs_search_slot(trans, extent_root, 5933 &key, path, -1, 1); 5934 } 5935 5936 if (ret) { 5937 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5938 ret, bytenr); 5939 if (ret > 0) 5940 btrfs_print_leaf(extent_root, 5941 path->nodes[0]); 5942 } 5943 if (ret < 0) { 5944 btrfs_abort_transaction(trans, extent_root, ret); 5945 goto out; 5946 } 5947 extent_slot = path->slots[0]; 5948 } 5949 } else if (WARN_ON(ret == -ENOENT)) { 5950 btrfs_print_leaf(extent_root, path->nodes[0]); 5951 btrfs_err(info, 5952 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5953 bytenr, parent, root_objectid, owner_objectid, 5954 owner_offset); 5955 btrfs_abort_transaction(trans, extent_root, ret); 5956 goto out; 5957 } else { 5958 btrfs_abort_transaction(trans, extent_root, ret); 5959 goto out; 5960 } 5961 5962 leaf = path->nodes[0]; 5963 item_size = btrfs_item_size_nr(leaf, extent_slot); 5964 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5965 if (item_size < sizeof(*ei)) { 5966 BUG_ON(found_extent || extent_slot != path->slots[0]); 5967 ret = convert_extent_item_v0(trans, extent_root, path, 5968 owner_objectid, 0); 5969 if (ret < 0) { 5970 btrfs_abort_transaction(trans, extent_root, ret); 5971 goto out; 5972 } 5973 5974 btrfs_release_path(path); 5975 path->leave_spinning = 1; 5976 5977 key.objectid = bytenr; 5978 key.type = BTRFS_EXTENT_ITEM_KEY; 5979 key.offset = num_bytes; 5980 5981 ret = btrfs_search_slot(trans, extent_root, &key, path, 5982 -1, 1); 5983 if (ret) { 5984 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5985 ret, bytenr); 5986 btrfs_print_leaf(extent_root, path->nodes[0]); 5987 } 5988 if (ret < 0) { 5989 btrfs_abort_transaction(trans, extent_root, ret); 5990 goto out; 5991 } 5992 5993 extent_slot = path->slots[0]; 5994 leaf = path->nodes[0]; 5995 item_size = btrfs_item_size_nr(leaf, extent_slot); 5996 } 5997 #endif 5998 BUG_ON(item_size < sizeof(*ei)); 5999 ei = 
btrfs_item_ptr(leaf, extent_slot, 6000 struct btrfs_extent_item); 6001 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6002 key.type == BTRFS_EXTENT_ITEM_KEY) { 6003 struct btrfs_tree_block_info *bi; 6004 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6005 bi = (struct btrfs_tree_block_info *)(ei + 1); 6006 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6007 } 6008 6009 refs = btrfs_extent_refs(leaf, ei); 6010 if (refs < refs_to_drop) { 6011 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 6012 "for bytenr %Lu", refs_to_drop, refs, bytenr); 6013 ret = -EINVAL; 6014 btrfs_abort_transaction(trans, extent_root, ret); 6015 goto out; 6016 } 6017 refs -= refs_to_drop; 6018 6019 if (refs > 0) { 6020 type = BTRFS_QGROUP_OPER_SUB_SHARED; 6021 if (extent_op) 6022 __run_delayed_extent_op(extent_op, leaf, ei); 6023 /* 6024 * In the case of inline back ref, reference count will 6025 * be updated by remove_extent_backref 6026 */ 6027 if (iref) { 6028 BUG_ON(!found_extent); 6029 } else { 6030 btrfs_set_extent_refs(leaf, ei, refs); 6031 btrfs_mark_buffer_dirty(leaf); 6032 } 6033 if (found_extent) { 6034 ret = remove_extent_backref(trans, extent_root, path, 6035 iref, refs_to_drop, 6036 is_data, &last_ref); 6037 if (ret) { 6038 btrfs_abort_transaction(trans, extent_root, ret); 6039 goto out; 6040 } 6041 } 6042 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 6043 root_objectid); 6044 } else { 6045 if (found_extent) { 6046 BUG_ON(is_data && refs_to_drop != 6047 extent_data_ref_count(root, path, iref)); 6048 if (iref) { 6049 BUG_ON(path->slots[0] != extent_slot); 6050 } else { 6051 BUG_ON(path->slots[0] != extent_slot + 1); 6052 path->slots[0] = extent_slot; 6053 num_to_del = 2; 6054 } 6055 } 6056 6057 last_ref = 1; 6058 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6059 num_to_del); 6060 if (ret) { 6061 btrfs_abort_transaction(trans, extent_root, ret); 6062 goto out; 6063 } 6064 btrfs_release_path(path); 6065 6066 if (is_data) { 6067 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 6068 if (ret) { 6069 btrfs_abort_transaction(trans, extent_root, ret); 6070 goto out; 6071 } 6072 } 6073 6074 ret = update_block_group(root, bytenr, num_bytes, 0); 6075 if (ret) { 6076 btrfs_abort_transaction(trans, extent_root, ret); 6077 goto out; 6078 } 6079 } 6080 btrfs_release_path(path); 6081 6082 /* Deal with the quota accounting */ 6083 if (!ret && last_ref && !no_quota) { 6084 int mod_seq = 0; 6085 6086 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && 6087 type == BTRFS_QGROUP_OPER_SUB_SHARED) 6088 mod_seq = 1; 6089 6090 ret = btrfs_qgroup_record_ref(trans, info, root_objectid, 6091 bytenr, num_bytes, type, 6092 mod_seq); 6093 } 6094 out: 6095 btrfs_free_path(path); 6096 return ret; 6097 } 6098 6099 /* 6100 * when we free an block, it is possible (and likely) that we free the last 6101 * delayed ref for that extent as well. This searches the delayed ref tree for 6102 * a given extent, and if there are no other delayed refs to be processed, it 6103 * removes it from the tree. 
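 *
 * Returns 1 if the head was removed while must_insert_reserved was still
 * set, in which case the caller is still responsible for freeing or
 * pinning the reserved extent itself; returns 0 otherwise.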
6104 */ 6105 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 6106 struct btrfs_root *root, u64 bytenr) 6107 { 6108 struct btrfs_delayed_ref_head *head; 6109 struct btrfs_delayed_ref_root *delayed_refs; 6110 int ret = 0; 6111 6112 delayed_refs = &trans->transaction->delayed_refs; 6113 spin_lock(&delayed_refs->lock); 6114 head = btrfs_find_delayed_ref_head(trans, bytenr); 6115 if (!head) 6116 goto out_delayed_unlock; 6117 6118 spin_lock(&head->lock); 6119 if (rb_first(&head->ref_root)) 6120 goto out; 6121 6122 if (head->extent_op) { 6123 if (!head->must_insert_reserved) 6124 goto out; 6125 btrfs_free_delayed_extent_op(head->extent_op); 6126 head->extent_op = NULL; 6127 } 6128 6129 /* 6130 * waiting for the lock here would deadlock. If someone else has it 6131 * locked they are already in the process of dropping it anyway 6132 */ 6133 if (!mutex_trylock(&head->mutex)) 6134 goto out; 6135 6136 /* 6137 * at this point we have a head with no other entries. Go 6138 * ahead and process it. 6139 */ 6140 head->node.in_tree = 0; 6141 rb_erase(&head->href_node, &delayed_refs->href_root); 6142 6143 atomic_dec(&delayed_refs->num_entries); 6144 6145 /* 6146 * we don't take a ref on the node because we're removing it from the 6147 * tree, so we just steal the ref the tree was holding. 6148 */ 6149 delayed_refs->num_heads--; 6150 if (head->processing == 0) 6151 delayed_refs->num_heads_ready--; 6152 head->processing = 0; 6153 spin_unlock(&head->lock); 6154 spin_unlock(&delayed_refs->lock); 6155 6156 BUG_ON(head->extent_op); 6157 if (head->must_insert_reserved) 6158 ret = 1; 6159 6160 mutex_unlock(&head->mutex); 6161 btrfs_put_delayed_ref(&head->node); 6162 return ret; 6163 out: 6164 spin_unlock(&head->lock); 6165 6166 out_delayed_unlock: 6167 spin_unlock(&delayed_refs->lock); 6168 return 0; 6169 } 6170 6171 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 6172 struct btrfs_root *root, 6173 struct extent_buffer *buf, 6174 u64 parent, int last_ref) 6175 { 6176 struct btrfs_block_group_cache *cache = NULL; 6177 int pin = 1; 6178 int ret; 6179 6180 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6181 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6182 buf->start, buf->len, 6183 parent, root->root_key.objectid, 6184 btrfs_header_level(buf), 6185 BTRFS_DROP_DELAYED_REF, NULL, 0); 6186 BUG_ON(ret); /* -ENOMEM */ 6187 } 6188 6189 if (!last_ref) 6190 return; 6191 6192 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 6193 6194 if (btrfs_header_generation(buf) == trans->transid) { 6195 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6196 ret = check_ref_cleanup(trans, root, buf->start); 6197 if (!ret) 6198 goto out; 6199 } 6200 6201 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6202 pin_down_extent(root, cache, buf->start, buf->len, 1); 6203 goto out; 6204 } 6205 6206 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6207 6208 btrfs_add_free_space(cache, buf->start, buf->len); 6209 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 6210 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6211 pin = 0; 6212 } 6213 out: 6214 if (pin) 6215 add_pinned_bytes(root->fs_info, buf->len, 6216 btrfs_header_level(buf), 6217 root->root_key.objectid); 6218 6219 /* 6220 * Deleting the buffer, clear the corrupt flag since it doesn't matter 6221 * anymore. 
6222 */ 6223 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6224 btrfs_put_block_group(cache); 6225 } 6226 6227 /* Can return -ENOMEM */ 6228 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6229 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6230 u64 owner, u64 offset, int no_quota) 6231 { 6232 int ret; 6233 struct btrfs_fs_info *fs_info = root->fs_info; 6234 6235 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6236 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) 6237 return 0; 6238 #endif 6239 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6240 6241 /* 6242 * tree log blocks never actually go into the extent allocation 6243 * tree, just update pinning info and exit early. 6244 */ 6245 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 6246 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 6247 /* unlocks the pinned mutex */ 6248 btrfs_pin_extent(root, bytenr, num_bytes, 1); 6249 ret = 0; 6250 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6251 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6252 num_bytes, 6253 parent, root_objectid, (int)owner, 6254 BTRFS_DROP_DELAYED_REF, NULL, no_quota); 6255 } else { 6256 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6257 num_bytes, 6258 parent, root_objectid, owner, 6259 offset, BTRFS_DROP_DELAYED_REF, 6260 NULL, no_quota); 6261 } 6262 return ret; 6263 } 6264 6265 static u64 stripe_align(struct btrfs_root *root, 6266 struct btrfs_block_group_cache *cache, 6267 u64 val, u64 num_bytes) 6268 { 6269 u64 ret = ALIGN(val, root->stripesize); 6270 return ret; 6271 } 6272 6273 /* 6274 * when we wait for progress in the block group caching, its because 6275 * our allocation attempt failed at least once. So, we must sleep 6276 * and let some progress happen before we try again. 6277 * 6278 * This function will sleep at least once waiting for new free space to 6279 * show up, and then it will check the block group free space numbers 6280 * for our min num_bytes. Another option is to have it go ahead 6281 * and look in the rbtree for a free extent of a given size, but this 6282 * is a good start. 6283 * 6284 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 6285 * any of the information in this block group. 6286 */ 6287 static noinline void 6288 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 6289 u64 num_bytes) 6290 { 6291 struct btrfs_caching_control *caching_ctl; 6292 6293 caching_ctl = get_caching_control(cache); 6294 if (!caching_ctl) 6295 return; 6296 6297 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 6298 (cache->free_space_ctl->free_space >= num_bytes)); 6299 6300 put_caching_control(caching_ctl); 6301 } 6302 6303 static noinline int 6304 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 6305 { 6306 struct btrfs_caching_control *caching_ctl; 6307 int ret = 0; 6308 6309 caching_ctl = get_caching_control(cache); 6310 if (!caching_ctl) 6311 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 6312 6313 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 6314 if (cache->cached == BTRFS_CACHE_ERROR) 6315 ret = -EIO; 6316 put_caching_control(caching_ctl); 6317 return ret; 6318 } 6319 6320 int __get_raid_index(u64 flags) 6321 { 6322 if (flags & BTRFS_BLOCK_GROUP_RAID10) 6323 return BTRFS_RAID_RAID10; 6324 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 6325 return BTRFS_RAID_RAID1; 6326 else if (flags & BTRFS_BLOCK_GROUP_DUP) 6327 return BTRFS_RAID_DUP; 6328 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 6329 return BTRFS_RAID_RAID0; 6330 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 6331 return BTRFS_RAID_RAID5; 6332 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 6333 return BTRFS_RAID_RAID6; 6334 6335 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 6336 } 6337 6338 int get_block_group_index(struct btrfs_block_group_cache *cache) 6339 { 6340 return __get_raid_index(cache->flags); 6341 } 6342 6343 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 6344 [BTRFS_RAID_RAID10] = "raid10", 6345 [BTRFS_RAID_RAID1] = "raid1", 6346 [BTRFS_RAID_DUP] = "dup", 6347 [BTRFS_RAID_RAID0] = "raid0", 6348 [BTRFS_RAID_SINGLE] = "single", 6349 [BTRFS_RAID_RAID5] = "raid5", 6350 [BTRFS_RAID_RAID6] = "raid6", 6351 }; 6352 6353 static const char *get_raid_name(enum btrfs_raid_types type) 6354 { 6355 if (type >= BTRFS_NR_RAID_TYPES) 6356 return NULL; 6357 6358 return btrfs_raid_type_names[type]; 6359 } 6360 6361 enum btrfs_loop_type { 6362 LOOP_CACHING_NOWAIT = 0, 6363 LOOP_CACHING_WAIT = 1, 6364 LOOP_ALLOC_CHUNK = 2, 6365 LOOP_NO_EMPTY_SIZE = 3, 6366 }; 6367 6368 /* 6369 * walks the btree of allocated extents and find a hole of a given size. 6370 * The key ins is changed to record the hole: 6371 * ins->objectid == start position 6372 * ins->flags = BTRFS_EXTENT_ITEM_KEY 6373 * ins->offset == the size of the hole. 6374 * Any available blocks before search_start are skipped. 6375 * 6376 * If there is no suitable free space, we will record the max size of 6377 * the free space extent currently. 6378 */ 6379 static noinline int find_free_extent(struct btrfs_root *orig_root, 6380 u64 num_bytes, u64 empty_size, 6381 u64 hint_byte, struct btrfs_key *ins, 6382 u64 flags) 6383 { 6384 int ret = 0; 6385 struct btrfs_root *root = orig_root->fs_info->extent_root; 6386 struct btrfs_free_cluster *last_ptr = NULL; 6387 struct btrfs_block_group_cache *block_group = NULL; 6388 u64 search_start = 0; 6389 u64 max_extent_size = 0; 6390 int empty_cluster = 2 * 1024 * 1024; 6391 struct btrfs_space_info *space_info; 6392 int loop = 0; 6393 int index = __get_raid_index(flags); 6394 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 6395 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 6396 bool failed_cluster_refill = false; 6397 bool failed_alloc = false; 6398 bool use_cluster = true; 6399 bool have_caching_bg = false; 6400 6401 WARN_ON(num_bytes < root->sectorsize); 6402 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 6403 ins->objectid = 0; 6404 ins->offset = 0; 6405 6406 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 6407 6408 space_info = __find_space_info(root->fs_info, flags); 6409 if (!space_info) { 6410 btrfs_err(root->fs_info, "No space info for %llu", flags); 6411 return -ENOSPC; 6412 } 6413 6414 /* 6415 * If the space info is for both data and metadata it means we have a 6416 * small filesystem and we can't use the clustering stuff. 
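 * Otherwise metadata allocations use the fs-wide meta_alloc_cluster
 * (with a larger empty_cluster on non-SSD mounts), and data allocations
 * use data_alloc_cluster only when the SSD option is set.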
6417 */ 6418 if (btrfs_mixed_space_info(space_info)) 6419 use_cluster = false; 6420 6421 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 6422 last_ptr = &root->fs_info->meta_alloc_cluster; 6423 if (!btrfs_test_opt(root, SSD)) 6424 empty_cluster = 64 * 1024; 6425 } 6426 6427 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 6428 btrfs_test_opt(root, SSD)) { 6429 last_ptr = &root->fs_info->data_alloc_cluster; 6430 } 6431 6432 if (last_ptr) { 6433 spin_lock(&last_ptr->lock); 6434 if (last_ptr->block_group) 6435 hint_byte = last_ptr->window_start; 6436 spin_unlock(&last_ptr->lock); 6437 } 6438 6439 search_start = max(search_start, first_logical_byte(root, 0)); 6440 search_start = max(search_start, hint_byte); 6441 6442 if (!last_ptr) 6443 empty_cluster = 0; 6444 6445 if (search_start == hint_byte) { 6446 block_group = btrfs_lookup_block_group(root->fs_info, 6447 search_start); 6448 /* 6449 * we don't want to use the block group if it doesn't match our 6450 * allocation bits, or if its not cached. 6451 * 6452 * However if we are re-searching with an ideal block group 6453 * picked out then we don't care that the block group is cached. 6454 */ 6455 if (block_group && block_group_bits(block_group, flags) && 6456 block_group->cached != BTRFS_CACHE_NO) { 6457 down_read(&space_info->groups_sem); 6458 if (list_empty(&block_group->list) || 6459 block_group->ro) { 6460 /* 6461 * someone is removing this block group, 6462 * we can't jump into the have_block_group 6463 * target because our list pointers are not 6464 * valid 6465 */ 6466 btrfs_put_block_group(block_group); 6467 up_read(&space_info->groups_sem); 6468 } else { 6469 index = get_block_group_index(block_group); 6470 goto have_block_group; 6471 } 6472 } else if (block_group) { 6473 btrfs_put_block_group(block_group); 6474 } 6475 } 6476 search: 6477 have_caching_bg = false; 6478 down_read(&space_info->groups_sem); 6479 list_for_each_entry(block_group, &space_info->block_groups[index], 6480 list) { 6481 u64 offset; 6482 int cached; 6483 6484 btrfs_get_block_group(block_group); 6485 search_start = block_group->key.objectid; 6486 6487 /* 6488 * this can happen if we end up cycling through all the 6489 * raid types, but we want to make sure we only allocate 6490 * for the proper type. 6491 */ 6492 if (!block_group_bits(block_group, flags)) { 6493 u64 extra = BTRFS_BLOCK_GROUP_DUP | 6494 BTRFS_BLOCK_GROUP_RAID1 | 6495 BTRFS_BLOCK_GROUP_RAID5 | 6496 BTRFS_BLOCK_GROUP_RAID6 | 6497 BTRFS_BLOCK_GROUP_RAID10; 6498 6499 /* 6500 * if they asked for extra copies and this block group 6501 * doesn't provide them, bail. This does allow us to 6502 * fill raid0 from raid1. 
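 * A request without any of these redundancy bits (e.g. raid0 or
 * single) is not rejected here and may be satisfied from a group
 * that does provide redundancy.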
6503 */ 6504 if ((flags & extra) && !(block_group->flags & extra)) 6505 goto loop; 6506 } 6507 6508 have_block_group: 6509 cached = block_group_cache_done(block_group); 6510 if (unlikely(!cached)) { 6511 ret = cache_block_group(block_group, 0); 6512 BUG_ON(ret < 0); 6513 ret = 0; 6514 } 6515 6516 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 6517 goto loop; 6518 if (unlikely(block_group->ro)) 6519 goto loop; 6520 6521 /* 6522 * Ok we want to try and use the cluster allocator, so 6523 * lets look there 6524 */ 6525 if (last_ptr) { 6526 struct btrfs_block_group_cache *used_block_group; 6527 unsigned long aligned_cluster; 6528 /* 6529 * the refill lock keeps out other 6530 * people trying to start a new cluster 6531 */ 6532 spin_lock(&last_ptr->refill_lock); 6533 used_block_group = last_ptr->block_group; 6534 if (used_block_group != block_group && 6535 (!used_block_group || 6536 used_block_group->ro || 6537 !block_group_bits(used_block_group, flags))) 6538 goto refill_cluster; 6539 6540 if (used_block_group != block_group) 6541 btrfs_get_block_group(used_block_group); 6542 6543 offset = btrfs_alloc_from_cluster(used_block_group, 6544 last_ptr, 6545 num_bytes, 6546 used_block_group->key.objectid, 6547 &max_extent_size); 6548 if (offset) { 6549 /* we have a block, we're done */ 6550 spin_unlock(&last_ptr->refill_lock); 6551 trace_btrfs_reserve_extent_cluster(root, 6552 used_block_group, 6553 search_start, num_bytes); 6554 if (used_block_group != block_group) { 6555 btrfs_put_block_group(block_group); 6556 block_group = used_block_group; 6557 } 6558 goto checks; 6559 } 6560 6561 WARN_ON(last_ptr->block_group != used_block_group); 6562 if (used_block_group != block_group) 6563 btrfs_put_block_group(used_block_group); 6564 refill_cluster: 6565 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6566 * set up a new clusters, so lets just skip it 6567 * and let the allocator find whatever block 6568 * it can find. If we reach this point, we 6569 * will have tried the cluster allocator 6570 * plenty of times and not have found 6571 * anything, so we are likely way too 6572 * fragmented for the clustering stuff to find 6573 * anything. 6574 * 6575 * However, if the cluster is taken from the 6576 * current block group, release the cluster 6577 * first, so that we stand a better chance of 6578 * succeeding in the unclustered 6579 * allocation. 
*/ 6580 if (loop >= LOOP_NO_EMPTY_SIZE && 6581 last_ptr->block_group != block_group) { 6582 spin_unlock(&last_ptr->refill_lock); 6583 goto unclustered_alloc; 6584 } 6585 6586 /* 6587 * this cluster didn't work out, free it and 6588 * start over 6589 */ 6590 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6591 6592 if (loop >= LOOP_NO_EMPTY_SIZE) { 6593 spin_unlock(&last_ptr->refill_lock); 6594 goto unclustered_alloc; 6595 } 6596 6597 aligned_cluster = max_t(unsigned long, 6598 empty_cluster + empty_size, 6599 block_group->full_stripe_len); 6600 6601 /* allocate a cluster in this block group */ 6602 ret = btrfs_find_space_cluster(root, block_group, 6603 last_ptr, search_start, 6604 num_bytes, 6605 aligned_cluster); 6606 if (ret == 0) { 6607 /* 6608 * now pull our allocation out of this 6609 * cluster 6610 */ 6611 offset = btrfs_alloc_from_cluster(block_group, 6612 last_ptr, 6613 num_bytes, 6614 search_start, 6615 &max_extent_size); 6616 if (offset) { 6617 /* we found one, proceed */ 6618 spin_unlock(&last_ptr->refill_lock); 6619 trace_btrfs_reserve_extent_cluster(root, 6620 block_group, search_start, 6621 num_bytes); 6622 goto checks; 6623 } 6624 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6625 && !failed_cluster_refill) { 6626 spin_unlock(&last_ptr->refill_lock); 6627 6628 failed_cluster_refill = true; 6629 wait_block_group_cache_progress(block_group, 6630 num_bytes + empty_cluster + empty_size); 6631 goto have_block_group; 6632 } 6633 6634 /* 6635 * at this point we either didn't find a cluster 6636 * or we weren't able to allocate a block from our 6637 * cluster. Free the cluster we've been trying 6638 * to use, and go to the next block group 6639 */ 6640 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6641 spin_unlock(&last_ptr->refill_lock); 6642 goto loop; 6643 } 6644 6645 unclustered_alloc: 6646 spin_lock(&block_group->free_space_ctl->tree_lock); 6647 if (cached && 6648 block_group->free_space_ctl->free_space < 6649 num_bytes + empty_cluster + empty_size) { 6650 if (block_group->free_space_ctl->free_space > 6651 max_extent_size) 6652 max_extent_size = 6653 block_group->free_space_ctl->free_space; 6654 spin_unlock(&block_group->free_space_ctl->tree_lock); 6655 goto loop; 6656 } 6657 spin_unlock(&block_group->free_space_ctl->tree_lock); 6658 6659 offset = btrfs_find_space_for_alloc(block_group, search_start, 6660 num_bytes, empty_size, 6661 &max_extent_size); 6662 /* 6663 * If we didn't find a chunk, and we haven't failed on this 6664 * block group before, and this block group is in the middle of 6665 * caching and we are ok with waiting, then go ahead and wait 6666 * for progress to be made, and set failed_alloc to true. 6667 * 6668 * If failed_alloc is true then we've already waited on this 6669 * block group once and should move on to the next block group. 
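 *
 * If the group is still caching but we are not going to wait on it,
 * have_caching_bg is set so a later pass of the search loop can come
 * back to it.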
6670 */ 6671 if (!offset && !failed_alloc && !cached && 6672 loop > LOOP_CACHING_NOWAIT) { 6673 wait_block_group_cache_progress(block_group, 6674 num_bytes + empty_size); 6675 failed_alloc = true; 6676 goto have_block_group; 6677 } else if (!offset) { 6678 if (!cached) 6679 have_caching_bg = true; 6680 goto loop; 6681 } 6682 checks: 6683 search_start = stripe_align(root, block_group, 6684 offset, num_bytes); 6685 6686 /* move on to the next group */ 6687 if (search_start + num_bytes > 6688 block_group->key.objectid + block_group->key.offset) { 6689 btrfs_add_free_space(block_group, offset, num_bytes); 6690 goto loop; 6691 } 6692 6693 if (offset < search_start) 6694 btrfs_add_free_space(block_group, offset, 6695 search_start - offset); 6696 BUG_ON(offset > search_start); 6697 6698 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 6699 alloc_type); 6700 if (ret == -EAGAIN) { 6701 btrfs_add_free_space(block_group, offset, num_bytes); 6702 goto loop; 6703 } 6704 6705 /* we are all good, lets return */ 6706 ins->objectid = search_start; 6707 ins->offset = num_bytes; 6708 6709 trace_btrfs_reserve_extent(orig_root, block_group, 6710 search_start, num_bytes); 6711 btrfs_put_block_group(block_group); 6712 break; 6713 loop: 6714 failed_cluster_refill = false; 6715 failed_alloc = false; 6716 BUG_ON(index != get_block_group_index(block_group)); 6717 btrfs_put_block_group(block_group); 6718 } 6719 up_read(&space_info->groups_sem); 6720 6721 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6722 goto search; 6723 6724 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6725 goto search; 6726 6727 /* 6728 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6729 * caching kthreads as we move along 6730 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6731 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6732 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6733 * again 6734 */ 6735 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6736 index = 0; 6737 loop++; 6738 if (loop == LOOP_ALLOC_CHUNK) { 6739 struct btrfs_trans_handle *trans; 6740 int exist = 0; 6741 6742 trans = current->journal_info; 6743 if (trans) 6744 exist = 1; 6745 else 6746 trans = btrfs_join_transaction(root); 6747 6748 if (IS_ERR(trans)) { 6749 ret = PTR_ERR(trans); 6750 goto out; 6751 } 6752 6753 ret = do_chunk_alloc(trans, root, flags, 6754 CHUNK_ALLOC_FORCE); 6755 /* 6756 * Do not bail out on ENOSPC since we 6757 * can do more things. 6758 */ 6759 if (ret < 0 && ret != -ENOSPC) 6760 btrfs_abort_transaction(trans, 6761 root, ret); 6762 else 6763 ret = 0; 6764 if (!exist) 6765 btrfs_end_transaction(trans, root); 6766 if (ret) 6767 goto out; 6768 } 6769 6770 if (loop == LOOP_NO_EMPTY_SIZE) { 6771 empty_size = 0; 6772 empty_cluster = 0; 6773 } 6774 6775 goto search; 6776 } else if (!ins->objectid) { 6777 ret = -ENOSPC; 6778 } else if (ins->objectid) { 6779 ret = 0; 6780 } 6781 out: 6782 if (ret == -ENOSPC) 6783 ins->offset = max_extent_size; 6784 return ret; 6785 } 6786 6787 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6788 int dump_block_groups) 6789 { 6790 struct btrfs_block_group_cache *cache; 6791 int index = 0; 6792 6793 spin_lock(&info->lock); 6794 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", 6795 info->flags, 6796 info->total_bytes - info->bytes_used - info->bytes_pinned - 6797 info->bytes_reserved - info->bytes_readonly, 6798 (info->full) ? 
"" : "not "); 6799 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " 6800 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6801 info->total_bytes, info->bytes_used, info->bytes_pinned, 6802 info->bytes_reserved, info->bytes_may_use, 6803 info->bytes_readonly); 6804 spin_unlock(&info->lock); 6805 6806 if (!dump_block_groups) 6807 return; 6808 6809 down_read(&info->groups_sem); 6810 again: 6811 list_for_each_entry(cache, &info->block_groups[index], list) { 6812 spin_lock(&cache->lock); 6813 printk(KERN_INFO "BTRFS: " 6814 "block group %llu has %llu bytes, " 6815 "%llu used %llu pinned %llu reserved %s\n", 6816 cache->key.objectid, cache->key.offset, 6817 btrfs_block_group_used(&cache->item), cache->pinned, 6818 cache->reserved, cache->ro ? "[readonly]" : ""); 6819 btrfs_dump_free_space(cache, bytes); 6820 spin_unlock(&cache->lock); 6821 } 6822 if (++index < BTRFS_NR_RAID_TYPES) 6823 goto again; 6824 up_read(&info->groups_sem); 6825 } 6826 6827 int btrfs_reserve_extent(struct btrfs_root *root, 6828 u64 num_bytes, u64 min_alloc_size, 6829 u64 empty_size, u64 hint_byte, 6830 struct btrfs_key *ins, int is_data) 6831 { 6832 bool final_tried = false; 6833 u64 flags; 6834 int ret; 6835 6836 flags = btrfs_get_alloc_profile(root, is_data); 6837 again: 6838 WARN_ON(num_bytes < root->sectorsize); 6839 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 6840 flags); 6841 6842 if (ret == -ENOSPC) { 6843 if (!final_tried && ins->offset) { 6844 num_bytes = min(num_bytes >> 1, ins->offset); 6845 num_bytes = round_down(num_bytes, root->sectorsize); 6846 num_bytes = max(num_bytes, min_alloc_size); 6847 if (num_bytes == min_alloc_size) 6848 final_tried = true; 6849 goto again; 6850 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6851 struct btrfs_space_info *sinfo; 6852 6853 sinfo = __find_space_info(root->fs_info, flags); 6854 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6855 flags, num_bytes); 6856 if (sinfo) 6857 dump_space_info(sinfo, num_bytes, 1); 6858 } 6859 } 6860 6861 return ret; 6862 } 6863 6864 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6865 u64 start, u64 len, int pin) 6866 { 6867 struct btrfs_block_group_cache *cache; 6868 int ret = 0; 6869 6870 cache = btrfs_lookup_block_group(root->fs_info, start); 6871 if (!cache) { 6872 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6873 start); 6874 return -ENOSPC; 6875 } 6876 6877 if (btrfs_test_opt(root, DISCARD)) 6878 ret = btrfs_discard_extent(root, start, len, NULL); 6879 6880 if (pin) 6881 pin_down_extent(root, cache, start, len, 1); 6882 else { 6883 btrfs_add_free_space(cache, start, len); 6884 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); 6885 } 6886 btrfs_put_block_group(cache); 6887 6888 trace_btrfs_reserved_extent_free(root, start, len); 6889 6890 return ret; 6891 } 6892 6893 int btrfs_free_reserved_extent(struct btrfs_root *root, 6894 u64 start, u64 len) 6895 { 6896 return __btrfs_free_reserved_extent(root, start, len, 0); 6897 } 6898 6899 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6900 u64 start, u64 len) 6901 { 6902 return __btrfs_free_reserved_extent(root, start, len, 1); 6903 } 6904 6905 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6906 struct btrfs_root *root, 6907 u64 parent, u64 root_objectid, 6908 u64 flags, u64 owner, u64 offset, 6909 struct btrfs_key *ins, int ref_mod) 6910 { 6911 int ret; 6912 struct btrfs_fs_info *fs_info = root->fs_info; 6913 struct btrfs_extent_item 
*extent_item; 6914 struct btrfs_extent_inline_ref *iref; 6915 struct btrfs_path *path; 6916 struct extent_buffer *leaf; 6917 int type; 6918 u32 size; 6919 6920 if (parent > 0) 6921 type = BTRFS_SHARED_DATA_REF_KEY; 6922 else 6923 type = BTRFS_EXTENT_DATA_REF_KEY; 6924 6925 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 6926 6927 path = btrfs_alloc_path(); 6928 if (!path) 6929 return -ENOMEM; 6930 6931 path->leave_spinning = 1; 6932 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 6933 ins, size); 6934 if (ret) { 6935 btrfs_free_path(path); 6936 return ret; 6937 } 6938 6939 leaf = path->nodes[0]; 6940 extent_item = btrfs_item_ptr(leaf, path->slots[0], 6941 struct btrfs_extent_item); 6942 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 6943 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 6944 btrfs_set_extent_flags(leaf, extent_item, 6945 flags | BTRFS_EXTENT_FLAG_DATA); 6946 6947 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 6948 btrfs_set_extent_inline_ref_type(leaf, iref, type); 6949 if (parent > 0) { 6950 struct btrfs_shared_data_ref *ref; 6951 ref = (struct btrfs_shared_data_ref *)(iref + 1); 6952 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 6953 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 6954 } else { 6955 struct btrfs_extent_data_ref *ref; 6956 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 6957 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 6958 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 6959 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 6960 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 6961 } 6962 6963 btrfs_mark_buffer_dirty(path->nodes[0]); 6964 btrfs_free_path(path); 6965 6966 /* Always set parent to 0 here since its exclusive anyway. 
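 * The reference is recorded as BTRFS_QGROUP_OPER_ADD_EXCL, i.e.
 * accounted as exclusive to root_objectid.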
*/ 6967 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 6968 ins->objectid, ins->offset, 6969 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 6970 if (ret) 6971 return ret; 6972 6973 ret = update_block_group(root, ins->objectid, ins->offset, 1); 6974 if (ret) { /* -ENOENT, logic error */ 6975 btrfs_err(fs_info, "update block group failed for %llu %llu", 6976 ins->objectid, ins->offset); 6977 BUG(); 6978 } 6979 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 6980 return ret; 6981 } 6982 6983 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 6984 struct btrfs_root *root, 6985 u64 parent, u64 root_objectid, 6986 u64 flags, struct btrfs_disk_key *key, 6987 int level, struct btrfs_key *ins, 6988 int no_quota) 6989 { 6990 int ret; 6991 struct btrfs_fs_info *fs_info = root->fs_info; 6992 struct btrfs_extent_item *extent_item; 6993 struct btrfs_tree_block_info *block_info; 6994 struct btrfs_extent_inline_ref *iref; 6995 struct btrfs_path *path; 6996 struct extent_buffer *leaf; 6997 u32 size = sizeof(*extent_item) + sizeof(*iref); 6998 u64 num_bytes = ins->offset; 6999 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7000 SKINNY_METADATA); 7001 7002 if (!skinny_metadata) 7003 size += sizeof(*block_info); 7004 7005 path = btrfs_alloc_path(); 7006 if (!path) { 7007 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7008 root->leafsize); 7009 return -ENOMEM; 7010 } 7011 7012 path->leave_spinning = 1; 7013 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7014 ins, size); 7015 if (ret) { 7016 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7017 root->leafsize); 7018 btrfs_free_path(path); 7019 return ret; 7020 } 7021 7022 leaf = path->nodes[0]; 7023 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7024 struct btrfs_extent_item); 7025 btrfs_set_extent_refs(leaf, extent_item, 1); 7026 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7027 btrfs_set_extent_flags(leaf, extent_item, 7028 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 7029 7030 if (skinny_metadata) { 7031 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7032 num_bytes = root->leafsize; 7033 } else { 7034 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7035 btrfs_set_tree_block_key(leaf, block_info, key); 7036 btrfs_set_tree_block_level(leaf, block_info, level); 7037 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 7038 } 7039 7040 if (parent > 0) { 7041 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 7042 btrfs_set_extent_inline_ref_type(leaf, iref, 7043 BTRFS_SHARED_BLOCK_REF_KEY); 7044 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7045 } else { 7046 btrfs_set_extent_inline_ref_type(leaf, iref, 7047 BTRFS_TREE_BLOCK_REF_KEY); 7048 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 7049 } 7050 7051 btrfs_mark_buffer_dirty(leaf); 7052 btrfs_free_path(path); 7053 7054 if (!no_quota) { 7055 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7056 ins->objectid, num_bytes, 7057 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7058 if (ret) 7059 return ret; 7060 } 7061 7062 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7063 if (ret) { /* -ENOENT, logic error */ 7064 btrfs_err(fs_info, "update block group failed for %llu %llu", 7065 ins->objectid, ins->offset); 7066 BUG(); 7067 } 7068 7069 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); 7070 return ret; 7071 } 7072 7073 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7074 struct btrfs_root *root, 
7075 u64 root_objectid, u64 owner, 7076 u64 offset, struct btrfs_key *ins) 7077 { 7078 int ret; 7079 7080 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 7081 7082 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 7083 ins->offset, 0, 7084 root_objectid, owner, offset, 7085 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 7086 return ret; 7087 } 7088 7089 /* 7090 * this is used by the tree logging recovery code. It records that 7091 * an extent has been allocated and makes sure to clear the free 7092 * space cache bits as well 7093 */ 7094 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 7095 struct btrfs_root *root, 7096 u64 root_objectid, u64 owner, u64 offset, 7097 struct btrfs_key *ins) 7098 { 7099 int ret; 7100 struct btrfs_block_group_cache *block_group; 7101 7102 /* 7103 * Mixed block groups will exclude before processing the log so we only 7104 * need to do the exlude dance if this fs isn't mixed. 7105 */ 7106 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { 7107 ret = __exclude_logged_extent(root, ins->objectid, ins->offset); 7108 if (ret) 7109 return ret; 7110 } 7111 7112 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 7113 if (!block_group) 7114 return -EINVAL; 7115 7116 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 7117 RESERVE_ALLOC_NO_ACCOUNT); 7118 BUG_ON(ret); /* logic error */ 7119 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 7120 0, owner, offset, ins, 1); 7121 btrfs_put_block_group(block_group); 7122 return ret; 7123 } 7124 7125 static struct extent_buffer * 7126 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 7127 u64 bytenr, u32 blocksize, int level) 7128 { 7129 struct extent_buffer *buf; 7130 7131 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 7132 if (!buf) 7133 return ERR_PTR(-ENOMEM); 7134 btrfs_set_header_generation(buf, trans->transid); 7135 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 7136 btrfs_tree_lock(buf); 7137 clean_tree_block(trans, root, buf); 7138 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 7139 7140 btrfs_set_lock_blocking(buf); 7141 btrfs_set_buffer_uptodate(buf); 7142 7143 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 7144 /* 7145 * we allow two log transactions at a time, use different 7146 * EXENT bit to differentiate dirty pages. 
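 * An even log_transid marks the pages with set_extent_dirty(), an odd
 * one with set_extent_new() on dirty_log_pages.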
7147 */ 7148 if (root->log_transid % 2 == 0) 7149 set_extent_dirty(&root->dirty_log_pages, buf->start, 7150 buf->start + buf->len - 1, GFP_NOFS); 7151 else 7152 set_extent_new(&root->dirty_log_pages, buf->start, 7153 buf->start + buf->len - 1, GFP_NOFS); 7154 } else { 7155 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7156 buf->start + buf->len - 1, GFP_NOFS); 7157 } 7158 trans->blocks_used++; 7159 /* this returns a buffer locked for blocking */ 7160 return buf; 7161 } 7162 7163 static struct btrfs_block_rsv * 7164 use_block_rsv(struct btrfs_trans_handle *trans, 7165 struct btrfs_root *root, u32 blocksize) 7166 { 7167 struct btrfs_block_rsv *block_rsv; 7168 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 7169 int ret; 7170 bool global_updated = false; 7171 7172 block_rsv = get_block_rsv(trans, root); 7173 7174 if (unlikely(block_rsv->size == 0)) 7175 goto try_reserve; 7176 again: 7177 ret = block_rsv_use_bytes(block_rsv, blocksize); 7178 if (!ret) 7179 return block_rsv; 7180 7181 if (block_rsv->failfast) 7182 return ERR_PTR(ret); 7183 7184 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 7185 global_updated = true; 7186 update_global_block_rsv(root->fs_info); 7187 goto again; 7188 } 7189 7190 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 7191 static DEFINE_RATELIMIT_STATE(_rs, 7192 DEFAULT_RATELIMIT_INTERVAL * 10, 7193 /*DEFAULT_RATELIMIT_BURST*/ 1); 7194 if (__ratelimit(&_rs)) 7195 WARN(1, KERN_DEBUG 7196 "BTRFS: block rsv returned %d\n", ret); 7197 } 7198 try_reserve: 7199 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 7200 BTRFS_RESERVE_NO_FLUSH); 7201 if (!ret) 7202 return block_rsv; 7203 /* 7204 * If we couldn't reserve metadata bytes try and use some from 7205 * the global reserve if its space type is the same as the global 7206 * reservation. 7207 */ 7208 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 7209 block_rsv->space_info == global_rsv->space_info) { 7210 ret = block_rsv_use_bytes(global_rsv, blocksize); 7211 if (!ret) 7212 return global_rsv; 7213 } 7214 return ERR_PTR(ret); 7215 } 7216 7217 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 7218 struct btrfs_block_rsv *block_rsv, u32 blocksize) 7219 { 7220 block_rsv_add_bytes(block_rsv, blocksize, 0); 7221 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 7222 } 7223 7224 /* 7225 * finds a free extent and does all the dirty work required for allocation 7226 * returns the key for the extent through ins, and a tree buffer for 7227 * the first block of the extent through buf. 7228 * 7229 * returns the tree buffer or NULL. 
7230 */ 7231 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 7232 struct btrfs_root *root, u32 blocksize, 7233 u64 parent, u64 root_objectid, 7234 struct btrfs_disk_key *key, int level, 7235 u64 hint, u64 empty_size) 7236 { 7237 struct btrfs_key ins; 7238 struct btrfs_block_rsv *block_rsv; 7239 struct extent_buffer *buf; 7240 u64 flags = 0; 7241 int ret; 7242 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7243 SKINNY_METADATA); 7244 7245 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7246 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { 7247 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7248 blocksize, level); 7249 if (!IS_ERR(buf)) 7250 root->alloc_bytenr += blocksize; 7251 return buf; 7252 } 7253 #endif 7254 block_rsv = use_block_rsv(trans, root, blocksize); 7255 if (IS_ERR(block_rsv)) 7256 return ERR_CAST(block_rsv); 7257 7258 ret = btrfs_reserve_extent(root, blocksize, blocksize, 7259 empty_size, hint, &ins, 0); 7260 if (ret) { 7261 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 7262 return ERR_PTR(ret); 7263 } 7264 7265 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 7266 blocksize, level); 7267 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7268 7269 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7270 if (parent == 0) 7271 parent = ins.objectid; 7272 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 7273 } else 7274 BUG_ON(parent > 0); 7275 7276 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 7277 struct btrfs_delayed_extent_op *extent_op; 7278 extent_op = btrfs_alloc_delayed_extent_op(); 7279 BUG_ON(!extent_op); /* -ENOMEM */ 7280 if (key) 7281 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 7282 else 7283 memset(&extent_op->key, 0, sizeof(extent_op->key)); 7284 extent_op->flags_to_set = flags; 7285 if (skinny_metadata) 7286 extent_op->update_key = 0; 7287 else 7288 extent_op->update_key = 1; 7289 extent_op->update_flags = 1; 7290 extent_op->is_data = 0; 7291 extent_op->level = level; 7292 7293 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 7294 ins.objectid, 7295 ins.offset, parent, root_objectid, 7296 level, BTRFS_ADD_DELAYED_EXTENT, 7297 extent_op, 0); 7298 BUG_ON(ret); /* -ENOMEM */ 7299 } 7300 return buf; 7301 } 7302 7303 struct walk_control { 7304 u64 refs[BTRFS_MAX_LEVEL]; 7305 u64 flags[BTRFS_MAX_LEVEL]; 7306 struct btrfs_key update_progress; 7307 int stage; 7308 int level; 7309 int shared_level; 7310 int update_ref; 7311 int keep_locks; 7312 int reada_slot; 7313 int reada_count; 7314 int for_reloc; 7315 }; 7316 7317 #define DROP_REFERENCE 1 7318 #define UPDATE_BACKREF 2 7319 7320 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 7321 struct btrfs_root *root, 7322 struct walk_control *wc, 7323 struct btrfs_path *path) 7324 { 7325 u64 bytenr; 7326 u64 generation; 7327 u64 refs; 7328 u64 flags; 7329 u32 nritems; 7330 u32 blocksize; 7331 struct btrfs_key key; 7332 struct extent_buffer *eb; 7333 int ret; 7334 int slot; 7335 int nread = 0; 7336 7337 if (path->slots[wc->level] < wc->reada_slot) { 7338 wc->reada_count = wc->reada_count * 2 / 3; 7339 wc->reada_count = max(wc->reada_count, 2); 7340 } else { 7341 wc->reada_count = wc->reada_count * 3 / 2; 7342 wc->reada_count = min_t(int, wc->reada_count, 7343 BTRFS_NODEPTRS_PER_BLOCK(root)); 7344 } 7345 7346 eb = path->nodes[wc->level]; 7347 nritems = btrfs_header_nritems(eb); 7348 blocksize = btrfs_level_size(root, wc->level - 1); 7349 7350 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7351 if (nread >= wc->reada_count) 7352 
break; 7353 7354 cond_resched(); 7355 bytenr = btrfs_node_blockptr(eb, slot); 7356 generation = btrfs_node_ptr_generation(eb, slot); 7357 7358 if (slot == path->slots[wc->level]) 7359 goto reada; 7360 7361 if (wc->stage == UPDATE_BACKREF && 7362 generation <= root->root_key.offset) 7363 continue; 7364 7365 /* We don't lock the tree block, it's OK to be racy here */ 7366 ret = btrfs_lookup_extent_info(trans, root, bytenr, 7367 wc->level - 1, 1, &refs, 7368 &flags); 7369 /* We don't care about errors in readahead. */ 7370 if (ret < 0) 7371 continue; 7372 BUG_ON(refs == 0); 7373 7374 if (wc->stage == DROP_REFERENCE) { 7375 if (refs == 1) 7376 goto reada; 7377 7378 if (wc->level == 1 && 7379 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7380 continue; 7381 if (!wc->update_ref || 7382 generation <= root->root_key.offset) 7383 continue; 7384 btrfs_node_key_to_cpu(eb, &key, slot); 7385 ret = btrfs_comp_cpu_keys(&key, 7386 &wc->update_progress); 7387 if (ret < 0) 7388 continue; 7389 } else { 7390 if (wc->level == 1 && 7391 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7392 continue; 7393 } 7394 reada: 7395 ret = readahead_tree_block(root, bytenr, blocksize, 7396 generation); 7397 if (ret) 7398 break; 7399 nread++; 7400 } 7401 wc->reada_slot = slot; 7402 } 7403 7404 /* 7405 * helper to process tree block while walking down the tree. 7406 * 7407 * when wc->stage == UPDATE_BACKREF, this function updates 7408 * back refs for pointers in the block. 7409 * 7410 * NOTE: return value 1 means we should stop walking down. 7411 */ 7412 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 7413 struct btrfs_root *root, 7414 struct btrfs_path *path, 7415 struct walk_control *wc, int lookup_info) 7416 { 7417 int level = wc->level; 7418 struct extent_buffer *eb = path->nodes[level]; 7419 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7420 int ret; 7421 7422 if (wc->stage == UPDATE_BACKREF && 7423 btrfs_header_owner(eb) != root->root_key.objectid) 7424 return 1; 7425 7426 /* 7427 * when reference count of tree block is 1, it won't increase 7428 * again. once full backref flag is set, we never clear it. 
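 * That is why wc->refs/wc->flags are only re-read below while they
 * could still change the decision: refs != 1 when dropping, or the
 * full backref flag not yet set when updating backrefs.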
7429 */ 7430 if (lookup_info && 7431 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 7432 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 7433 BUG_ON(!path->locks[level]); 7434 ret = btrfs_lookup_extent_info(trans, root, 7435 eb->start, level, 1, 7436 &wc->refs[level], 7437 &wc->flags[level]); 7438 BUG_ON(ret == -ENOMEM); 7439 if (ret) 7440 return ret; 7441 BUG_ON(wc->refs[level] == 0); 7442 } 7443 7444 if (wc->stage == DROP_REFERENCE) { 7445 if (wc->refs[level] > 1) 7446 return 1; 7447 7448 if (path->locks[level] && !wc->keep_locks) { 7449 btrfs_tree_unlock_rw(eb, path->locks[level]); 7450 path->locks[level] = 0; 7451 } 7452 return 0; 7453 } 7454 7455 /* wc->stage == UPDATE_BACKREF */ 7456 if (!(wc->flags[level] & flag)) { 7457 BUG_ON(!path->locks[level]); 7458 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 7459 BUG_ON(ret); /* -ENOMEM */ 7460 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 7461 BUG_ON(ret); /* -ENOMEM */ 7462 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7463 eb->len, flag, 7464 btrfs_header_level(eb), 0); 7465 BUG_ON(ret); /* -ENOMEM */ 7466 wc->flags[level] |= flag; 7467 } 7468 7469 /* 7470 * the block is shared by multiple trees, so it's not good to 7471 * keep the tree lock 7472 */ 7473 if (path->locks[level] && level > 0) { 7474 btrfs_tree_unlock_rw(eb, path->locks[level]); 7475 path->locks[level] = 0; 7476 } 7477 return 0; 7478 } 7479 7480 /* 7481 * helper to process tree block pointer. 7482 * 7483 * when wc->stage == DROP_REFERENCE, this function checks 7484 * reference count of the block pointed to. if the block 7485 * is shared and we need update back refs for the subtree 7486 * rooted at the block, this function changes wc->stage to 7487 * UPDATE_BACKREF. if the block is shared and there is no 7488 * need to update back, this function drops the reference 7489 * to the block. 7490 * 7491 * NOTE: return value 1 means we should stop walking down. 
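 * The caller (walk_down_tree) reacts to a return value of 1 by moving
 * on to the next slot in the parent node.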
7492 */ 7493 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 7494 struct btrfs_root *root, 7495 struct btrfs_path *path, 7496 struct walk_control *wc, int *lookup_info) 7497 { 7498 u64 bytenr; 7499 u64 generation; 7500 u64 parent; 7501 u32 blocksize; 7502 struct btrfs_key key; 7503 struct extent_buffer *next; 7504 int level = wc->level; 7505 int reada = 0; 7506 int ret = 0; 7507 7508 generation = btrfs_node_ptr_generation(path->nodes[level], 7509 path->slots[level]); 7510 /* 7511 * if the lower level block was created before the snapshot 7512 * was created, we know there is no need to update back refs 7513 * for the subtree 7514 */ 7515 if (wc->stage == UPDATE_BACKREF && 7516 generation <= root->root_key.offset) { 7517 *lookup_info = 1; 7518 return 1; 7519 } 7520 7521 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7522 blocksize = btrfs_level_size(root, level - 1); 7523 7524 next = btrfs_find_tree_block(root, bytenr, blocksize); 7525 if (!next) { 7526 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7527 if (!next) 7528 return -ENOMEM; 7529 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7530 level - 1); 7531 reada = 1; 7532 } 7533 btrfs_tree_lock(next); 7534 btrfs_set_lock_blocking(next); 7535 7536 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7537 &wc->refs[level - 1], 7538 &wc->flags[level - 1]); 7539 if (ret < 0) { 7540 btrfs_tree_unlock(next); 7541 return ret; 7542 } 7543 7544 if (unlikely(wc->refs[level - 1] == 0)) { 7545 btrfs_err(root->fs_info, "Missing references."); 7546 BUG(); 7547 } 7548 *lookup_info = 0; 7549 7550 if (wc->stage == DROP_REFERENCE) { 7551 if (wc->refs[level - 1] > 1) { 7552 if (level == 1 && 7553 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7554 goto skip; 7555 7556 if (!wc->update_ref || 7557 generation <= root->root_key.offset) 7558 goto skip; 7559 7560 btrfs_node_key_to_cpu(path->nodes[level], &key, 7561 path->slots[level]); 7562 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7563 if (ret < 0) 7564 goto skip; 7565 7566 wc->stage = UPDATE_BACKREF; 7567 wc->shared_level = level - 1; 7568 } 7569 } else { 7570 if (level == 1 && 7571 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7572 goto skip; 7573 } 7574 7575 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7576 btrfs_tree_unlock(next); 7577 free_extent_buffer(next); 7578 next = NULL; 7579 *lookup_info = 1; 7580 } 7581 7582 if (!next) { 7583 if (reada && level == 1) 7584 reada_walk_down(trans, root, wc, path); 7585 next = read_tree_block(root, bytenr, blocksize, generation); 7586 if (!next || !extent_buffer_uptodate(next)) { 7587 free_extent_buffer(next); 7588 return -EIO; 7589 } 7590 btrfs_tree_lock(next); 7591 btrfs_set_lock_blocking(next); 7592 } 7593 7594 level--; 7595 BUG_ON(level != btrfs_header_level(next)); 7596 path->nodes[level] = next; 7597 path->slots[level] = 0; 7598 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7599 wc->level = level; 7600 if (wc->level == 1) 7601 wc->reada_slot = 0; 7602 return 0; 7603 skip: 7604 wc->refs[level - 1] = 0; 7605 wc->flags[level - 1] = 0; 7606 if (wc->stage == DROP_REFERENCE) { 7607 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7608 parent = path->nodes[level]->start; 7609 } else { 7610 BUG_ON(root->root_key.objectid != 7611 btrfs_header_owner(path->nodes[level])); 7612 parent = 0; 7613 } 7614 7615 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7616 root->root_key.objectid, level - 1, 0, 0); 7617 BUG_ON(ret); /* -ENOMEM */ 7618 } 7619 
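	/*
	 * Subtree skipped: release the child buffer and force a fresh
	 * ref/flag lookup on the next walk_down_proc() pass.
	 */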
btrfs_tree_unlock(next); 7620 free_extent_buffer(next); 7621 *lookup_info = 1; 7622 return 1; 7623 } 7624 7625 /* 7626 * helper to process tree block while walking up the tree. 7627 * 7628 * when wc->stage == DROP_REFERENCE, this function drops 7629 * reference count on the block. 7630 * 7631 * when wc->stage == UPDATE_BACKREF, this function changes 7632 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7633 * to UPDATE_BACKREF previously while processing the block. 7634 * 7635 * NOTE: return value 1 means we should stop walking up. 7636 */ 7637 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7638 struct btrfs_root *root, 7639 struct btrfs_path *path, 7640 struct walk_control *wc) 7641 { 7642 int ret; 7643 int level = wc->level; 7644 struct extent_buffer *eb = path->nodes[level]; 7645 u64 parent = 0; 7646 7647 if (wc->stage == UPDATE_BACKREF) { 7648 BUG_ON(wc->shared_level < level); 7649 if (level < wc->shared_level) 7650 goto out; 7651 7652 ret = find_next_key(path, level + 1, &wc->update_progress); 7653 if (ret > 0) 7654 wc->update_ref = 0; 7655 7656 wc->stage = DROP_REFERENCE; 7657 wc->shared_level = -1; 7658 path->slots[level] = 0; 7659 7660 /* 7661 * check reference count again if the block isn't locked. 7662 * we should start walking down the tree again if reference 7663 * count is one. 7664 */ 7665 if (!path->locks[level]) { 7666 BUG_ON(level == 0); 7667 btrfs_tree_lock(eb); 7668 btrfs_set_lock_blocking(eb); 7669 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7670 7671 ret = btrfs_lookup_extent_info(trans, root, 7672 eb->start, level, 1, 7673 &wc->refs[level], 7674 &wc->flags[level]); 7675 if (ret < 0) { 7676 btrfs_tree_unlock_rw(eb, path->locks[level]); 7677 path->locks[level] = 0; 7678 return ret; 7679 } 7680 BUG_ON(wc->refs[level] == 0); 7681 if (wc->refs[level] == 1) { 7682 btrfs_tree_unlock_rw(eb, path->locks[level]); 7683 path->locks[level] = 0; 7684 return 1; 7685 } 7686 } 7687 } 7688 7689 /* wc->stage == DROP_REFERENCE */ 7690 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7691 7692 if (wc->refs[level] == 1) { 7693 if (level == 0) { 7694 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7695 ret = btrfs_dec_ref(trans, root, eb, 1, 7696 wc->for_reloc); 7697 else 7698 ret = btrfs_dec_ref(trans, root, eb, 0, 7699 wc->for_reloc); 7700 BUG_ON(ret); /* -ENOMEM */ 7701 } 7702 /* make block locked assertion in clean_tree_block happy */ 7703 if (!path->locks[level] && 7704 btrfs_header_generation(eb) == trans->transid) { 7705 btrfs_tree_lock(eb); 7706 btrfs_set_lock_blocking(eb); 7707 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7708 } 7709 clean_tree_block(trans, root, eb); 7710 } 7711 7712 if (eb == root->node) { 7713 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7714 parent = eb->start; 7715 else 7716 BUG_ON(root->root_key.objectid != 7717 btrfs_header_owner(eb)); 7718 } else { 7719 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7720 parent = path->nodes[level + 1]->start; 7721 else 7722 BUG_ON(root->root_key.objectid != 7723 btrfs_header_owner(path->nodes[level + 1])); 7724 } 7725 7726 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 7727 out: 7728 wc->refs[level] = 0; 7729 wc->flags[level] = 0; 7730 return 0; 7731 } 7732 7733 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 7734 struct btrfs_root *root, 7735 struct btrfs_path *path, 7736 struct walk_control *wc) 7737 { 7738 int level = wc->level; 7739 int lookup_info = 1; 7740 int ret; 7741 7742 while (level >= 0) { 7743 ret = 
walk_down_proc(trans, root, path, wc, lookup_info);
7744 if (ret > 0)
7745 break;
7746
7747 if (level == 0)
7748 break;
7749
7750 if (path->slots[level] >=
7751 btrfs_header_nritems(path->nodes[level]))
7752 break;
7753
7754 ret = do_walk_down(trans, root, path, wc, &lookup_info);
7755 if (ret > 0) {
7756 path->slots[level]++;
7757 continue;
7758 } else if (ret < 0)
7759 return ret;
7760 level = wc->level;
7761 }
7762 return 0;
7763 }
7764
7765 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
7766 struct btrfs_root *root,
7767 struct btrfs_path *path,
7768 struct walk_control *wc, int max_level)
7769 {
7770 int level = wc->level;
7771 int ret;
7772
7773 path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7774 while (level < max_level && path->nodes[level]) {
7775 wc->level = level;
7776 if (path->slots[level] + 1 <
7777 btrfs_header_nritems(path->nodes[level])) {
7778 path->slots[level]++;
7779 return 0;
7780 } else {
7781 ret = walk_up_proc(trans, root, path, wc);
7782 if (ret > 0)
7783 return 0;
7784
7785 if (path->locks[level]) {
7786 btrfs_tree_unlock_rw(path->nodes[level],
7787 path->locks[level]);
7788 path->locks[level] = 0;
7789 }
7790 free_extent_buffer(path->nodes[level]);
7791 path->nodes[level] = NULL;
7792 level++;
7793 }
7794 }
7795 return 1;
7796 }
7797
7798 /*
7799 * drop a subvolume tree.
7800 *
7801 * this function traverses the tree freeing any blocks that are only
7802 * referenced by the tree.
7803 *
7804 * when a shared tree block is found, this function decreases its
7805 * reference count by one. if update_ref is true, this function
7806 * also makes sure backrefs for the shared block and all lower level
7807 * blocks are properly updated.
7808 *
7809 * If called with for_reloc == 0, may exit early with -EAGAIN
7810 */
7811 int btrfs_drop_snapshot(struct btrfs_root *root,
7812 struct btrfs_block_rsv *block_rsv, int update_ref,
7813 int for_reloc)
7814 {
7815 struct btrfs_path *path;
7816 struct btrfs_trans_handle *trans;
7817 struct btrfs_root *tree_root = root->fs_info->tree_root;
7818 struct btrfs_root_item *root_item = &root->root_item;
7819 struct walk_control *wc;
7820 struct btrfs_key key;
7821 int err = 0;
7822 int ret;
7823 int level;
7824 bool root_dropped = false;
7825
7826 path = btrfs_alloc_path();
7827 if (!path) {
7828 err = -ENOMEM;
7829 goto out;
7830 }
7831
7832 wc = kzalloc(sizeof(*wc), GFP_NOFS);
7833 if (!wc) {
7834 btrfs_free_path(path);
7835 err = -ENOMEM;
7836 goto out;
7837 }
7838
7839 trans = btrfs_start_transaction(tree_root, 0);
7840 if (IS_ERR(trans)) {
7841 err = PTR_ERR(trans);
7842 goto out_free;
7843 }
7844
7845 if (block_rsv)
7846 trans->block_rsv = block_rsv;
7847
7848 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7849 level = btrfs_header_level(root->node);
7850 path->nodes[level] = btrfs_lock_root_node(root);
7851 btrfs_set_lock_blocking(path->nodes[level]);
7852 path->slots[level] = 0;
7853 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7854 memset(&wc->update_progress, 0,
7855 sizeof(wc->update_progress));
7856 } else {
7857 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7858 memcpy(&wc->update_progress, &key,
7859 sizeof(wc->update_progress));
7860
7861 level = root_item->drop_level;
7862 BUG_ON(level == 0);
7863 path->lowest_level = level;
7864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7865 path->lowest_level = 0;
7866 if (ret < 0) {
7867 err = ret;
7868 goto out_end_trans;
7869 }
7870 WARN_ON(ret > 0);
7871
7872 /*
7873 * unlock our path, this is safe because
only this 7874 * function is allowed to delete this snapshot 7875 */ 7876 btrfs_unlock_up_safe(path, 0); 7877 7878 level = btrfs_header_level(root->node); 7879 while (1) { 7880 btrfs_tree_lock(path->nodes[level]); 7881 btrfs_set_lock_blocking(path->nodes[level]); 7882 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7883 7884 ret = btrfs_lookup_extent_info(trans, root, 7885 path->nodes[level]->start, 7886 level, 1, &wc->refs[level], 7887 &wc->flags[level]); 7888 if (ret < 0) { 7889 err = ret; 7890 goto out_end_trans; 7891 } 7892 BUG_ON(wc->refs[level] == 0); 7893 7894 if (level == root_item->drop_level) 7895 break; 7896 7897 btrfs_tree_unlock(path->nodes[level]); 7898 path->locks[level] = 0; 7899 WARN_ON(wc->refs[level] != 1); 7900 level--; 7901 } 7902 } 7903 7904 wc->level = level; 7905 wc->shared_level = -1; 7906 wc->stage = DROP_REFERENCE; 7907 wc->update_ref = update_ref; 7908 wc->keep_locks = 0; 7909 wc->for_reloc = for_reloc; 7910 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7911 7912 while (1) { 7913 7914 ret = walk_down_tree(trans, root, path, wc); 7915 if (ret < 0) { 7916 err = ret; 7917 break; 7918 } 7919 7920 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 7921 if (ret < 0) { 7922 err = ret; 7923 break; 7924 } 7925 7926 if (ret > 0) { 7927 BUG_ON(wc->stage != DROP_REFERENCE); 7928 break; 7929 } 7930 7931 if (wc->stage == DROP_REFERENCE) { 7932 level = wc->level; 7933 btrfs_node_key(path->nodes[level], 7934 &root_item->drop_progress, 7935 path->slots[level]); 7936 root_item->drop_level = level; 7937 } 7938 7939 BUG_ON(wc->level == 0); 7940 if (btrfs_should_end_transaction(trans, tree_root) || 7941 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 7942 ret = btrfs_update_root(trans, tree_root, 7943 &root->root_key, 7944 root_item); 7945 if (ret) { 7946 btrfs_abort_transaction(trans, tree_root, ret); 7947 err = ret; 7948 goto out_end_trans; 7949 } 7950 7951 btrfs_end_transaction_throttle(trans, tree_root); 7952 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 7953 pr_debug("BTRFS: drop snapshot early exit\n"); 7954 err = -EAGAIN; 7955 goto out_free; 7956 } 7957 7958 trans = btrfs_start_transaction(tree_root, 0); 7959 if (IS_ERR(trans)) { 7960 err = PTR_ERR(trans); 7961 goto out_free; 7962 } 7963 if (block_rsv) 7964 trans->block_rsv = block_rsv; 7965 } 7966 } 7967 btrfs_release_path(path); 7968 if (err) 7969 goto out_end_trans; 7970 7971 ret = btrfs_del_root(trans, tree_root, &root->root_key); 7972 if (ret) { 7973 btrfs_abort_transaction(trans, tree_root, ret); 7974 goto out_end_trans; 7975 } 7976 7977 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7978 ret = btrfs_find_root(tree_root, &root->root_key, path, 7979 NULL, NULL); 7980 if (ret < 0) { 7981 btrfs_abort_transaction(trans, tree_root, ret); 7982 err = ret; 7983 goto out_end_trans; 7984 } else if (ret > 0) { 7985 /* if we fail to delete the orphan item this time 7986 * around, it'll get picked up the next time. 7987 * 7988 * The most common failure here is just -ENOENT. 
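* That is why the return value of btrfs_del_orphan_item() below is
* deliberately ignored.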
7989 */ 7990 btrfs_del_orphan_item(trans, tree_root, 7991 root->root_key.objectid); 7992 } 7993 } 7994 7995 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 7996 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 7997 } else { 7998 free_extent_buffer(root->node); 7999 free_extent_buffer(root->commit_root); 8000 btrfs_put_fs_root(root); 8001 } 8002 root_dropped = true; 8003 out_end_trans: 8004 btrfs_end_transaction_throttle(trans, tree_root); 8005 out_free: 8006 kfree(wc); 8007 btrfs_free_path(path); 8008 out: 8009 /* 8010 * So if we need to stop dropping the snapshot for whatever reason we 8011 * need to make sure to add it back to the dead root list so that we 8012 * keep trying to do the work later. This also cleans up roots if we 8013 * don't have it in the radix (like when we recover after a power fail 8014 * or unmount) so we don't leak memory. 8015 */ 8016 if (!for_reloc && root_dropped == false) 8017 btrfs_add_dead_root(root); 8018 if (err && err != -EAGAIN) 8019 btrfs_std_error(root->fs_info, err); 8020 return err; 8021 } 8022 8023 /* 8024 * drop subtree rooted at tree block 'node'. 8025 * 8026 * NOTE: this function will unlock and release tree block 'node' 8027 * only used by relocation code 8028 */ 8029 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 8030 struct btrfs_root *root, 8031 struct extent_buffer *node, 8032 struct extent_buffer *parent) 8033 { 8034 struct btrfs_path *path; 8035 struct walk_control *wc; 8036 int level; 8037 int parent_level; 8038 int ret = 0; 8039 int wret; 8040 8041 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 8042 8043 path = btrfs_alloc_path(); 8044 if (!path) 8045 return -ENOMEM; 8046 8047 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8048 if (!wc) { 8049 btrfs_free_path(path); 8050 return -ENOMEM; 8051 } 8052 8053 btrfs_assert_tree_locked(parent); 8054 parent_level = btrfs_header_level(parent); 8055 extent_buffer_get(parent); 8056 path->nodes[parent_level] = parent; 8057 path->slots[parent_level] = btrfs_header_nritems(parent); 8058 8059 btrfs_assert_tree_locked(node); 8060 level = btrfs_header_level(node); 8061 path->nodes[level] = node; 8062 path->slots[level] = 0; 8063 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8064 8065 wc->refs[parent_level] = 1; 8066 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8067 wc->level = level; 8068 wc->shared_level = -1; 8069 wc->stage = DROP_REFERENCE; 8070 wc->update_ref = 0; 8071 wc->keep_locks = 1; 8072 wc->for_reloc = 1; 8073 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8074 8075 while (1) { 8076 wret = walk_down_tree(trans, root, path, wc); 8077 if (wret < 0) { 8078 ret = wret; 8079 break; 8080 } 8081 8082 wret = walk_up_tree(trans, root, path, wc, parent_level); 8083 if (wret < 0) 8084 ret = wret; 8085 if (wret != 0) 8086 break; 8087 } 8088 8089 kfree(wc); 8090 btrfs_free_path(path); 8091 return ret; 8092 } 8093 8094 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 8095 { 8096 u64 num_devices; 8097 u64 stripped; 8098 8099 /* 8100 * if restripe for this chunk_type is on pick target profile and 8101 * return, otherwise do the usual balance 8102 */ 8103 stripped = get_restripe_target(root->fs_info, flags); 8104 if (stripped) 8105 return extended_to_chunk(stripped); 8106 8107 /* 8108 * we add in the count of missing devices because we want 8109 * to make sure that any RAID levels on a degraded FS 8110 * continue to be honored. 
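* (e.g. a two-device raid1 with one device missing still counts as two
* devices here, so its block groups are not converted to DUP)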
8111 */ 8112 num_devices = root->fs_info->fs_devices->rw_devices + 8113 root->fs_info->fs_devices->missing_devices; 8114 8115 stripped = BTRFS_BLOCK_GROUP_RAID0 | 8116 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 8117 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 8118 8119 if (num_devices == 1) { 8120 stripped |= BTRFS_BLOCK_GROUP_DUP; 8121 stripped = flags & ~stripped; 8122 8123 /* turn raid0 into single device chunks */ 8124 if (flags & BTRFS_BLOCK_GROUP_RAID0) 8125 return stripped; 8126 8127 /* turn mirroring into duplication */ 8128 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 8129 BTRFS_BLOCK_GROUP_RAID10)) 8130 return stripped | BTRFS_BLOCK_GROUP_DUP; 8131 } else { 8132 /* they already had raid on here, just return */ 8133 if (flags & stripped) 8134 return flags; 8135 8136 stripped |= BTRFS_BLOCK_GROUP_DUP; 8137 stripped = flags & ~stripped; 8138 8139 /* switch duplicated blocks with raid1 */ 8140 if (flags & BTRFS_BLOCK_GROUP_DUP) 8141 return stripped | BTRFS_BLOCK_GROUP_RAID1; 8142 8143 /* this is drive concat, leave it alone */ 8144 } 8145 8146 return flags; 8147 } 8148 8149 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 8150 { 8151 struct btrfs_space_info *sinfo = cache->space_info; 8152 u64 num_bytes; 8153 u64 min_allocable_bytes; 8154 int ret = -ENOSPC; 8155 8156 8157 /* 8158 * We need some metadata space and system metadata space for 8159 * allocating chunks in some corner cases until we force to set 8160 * it to be readonly. 8161 */ 8162 if ((sinfo->flags & 8163 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 8164 !force) 8165 min_allocable_bytes = 1 * 1024 * 1024; 8166 else 8167 min_allocable_bytes = 0; 8168 8169 spin_lock(&sinfo->lock); 8170 spin_lock(&cache->lock); 8171 8172 if (cache->ro) { 8173 ret = 0; 8174 goto out; 8175 } 8176 8177 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8178 cache->bytes_super - btrfs_block_group_used(&cache->item); 8179 8180 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 8181 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 8182 min_allocable_bytes <= sinfo->total_bytes) { 8183 sinfo->bytes_readonly += num_bytes; 8184 cache->ro = 1; 8185 ret = 0; 8186 } 8187 out: 8188 spin_unlock(&cache->lock); 8189 spin_unlock(&sinfo->lock); 8190 return ret; 8191 } 8192 8193 int btrfs_set_block_group_ro(struct btrfs_root *root, 8194 struct btrfs_block_group_cache *cache) 8195 8196 { 8197 struct btrfs_trans_handle *trans; 8198 u64 alloc_flags; 8199 int ret; 8200 8201 BUG_ON(cache->ro); 8202 8203 trans = btrfs_join_transaction(root); 8204 if (IS_ERR(trans)) 8205 return PTR_ERR(trans); 8206 8207 alloc_flags = update_block_group_flags(root, cache->flags); 8208 if (alloc_flags != cache->flags) { 8209 ret = do_chunk_alloc(trans, root, alloc_flags, 8210 CHUNK_ALLOC_FORCE); 8211 if (ret < 0) 8212 goto out; 8213 } 8214 8215 ret = set_block_group_ro(cache, 0); 8216 if (!ret) 8217 goto out; 8218 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8219 ret = do_chunk_alloc(trans, root, alloc_flags, 8220 CHUNK_ALLOC_FORCE); 8221 if (ret < 0) 8222 goto out; 8223 ret = set_block_group_ro(cache, 0); 8224 out: 8225 btrfs_end_transaction(trans, root); 8226 return ret; 8227 } 8228 8229 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 8230 struct btrfs_root *root, u64 type) 8231 { 8232 u64 alloc_flags = get_alloc_profile(root, type); 8233 return do_chunk_alloc(trans, root, alloc_flags, 8234 CHUNK_ALLOC_FORCE); 8235 } 8236 8237 /* 8238 * helper to 
account the unused space of all the readonly block groups in the
8239 * list. takes mirrors into account.
8240 */
8241 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8242 {
8243 struct btrfs_block_group_cache *block_group;
8244 u64 free_bytes = 0;
8245 int factor;
8246
8247 list_for_each_entry(block_group, groups_list, list) {
8248 spin_lock(&block_group->lock);
8249
8250 if (!block_group->ro) {
8251 spin_unlock(&block_group->lock);
8252 continue;
8253 }
8254
8255 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8256 BTRFS_BLOCK_GROUP_RAID10 |
8257 BTRFS_BLOCK_GROUP_DUP))
8258 factor = 2;
8259 else
8260 factor = 1;
8261
8262 free_bytes += (block_group->key.offset -
8263 btrfs_block_group_used(&block_group->item)) *
8264 factor;
8265
8266 spin_unlock(&block_group->lock);
8267 }
8268
8269 return free_bytes;
8270 }
8271
8272 /*
8273 * helper to account the unused space of all the readonly block groups in the
8274 * space_info. takes mirrors into account.
8275 */
8276 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8277 {
8278 int i;
8279 u64 free_bytes = 0;
8280
8281 spin_lock(&sinfo->lock);
8282
8283 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8284 if (!list_empty(&sinfo->block_groups[i]))
8285 free_bytes += __btrfs_get_ro_block_group_free_space(
8286 &sinfo->block_groups[i]);
8287
8288 spin_unlock(&sinfo->lock);
8289
8290 return free_bytes;
8291 }
8292
8293 void btrfs_set_block_group_rw(struct btrfs_root *root,
8294 struct btrfs_block_group_cache *cache)
8295 {
8296 struct btrfs_space_info *sinfo = cache->space_info;
8297 u64 num_bytes;
8298
8299 BUG_ON(!cache->ro);
8300
8301 spin_lock(&sinfo->lock);
8302 spin_lock(&cache->lock);
8303 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8304 cache->bytes_super - btrfs_block_group_used(&cache->item);
8305 sinfo->bytes_readonly -= num_bytes;
8306 cache->ro = 0;
8307 spin_unlock(&cache->lock);
8308 spin_unlock(&sinfo->lock);
8309 }
8310
8311 /*
8312 * checks to see if it's even possible to relocate this block group.
8313 *
8314 * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
8315 * ok to go ahead and try.
8316 */
8317 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8318 {
8319 struct btrfs_block_group_cache *block_group;
8320 struct btrfs_space_info *space_info;
8321 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8322 struct btrfs_device *device;
8323 struct btrfs_trans_handle *trans;
8324 u64 min_free;
8325 u64 dev_min = 1;
8326 u64 dev_nr = 0;
8327 u64 target;
8328 int index;
8329 int full = 0;
8330 int ret = 0;
8331
8332 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
8333
8334 /* odd, couldn't find the block group, leave it alone */
8335 if (!block_group)
8336 return -1;
8337
8338 min_free = btrfs_block_group_used(&block_group->item);
8339
8340 /* no bytes used, we're good */
8341 if (!min_free)
8342 goto out;
8343
8344 space_info = block_group->space_info;
8345 spin_lock(&space_info->lock);
8346
8347 full = space_info->full;
8348
8349 /*
8350 * if this is the last block group we have in this space, we can't
8351 * relocate it unless we're able to allocate a new chunk below.
8352 *
8353 * Otherwise, we need to make sure we have room in the space to handle
8354 * all of the extents from this block group.
If we can, we're good 8355 */ 8356 if ((space_info->total_bytes != block_group->key.offset) && 8357 (space_info->bytes_used + space_info->bytes_reserved + 8358 space_info->bytes_pinned + space_info->bytes_readonly + 8359 min_free < space_info->total_bytes)) { 8360 spin_unlock(&space_info->lock); 8361 goto out; 8362 } 8363 spin_unlock(&space_info->lock); 8364 8365 /* 8366 * ok we don't have enough space, but maybe we have free space on our 8367 * devices to allocate new chunks for relocation, so loop through our 8368 * alloc devices and guess if we have enough space. if this block 8369 * group is going to be restriped, run checks against the target 8370 * profile instead of the current one. 8371 */ 8372 ret = -1; 8373 8374 /* 8375 * index: 8376 * 0: raid10 8377 * 1: raid1 8378 * 2: dup 8379 * 3: raid0 8380 * 4: single 8381 */ 8382 target = get_restripe_target(root->fs_info, block_group->flags); 8383 if (target) { 8384 index = __get_raid_index(extended_to_chunk(target)); 8385 } else { 8386 /* 8387 * this is just a balance, so if we were marked as full 8388 * we know there is no space for a new chunk 8389 */ 8390 if (full) 8391 goto out; 8392 8393 index = get_block_group_index(block_group); 8394 } 8395 8396 if (index == BTRFS_RAID_RAID10) { 8397 dev_min = 4; 8398 /* Divide by 2 */ 8399 min_free >>= 1; 8400 } else if (index == BTRFS_RAID_RAID1) { 8401 dev_min = 2; 8402 } else if (index == BTRFS_RAID_DUP) { 8403 /* Multiply by 2 */ 8404 min_free <<= 1; 8405 } else if (index == BTRFS_RAID_RAID0) { 8406 dev_min = fs_devices->rw_devices; 8407 do_div(min_free, dev_min); 8408 } 8409 8410 /* We need to do this so that we can look at pending chunks */ 8411 trans = btrfs_join_transaction(root); 8412 if (IS_ERR(trans)) { 8413 ret = PTR_ERR(trans); 8414 goto out; 8415 } 8416 8417 mutex_lock(&root->fs_info->chunk_mutex); 8418 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8419 u64 dev_offset; 8420 8421 /* 8422 * check to make sure we can actually find a chunk with enough 8423 * space to fit our block group in. 
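* min_free has already been scaled for the raid profile above: halved
* for raid10, doubled for dup and divided across the stripes for raid0.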
8424 */ 8425 if (device->total_bytes > device->bytes_used + min_free && 8426 !device->is_tgtdev_for_dev_replace) { 8427 ret = find_free_dev_extent(trans, device, min_free, 8428 &dev_offset, NULL); 8429 if (!ret) 8430 dev_nr++; 8431 8432 if (dev_nr >= dev_min) 8433 break; 8434 8435 ret = -1; 8436 } 8437 } 8438 mutex_unlock(&root->fs_info->chunk_mutex); 8439 btrfs_end_transaction(trans, root); 8440 out: 8441 btrfs_put_block_group(block_group); 8442 return ret; 8443 } 8444 8445 static int find_first_block_group(struct btrfs_root *root, 8446 struct btrfs_path *path, struct btrfs_key *key) 8447 { 8448 int ret = 0; 8449 struct btrfs_key found_key; 8450 struct extent_buffer *leaf; 8451 int slot; 8452 8453 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 8454 if (ret < 0) 8455 goto out; 8456 8457 while (1) { 8458 slot = path->slots[0]; 8459 leaf = path->nodes[0]; 8460 if (slot >= btrfs_header_nritems(leaf)) { 8461 ret = btrfs_next_leaf(root, path); 8462 if (ret == 0) 8463 continue; 8464 if (ret < 0) 8465 goto out; 8466 break; 8467 } 8468 btrfs_item_key_to_cpu(leaf, &found_key, slot); 8469 8470 if (found_key.objectid >= key->objectid && 8471 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 8472 ret = 0; 8473 goto out; 8474 } 8475 path->slots[0]++; 8476 } 8477 out: 8478 return ret; 8479 } 8480 8481 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 8482 { 8483 struct btrfs_block_group_cache *block_group; 8484 u64 last = 0; 8485 8486 while (1) { 8487 struct inode *inode; 8488 8489 block_group = btrfs_lookup_first_block_group(info, last); 8490 while (block_group) { 8491 spin_lock(&block_group->lock); 8492 if (block_group->iref) 8493 break; 8494 spin_unlock(&block_group->lock); 8495 block_group = next_block_group(info->tree_root, 8496 block_group); 8497 } 8498 if (!block_group) { 8499 if (last == 0) 8500 break; 8501 last = 0; 8502 continue; 8503 } 8504 8505 inode = block_group->inode; 8506 block_group->iref = 0; 8507 block_group->inode = NULL; 8508 spin_unlock(&block_group->lock); 8509 iput(inode); 8510 last = block_group->key.objectid + block_group->key.offset; 8511 btrfs_put_block_group(block_group); 8512 } 8513 } 8514 8515 int btrfs_free_block_groups(struct btrfs_fs_info *info) 8516 { 8517 struct btrfs_block_group_cache *block_group; 8518 struct btrfs_space_info *space_info; 8519 struct btrfs_caching_control *caching_ctl; 8520 struct rb_node *n; 8521 8522 down_write(&info->commit_root_sem); 8523 while (!list_empty(&info->caching_block_groups)) { 8524 caching_ctl = list_entry(info->caching_block_groups.next, 8525 struct btrfs_caching_control, list); 8526 list_del(&caching_ctl->list); 8527 put_caching_control(caching_ctl); 8528 } 8529 up_write(&info->commit_root_sem); 8530 8531 spin_lock(&info->block_group_cache_lock); 8532 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8533 block_group = rb_entry(n, struct btrfs_block_group_cache, 8534 cache_node); 8535 rb_erase(&block_group->cache_node, 8536 &info->block_group_cache_tree); 8537 spin_unlock(&info->block_group_cache_lock); 8538 8539 down_write(&block_group->space_info->groups_sem); 8540 list_del(&block_group->list); 8541 up_write(&block_group->space_info->groups_sem); 8542 8543 if (block_group->cached == BTRFS_CACHE_STARTED) 8544 wait_block_group_cache_done(block_group); 8545 8546 /* 8547 * We haven't cached this block group, which means we could 8548 * possibly have excluded extents on this block group. 
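* Drop those excluded ranges now so they are not left behind once the
* block group itself is freed.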
8549 */ 8550 if (block_group->cached == BTRFS_CACHE_NO || 8551 block_group->cached == BTRFS_CACHE_ERROR) 8552 free_excluded_extents(info->extent_root, block_group); 8553 8554 btrfs_remove_free_space_cache(block_group); 8555 btrfs_put_block_group(block_group); 8556 8557 spin_lock(&info->block_group_cache_lock); 8558 } 8559 spin_unlock(&info->block_group_cache_lock); 8560 8561 /* now that all the block groups are freed, go through and 8562 * free all the space_info structs. This is only called during 8563 * the final stages of unmount, and so we know nobody is 8564 * using them. We call synchronize_rcu() once before we start, 8565 * just to be on the safe side. 8566 */ 8567 synchronize_rcu(); 8568 8569 release_global_block_rsv(info); 8570 8571 while (!list_empty(&info->space_info)) { 8572 int i; 8573 8574 space_info = list_entry(info->space_info.next, 8575 struct btrfs_space_info, 8576 list); 8577 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { 8578 if (WARN_ON(space_info->bytes_pinned > 0 || 8579 space_info->bytes_reserved > 0 || 8580 space_info->bytes_may_use > 0)) { 8581 dump_space_info(space_info, 0, 0); 8582 } 8583 } 8584 list_del(&space_info->list); 8585 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 8586 struct kobject *kobj; 8587 kobj = space_info->block_group_kobjs[i]; 8588 space_info->block_group_kobjs[i] = NULL; 8589 if (kobj) { 8590 kobject_del(kobj); 8591 kobject_put(kobj); 8592 } 8593 } 8594 kobject_del(&space_info->kobj); 8595 kobject_put(&space_info->kobj); 8596 } 8597 return 0; 8598 } 8599 8600 static void __link_block_group(struct btrfs_space_info *space_info, 8601 struct btrfs_block_group_cache *cache) 8602 { 8603 int index = get_block_group_index(cache); 8604 bool first = false; 8605 8606 down_write(&space_info->groups_sem); 8607 if (list_empty(&space_info->block_groups[index])) 8608 first = true; 8609 list_add_tail(&cache->list, &space_info->block_groups[index]); 8610 up_write(&space_info->groups_sem); 8611 8612 if (first) { 8613 struct raid_kobject *rkobj; 8614 int ret; 8615 8616 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 8617 if (!rkobj) 8618 goto out_err; 8619 rkobj->raid_type = index; 8620 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 8621 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 8622 "%s", get_raid_name(index)); 8623 if (ret) { 8624 kobject_put(&rkobj->kobj); 8625 goto out_err; 8626 } 8627 space_info->block_group_kobjs[index] = &rkobj->kobj; 8628 } 8629 8630 return; 8631 out_err: 8632 pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n"); 8633 } 8634 8635 static struct btrfs_block_group_cache * 8636 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) 8637 { 8638 struct btrfs_block_group_cache *cache; 8639 8640 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8641 if (!cache) 8642 return NULL; 8643 8644 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 8645 GFP_NOFS); 8646 if (!cache->free_space_ctl) { 8647 kfree(cache); 8648 return NULL; 8649 } 8650 8651 cache->key.objectid = start; 8652 cache->key.offset = size; 8653 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8654 8655 cache->sectorsize = root->sectorsize; 8656 cache->fs_info = root->fs_info; 8657 cache->full_stripe_len = btrfs_full_stripe_len(root, 8658 &root->fs_info->mapping_tree, 8659 start); 8660 atomic_set(&cache->count, 1); 8661 spin_lock_init(&cache->lock); 8662 INIT_LIST_HEAD(&cache->list); 8663 INIT_LIST_HEAD(&cache->cluster_list); 8664 INIT_LIST_HEAD(&cache->new_bg_list); 8665 btrfs_init_free_space_ctl(cache); 8666 8667 return cache; 8668 } 8669 8670 int btrfs_read_block_groups(struct btrfs_root *root) 8671 { 8672 struct btrfs_path *path; 8673 int ret; 8674 struct btrfs_block_group_cache *cache; 8675 struct btrfs_fs_info *info = root->fs_info; 8676 struct btrfs_space_info *space_info; 8677 struct btrfs_key key; 8678 struct btrfs_key found_key; 8679 struct extent_buffer *leaf; 8680 int need_clear = 0; 8681 u64 cache_gen; 8682 8683 root = info->extent_root; 8684 key.objectid = 0; 8685 key.offset = 0; 8686 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 8687 path = btrfs_alloc_path(); 8688 if (!path) 8689 return -ENOMEM; 8690 path->reada = 1; 8691 8692 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 8693 if (btrfs_test_opt(root, SPACE_CACHE) && 8694 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 8695 need_clear = 1; 8696 if (btrfs_test_opt(root, CLEAR_CACHE)) 8697 need_clear = 1; 8698 8699 while (1) { 8700 ret = find_first_block_group(root, path, &key); 8701 if (ret > 0) 8702 break; 8703 if (ret != 0) 8704 goto error; 8705 8706 leaf = path->nodes[0]; 8707 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8708 8709 cache = btrfs_create_block_group_cache(root, found_key.objectid, 8710 found_key.offset); 8711 if (!cache) { 8712 ret = -ENOMEM; 8713 goto error; 8714 } 8715 8716 if (need_clear) { 8717 /* 8718 * When we mount with old space cache, we need to 8719 * set BTRFS_DC_CLEAR and set dirty flag. 8720 * 8721 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 8722 * truncate the old free space cache inode and 8723 * setup a new one. 8724 * b) Setting 'dirty flag' makes sure that we flush 8725 * the new space cache info onto disk. 8726 */ 8727 cache->disk_cache_state = BTRFS_DC_CLEAR; 8728 if (btrfs_test_opt(root, SPACE_CACHE)) 8729 cache->dirty = 1; 8730 } 8731 8732 read_extent_buffer(leaf, &cache->item, 8733 btrfs_item_ptr_offset(leaf, path->slots[0]), 8734 sizeof(cache->item)); 8735 cache->flags = btrfs_block_group_flags(&cache->item); 8736 8737 key.objectid = found_key.objectid + found_key.offset; 8738 btrfs_release_path(path); 8739 8740 /* 8741 * We need to exclude the super stripes now so that the space 8742 * info has super bytes accounted for, otherwise we'll think 8743 * we have more space than we actually do. 8744 */ 8745 ret = exclude_super_stripes(root, cache); 8746 if (ret) { 8747 /* 8748 * We may have excluded something, so call this just in 8749 * case. 
8750 */ 8751 free_excluded_extents(root, cache); 8752 btrfs_put_block_group(cache); 8753 goto error; 8754 } 8755 8756 /* 8757 * check for two cases, either we are full, and therefore 8758 * don't need to bother with the caching work since we won't 8759 * find any space, or we are empty, and we can just add all 8760 * the space in and be done with it. This saves us _alot_ of 8761 * time, particularly in the full case. 8762 */ 8763 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8764 cache->last_byte_to_unpin = (u64)-1; 8765 cache->cached = BTRFS_CACHE_FINISHED; 8766 free_excluded_extents(root, cache); 8767 } else if (btrfs_block_group_used(&cache->item) == 0) { 8768 cache->last_byte_to_unpin = (u64)-1; 8769 cache->cached = BTRFS_CACHE_FINISHED; 8770 add_new_free_space(cache, root->fs_info, 8771 found_key.objectid, 8772 found_key.objectid + 8773 found_key.offset); 8774 free_excluded_extents(root, cache); 8775 } 8776 8777 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8778 if (ret) { 8779 btrfs_remove_free_space_cache(cache); 8780 btrfs_put_block_group(cache); 8781 goto error; 8782 } 8783 8784 ret = update_space_info(info, cache->flags, found_key.offset, 8785 btrfs_block_group_used(&cache->item), 8786 &space_info); 8787 if (ret) { 8788 btrfs_remove_free_space_cache(cache); 8789 spin_lock(&info->block_group_cache_lock); 8790 rb_erase(&cache->cache_node, 8791 &info->block_group_cache_tree); 8792 spin_unlock(&info->block_group_cache_lock); 8793 btrfs_put_block_group(cache); 8794 goto error; 8795 } 8796 8797 cache->space_info = space_info; 8798 spin_lock(&cache->space_info->lock); 8799 cache->space_info->bytes_readonly += cache->bytes_super; 8800 spin_unlock(&cache->space_info->lock); 8801 8802 __link_block_group(space_info, cache); 8803 8804 set_avail_alloc_bits(root->fs_info, cache->flags); 8805 if (btrfs_chunk_readonly(root, cache->key.objectid)) 8806 set_block_group_ro(cache, 1); 8807 } 8808 8809 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 8810 if (!(get_alloc_profile(root, space_info->flags) & 8811 (BTRFS_BLOCK_GROUP_RAID10 | 8812 BTRFS_BLOCK_GROUP_RAID1 | 8813 BTRFS_BLOCK_GROUP_RAID5 | 8814 BTRFS_BLOCK_GROUP_RAID6 | 8815 BTRFS_BLOCK_GROUP_DUP))) 8816 continue; 8817 /* 8818 * avoid allocating from un-mirrored block group if there are 8819 * mirrored block groups. 
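* Marking the raid0/single block groups read-only below steers new
* allocations to the mirrored profiles.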
8820 */ 8821 list_for_each_entry(cache, 8822 &space_info->block_groups[BTRFS_RAID_RAID0], 8823 list) 8824 set_block_group_ro(cache, 1); 8825 list_for_each_entry(cache, 8826 &space_info->block_groups[BTRFS_RAID_SINGLE], 8827 list) 8828 set_block_group_ro(cache, 1); 8829 } 8830 8831 init_global_block_rsv(info); 8832 ret = 0; 8833 error: 8834 btrfs_free_path(path); 8835 return ret; 8836 } 8837 8838 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 8839 struct btrfs_root *root) 8840 { 8841 struct btrfs_block_group_cache *block_group, *tmp; 8842 struct btrfs_root *extent_root = root->fs_info->extent_root; 8843 struct btrfs_block_group_item item; 8844 struct btrfs_key key; 8845 int ret = 0; 8846 8847 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, 8848 new_bg_list) { 8849 list_del_init(&block_group->new_bg_list); 8850 8851 if (ret) 8852 continue; 8853 8854 spin_lock(&block_group->lock); 8855 memcpy(&item, &block_group->item, sizeof(item)); 8856 memcpy(&key, &block_group->key, sizeof(key)); 8857 spin_unlock(&block_group->lock); 8858 8859 ret = btrfs_insert_item(trans, extent_root, &key, &item, 8860 sizeof(item)); 8861 if (ret) 8862 btrfs_abort_transaction(trans, extent_root, ret); 8863 ret = btrfs_finish_chunk_alloc(trans, extent_root, 8864 key.objectid, key.offset); 8865 if (ret) 8866 btrfs_abort_transaction(trans, extent_root, ret); 8867 } 8868 } 8869 8870 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 8871 struct btrfs_root *root, u64 bytes_used, 8872 u64 type, u64 chunk_objectid, u64 chunk_offset, 8873 u64 size) 8874 { 8875 int ret; 8876 struct btrfs_root *extent_root; 8877 struct btrfs_block_group_cache *cache; 8878 8879 extent_root = root->fs_info->extent_root; 8880 8881 btrfs_set_log_full_commit(root->fs_info, trans); 8882 8883 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 8884 if (!cache) 8885 return -ENOMEM; 8886 8887 btrfs_set_block_group_used(&cache->item, bytes_used); 8888 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 8889 btrfs_set_block_group_flags(&cache->item, type); 8890 8891 cache->flags = type; 8892 cache->last_byte_to_unpin = (u64)-1; 8893 cache->cached = BTRFS_CACHE_FINISHED; 8894 ret = exclude_super_stripes(root, cache); 8895 if (ret) { 8896 /* 8897 * We may have excluded something, so call this just in 8898 * case. 
8899 */ 8900 free_excluded_extents(root, cache); 8901 btrfs_put_block_group(cache); 8902 return ret; 8903 } 8904 8905 add_new_free_space(cache, root->fs_info, chunk_offset, 8906 chunk_offset + size); 8907 8908 free_excluded_extents(root, cache); 8909 8910 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8911 if (ret) { 8912 btrfs_remove_free_space_cache(cache); 8913 btrfs_put_block_group(cache); 8914 return ret; 8915 } 8916 8917 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 8918 &cache->space_info); 8919 if (ret) { 8920 btrfs_remove_free_space_cache(cache); 8921 spin_lock(&root->fs_info->block_group_cache_lock); 8922 rb_erase(&cache->cache_node, 8923 &root->fs_info->block_group_cache_tree); 8924 spin_unlock(&root->fs_info->block_group_cache_lock); 8925 btrfs_put_block_group(cache); 8926 return ret; 8927 } 8928 update_global_block_rsv(root->fs_info); 8929 8930 spin_lock(&cache->space_info->lock); 8931 cache->space_info->bytes_readonly += cache->bytes_super; 8932 spin_unlock(&cache->space_info->lock); 8933 8934 __link_block_group(cache->space_info, cache); 8935 8936 list_add_tail(&cache->new_bg_list, &trans->new_bgs); 8937 8938 set_avail_alloc_bits(extent_root->fs_info, type); 8939 8940 return 0; 8941 } 8942 8943 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 8944 { 8945 u64 extra_flags = chunk_to_extended(flags) & 8946 BTRFS_EXTENDED_PROFILE_MASK; 8947 8948 write_seqlock(&fs_info->profiles_lock); 8949 if (flags & BTRFS_BLOCK_GROUP_DATA) 8950 fs_info->avail_data_alloc_bits &= ~extra_flags; 8951 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8952 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8953 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8954 fs_info->avail_system_alloc_bits &= ~extra_flags; 8955 write_sequnlock(&fs_info->profiles_lock); 8956 } 8957 8958 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8959 struct btrfs_root *root, u64 group_start) 8960 { 8961 struct btrfs_path *path; 8962 struct btrfs_block_group_cache *block_group; 8963 struct btrfs_free_cluster *cluster; 8964 struct btrfs_root *tree_root = root->fs_info->tree_root; 8965 struct btrfs_key key; 8966 struct inode *inode; 8967 struct kobject *kobj = NULL; 8968 int ret; 8969 int index; 8970 int factor; 8971 8972 root = root->fs_info->extent_root; 8973 8974 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 8975 BUG_ON(!block_group); 8976 BUG_ON(!block_group->ro); 8977 8978 /* 8979 * Free the reserved super bytes from this block group before 8980 * remove it. 
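* (these are the ranges excluded for the superblock copies, accounted in
* cache->bytes_super by exclude_super_stripes())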
8981 */ 8982 free_excluded_extents(root, block_group); 8983 8984 memcpy(&key, &block_group->key, sizeof(key)); 8985 index = get_block_group_index(block_group); 8986 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 8987 BTRFS_BLOCK_GROUP_RAID1 | 8988 BTRFS_BLOCK_GROUP_RAID10)) 8989 factor = 2; 8990 else 8991 factor = 1; 8992 8993 /* make sure this block group isn't part of an allocation cluster */ 8994 cluster = &root->fs_info->data_alloc_cluster; 8995 spin_lock(&cluster->refill_lock); 8996 btrfs_return_cluster_to_free_space(block_group, cluster); 8997 spin_unlock(&cluster->refill_lock); 8998 8999 /* 9000 * make sure this block group isn't part of a metadata 9001 * allocation cluster 9002 */ 9003 cluster = &root->fs_info->meta_alloc_cluster; 9004 spin_lock(&cluster->refill_lock); 9005 btrfs_return_cluster_to_free_space(block_group, cluster); 9006 spin_unlock(&cluster->refill_lock); 9007 9008 path = btrfs_alloc_path(); 9009 if (!path) { 9010 ret = -ENOMEM; 9011 goto out; 9012 } 9013 9014 inode = lookup_free_space_inode(tree_root, block_group, path); 9015 if (!IS_ERR(inode)) { 9016 ret = btrfs_orphan_add(trans, inode); 9017 if (ret) { 9018 btrfs_add_delayed_iput(inode); 9019 goto out; 9020 } 9021 clear_nlink(inode); 9022 /* One for the block groups ref */ 9023 spin_lock(&block_group->lock); 9024 if (block_group->iref) { 9025 block_group->iref = 0; 9026 block_group->inode = NULL; 9027 spin_unlock(&block_group->lock); 9028 iput(inode); 9029 } else { 9030 spin_unlock(&block_group->lock); 9031 } 9032 /* One for our lookup ref */ 9033 btrfs_add_delayed_iput(inode); 9034 } 9035 9036 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 9037 key.offset = block_group->key.objectid; 9038 key.type = 0; 9039 9040 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 9041 if (ret < 0) 9042 goto out; 9043 if (ret > 0) 9044 btrfs_release_path(path); 9045 if (ret == 0) { 9046 ret = btrfs_del_item(trans, tree_root, path); 9047 if (ret) 9048 goto out; 9049 btrfs_release_path(path); 9050 } 9051 9052 spin_lock(&root->fs_info->block_group_cache_lock); 9053 rb_erase(&block_group->cache_node, 9054 &root->fs_info->block_group_cache_tree); 9055 9056 if (root->fs_info->first_logical_byte == block_group->key.objectid) 9057 root->fs_info->first_logical_byte = (u64)-1; 9058 spin_unlock(&root->fs_info->block_group_cache_lock); 9059 9060 down_write(&block_group->space_info->groups_sem); 9061 /* 9062 * we must use list_del_init so people can check to see if they 9063 * are still on the list after taking the semaphore 9064 */ 9065 list_del_init(&block_group->list); 9066 if (list_empty(&block_group->space_info->block_groups[index])) { 9067 kobj = block_group->space_info->block_group_kobjs[index]; 9068 block_group->space_info->block_group_kobjs[index] = NULL; 9069 clear_avail_alloc_bits(root->fs_info, block_group->flags); 9070 } 9071 up_write(&block_group->space_info->groups_sem); 9072 if (kobj) { 9073 kobject_del(kobj); 9074 kobject_put(kobj); 9075 } 9076 9077 if (block_group->cached == BTRFS_CACHE_STARTED) 9078 wait_block_group_cache_done(block_group); 9079 9080 btrfs_remove_free_space_cache(block_group); 9081 9082 spin_lock(&block_group->space_info->lock); 9083 block_group->space_info->total_bytes -= block_group->key.offset; 9084 block_group->space_info->bytes_readonly -= block_group->key.offset; 9085 block_group->space_info->disk_total -= block_group->key.offset * factor; 9086 spin_unlock(&block_group->space_info->lock); 9087 9088 memcpy(&key, &block_group->key, sizeof(key)); 9089 9090 
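/*
 * removing this chunk frees unallocated space on the devices, so clear
 * the per-space_info 'full' flags and let later allocations retry
 * chunk allocation
 */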
btrfs_clear_space_info_full(root->fs_info); 9091 9092 btrfs_put_block_group(block_group); 9093 btrfs_put_block_group(block_group); 9094 9095 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 9096 if (ret > 0) 9097 ret = -EIO; 9098 if (ret < 0) 9099 goto out; 9100 9101 ret = btrfs_del_item(trans, root, path); 9102 out: 9103 btrfs_free_path(path); 9104 return ret; 9105 } 9106 9107 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 9108 { 9109 struct btrfs_space_info *space_info; 9110 struct btrfs_super_block *disk_super; 9111 u64 features; 9112 u64 flags; 9113 int mixed = 0; 9114 int ret; 9115 9116 disk_super = fs_info->super_copy; 9117 if (!btrfs_super_root(disk_super)) 9118 return 1; 9119 9120 features = btrfs_super_incompat_flags(disk_super); 9121 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 9122 mixed = 1; 9123 9124 flags = BTRFS_BLOCK_GROUP_SYSTEM; 9125 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9126 if (ret) 9127 goto out; 9128 9129 if (mixed) { 9130 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 9131 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9132 } else { 9133 flags = BTRFS_BLOCK_GROUP_METADATA; 9134 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9135 if (ret) 9136 goto out; 9137 9138 flags = BTRFS_BLOCK_GROUP_DATA; 9139 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9140 } 9141 out: 9142 return ret; 9143 } 9144 9145 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 9146 { 9147 return unpin_extent_range(root, start, end); 9148 } 9149 9150 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 9151 u64 num_bytes, u64 *actual_bytes) 9152 { 9153 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); 9154 } 9155 9156 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 9157 { 9158 struct btrfs_fs_info *fs_info = root->fs_info; 9159 struct btrfs_block_group_cache *cache = NULL; 9160 u64 group_trimmed; 9161 u64 start; 9162 u64 end; 9163 u64 trimmed = 0; 9164 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 9165 int ret = 0; 9166 9167 /* 9168 * try to trim all FS space, our block group may start from non-zero. 
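* so when the caller asks for the whole filesystem, start from the first
* block group at or after range->start instead of requiring an exact
* block group match.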
9169 */
9170 if (range->len == total_bytes)
9171 cache = btrfs_lookup_first_block_group(fs_info, range->start);
9172 else
9173 cache = btrfs_lookup_block_group(fs_info, range->start);
9174
9175 while (cache) {
9176 if (cache->key.objectid >= (range->start + range->len)) {
9177 btrfs_put_block_group(cache);
9178 break;
9179 }
9180
9181 start = max(range->start, cache->key.objectid);
9182 end = min(range->start + range->len,
9183 cache->key.objectid + cache->key.offset);
9184
9185 if (end - start >= range->minlen) {
9186 if (!block_group_cache_done(cache)) {
9187 ret = cache_block_group(cache, 0);
9188 if (ret) {
9189 btrfs_put_block_group(cache);
9190 break;
9191 }
9192 ret = wait_block_group_cache_done(cache);
9193 if (ret) {
9194 btrfs_put_block_group(cache);
9195 break;
9196 }
9197 }
9198 ret = btrfs_trim_block_group(cache,
9199 &group_trimmed,
9200 start,
9201 end,
9202 range->minlen);
9203
9204 trimmed += group_trimmed;
9205 if (ret) {
9206 btrfs_put_block_group(cache);
9207 break;
9208 }
9209 }
9210
9211 cache = next_block_group(fs_info->tree_root, cache);
9212 }
9213
9214 range->len = trimmed;
9215 return ret;
9216 }
9217
9218 /*
9219 * btrfs_{start,end}_nocow_write() is similar to mnt_{want,drop}_write();
9220 * they are used to prevent some tasks from writing data into the page cache
9221 * via nocow right before the subvolume is snapshotted and then flushing that
9222 * data to disk only after the snapshot has been created.
9223 */
9224 void btrfs_end_nocow_write(struct btrfs_root *root)
9225 {
9226 percpu_counter_dec(&root->subv_writers->counter);
9227 /*
9228 * Make sure counter is updated before we wake up
9229 * waiters.
9230 */
9231 smp_mb();
9232 if (waitqueue_active(&root->subv_writers->wait))
9233 wake_up(&root->subv_writers->wait);
9234 }
9235
9236 int btrfs_start_nocow_write(struct btrfs_root *root)
9237 {
9238 if (unlikely(atomic_read(&root->will_be_snapshoted)))
9239 return 0;
9240
9241 percpu_counter_inc(&root->subv_writers->counter);
9242 /*
9243 * Make sure counter is updated before we check for snapshot creation.
9244 */
9245 smp_mb();
9246 if (unlikely(atomic_read(&root->will_be_snapshoted))) {
9247 btrfs_end_nocow_write(root);
9248 return 0;
9249 }
9250 return 1;
9251 }
9252
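/*
 * Illustrative sketch only, not part of the original file: a typical
 * caller pairs btrfs_start_nocow_write() with btrfs_end_nocow_write()
 * around a nocow write and falls back to the cow path when a snapshot
 * is pending.  The do_one_nocow_write() callback and the -EAGAIN
 * convention below are hypothetical stand-ins for the caller's real
 * write path.
 */
static inline int btrfs_nocow_write_sketch(struct btrfs_root *root,
					   int (*do_one_nocow_write)(struct btrfs_root *root))
{
	int ret;

	/* a pending snapshot means nocow writes must not be started */
	if (!btrfs_start_nocow_write(root))
		return -EAGAIN;

	ret = do_one_nocow_write(root);

	/* always drop the writer count so snapshot creation can proceed */
	btrfs_end_nocow_write(root);
	return ret;
}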