1 /* 2 * Copyright (C) 2007 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 #include <linux/sched.h> 19 #include <linux/pagemap.h> 20 #include <linux/writeback.h> 21 #include <linux/blkdev.h> 22 #include <linux/sort.h> 23 #include <linux/rcupdate.h> 24 #include <linux/kthread.h> 25 #include <linux/slab.h> 26 #include <linux/ratelimit.h> 27 #include <linux/percpu_counter.h> 28 #include "hash.h" 29 #include "tree-log.h" 30 #include "disk-io.h" 31 #include "print-tree.h" 32 #include "volumes.h" 33 #include "raid56.h" 34 #include "locking.h" 35 #include "free-space-cache.h" 36 #include "math.h" 37 #include "sysfs.h" 38 #include "qgroup.h" 39 40 #undef SCRAMBLE_DELAYED_REFS 41 42 /* 43 * control flags for do_chunk_alloc's force field 44 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 45 * if we really need one. 46 * 47 * CHUNK_ALLOC_LIMITED means to only try and allocate one 48 * if we have very few chunks already allocated. This is 49 * used as part of the clustering code to help make sure 50 * we have a good pool of storage to cluster in, without 51 * filling the FS with empty chunks 52 * 53 * CHUNK_ALLOC_FORCE means it must try to allocate one 54 * 55 */ 56 enum { 57 CHUNK_ALLOC_NO_FORCE = 0, 58 CHUNK_ALLOC_LIMITED = 1, 59 CHUNK_ALLOC_FORCE = 2, 60 }; 61 62 /* 63 * Control how reservations are dealt with. 64 * 65 * RESERVE_FREE - freeing a reservation. 
66 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for 67 * ENOSPC accounting 68 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update 69 * bytes_may_use as the ENOSPC accounting is done elsewhere 70 */ 71 enum { 72 RESERVE_FREE = 0, 73 RESERVE_ALLOC = 1, 74 RESERVE_ALLOC_NO_ACCOUNT = 2, 75 }; 76 77 static int update_block_group(struct btrfs_root *root, 78 u64 bytenr, u64 num_bytes, int alloc); 79 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 80 struct btrfs_root *root, 81 u64 bytenr, u64 num_bytes, u64 parent, 82 u64 root_objectid, u64 owner_objectid, 83 u64 owner_offset, int refs_to_drop, 84 struct btrfs_delayed_extent_op *extra_op, 85 int no_quota); 86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 87 struct extent_buffer *leaf, 88 struct btrfs_extent_item *ei); 89 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 90 struct btrfs_root *root, 91 u64 parent, u64 root_objectid, 92 u64 flags, u64 owner, u64 offset, 93 struct btrfs_key *ins, int ref_mod); 94 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 95 struct btrfs_root *root, 96 u64 parent, u64 root_objectid, 97 u64 flags, struct btrfs_disk_key *key, 98 int level, struct btrfs_key *ins, 99 int no_quota); 100 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 101 struct btrfs_root *extent_root, u64 flags, 102 int force); 103 static int find_next_key(struct btrfs_path *path, int level, 104 struct btrfs_key *key); 105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 106 int dump_block_groups); 107 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 108 u64 num_bytes, int reserve, 109 int delalloc); 110 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 111 u64 num_bytes); 112 int btrfs_pin_extent(struct btrfs_root *root, 113 u64 bytenr, u64 num_bytes, int reserved); 114 115 static noinline int 116 block_group_cache_done(struct btrfs_block_group_cache *cache) 117 { 118 smp_mb(); 119 return cache->cached == BTRFS_CACHE_FINISHED || 120 cache->cached == BTRFS_CACHE_ERROR; 121 } 122 123 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 124 { 125 return (cache->flags & bits) == bits; 126 } 127 128 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 129 { 130 atomic_inc(&cache->count); 131 } 132 133 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 134 { 135 if (atomic_dec_and_test(&cache->count)) { 136 WARN_ON(cache->pinned > 0); 137 WARN_ON(cache->reserved > 0); 138 kfree(cache->free_space_ctl); 139 kfree(cache); 140 } 141 } 142 143 /* 144 * this adds the block group to the fs_info rb tree for the block group 145 * cache 146 */ 147 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 148 struct btrfs_block_group_cache *block_group) 149 { 150 struct rb_node **p; 151 struct rb_node *parent = NULL; 152 struct btrfs_block_group_cache *cache; 153 154 spin_lock(&info->block_group_cache_lock); 155 p = &info->block_group_cache_tree.rb_node; 156 157 while (*p) { 158 parent = *p; 159 cache = rb_entry(parent, struct btrfs_block_group_cache, 160 cache_node); 161 if (block_group->key.objectid < cache->key.objectid) { 162 p = &(*p)->rb_left; 163 } else if (block_group->key.objectid > cache->key.objectid) { 164 p = &(*p)->rb_right; 165 } else { 166 spin_unlock(&info->block_group_cache_lock); 167 return -EEXIST; 168 } 169 } 170 171 rb_link_node(&block_group->cache_node, parent, 
p); 172 rb_insert_color(&block_group->cache_node, 173 &info->block_group_cache_tree); 174 175 if (info->first_logical_byte > block_group->key.objectid) 176 info->first_logical_byte = block_group->key.objectid; 177 178 spin_unlock(&info->block_group_cache_lock); 179 180 return 0; 181 } 182 183 /* 184 * This will return the block group at or after bytenr if contains is 0, else 185 * it will return the block group that contains the bytenr 186 */ 187 static struct btrfs_block_group_cache * 188 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 189 int contains) 190 { 191 struct btrfs_block_group_cache *cache, *ret = NULL; 192 struct rb_node *n; 193 u64 end, start; 194 195 spin_lock(&info->block_group_cache_lock); 196 n = info->block_group_cache_tree.rb_node; 197 198 while (n) { 199 cache = rb_entry(n, struct btrfs_block_group_cache, 200 cache_node); 201 end = cache->key.objectid + cache->key.offset - 1; 202 start = cache->key.objectid; 203 204 if (bytenr < start) { 205 if (!contains && (!ret || start < ret->key.objectid)) 206 ret = cache; 207 n = n->rb_left; 208 } else if (bytenr > start) { 209 if (contains && bytenr <= end) { 210 ret = cache; 211 break; 212 } 213 n = n->rb_right; 214 } else { 215 ret = cache; 216 break; 217 } 218 } 219 if (ret) { 220 btrfs_get_block_group(ret); 221 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 222 info->first_logical_byte = ret->key.objectid; 223 } 224 spin_unlock(&info->block_group_cache_lock); 225 226 return ret; 227 } 228 229 static int add_excluded_extent(struct btrfs_root *root, 230 u64 start, u64 num_bytes) 231 { 232 u64 end = start + num_bytes - 1; 233 set_extent_bits(&root->fs_info->freed_extents[0], 234 start, end, EXTENT_UPTODATE, GFP_NOFS); 235 set_extent_bits(&root->fs_info->freed_extents[1], 236 start, end, EXTENT_UPTODATE, GFP_NOFS); 237 return 0; 238 } 239 240 static void free_excluded_extents(struct btrfs_root *root, 241 struct btrfs_block_group_cache *cache) 242 { 243 u64 start, end; 244 245 start = cache->key.objectid; 246 end = start + cache->key.offset - 1; 247 248 clear_extent_bits(&root->fs_info->freed_extents[0], 249 start, end, EXTENT_UPTODATE, GFP_NOFS); 250 clear_extent_bits(&root->fs_info->freed_extents[1], 251 start, end, EXTENT_UPTODATE, GFP_NOFS); 252 } 253 254 static int exclude_super_stripes(struct btrfs_root *root, 255 struct btrfs_block_group_cache *cache) 256 { 257 u64 bytenr; 258 u64 *logical; 259 int stripe_len; 260 int i, nr, ret; 261 262 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 263 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 264 cache->bytes_super += stripe_len; 265 ret = add_excluded_extent(root, cache->key.objectid, 266 stripe_len); 267 if (ret) 268 return ret; 269 } 270 271 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 272 bytenr = btrfs_sb_offset(i); 273 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 274 cache->key.objectid, bytenr, 275 0, &logical, &nr, &stripe_len); 276 if (ret) 277 return ret; 278 279 while (nr--) { 280 u64 start, len; 281 282 if (logical[nr] > cache->key.objectid + 283 cache->key.offset) 284 continue; 285 286 if (logical[nr] + stripe_len <= cache->key.objectid) 287 continue; 288 289 start = logical[nr]; 290 if (start < cache->key.objectid) { 291 start = cache->key.objectid; 292 len = (logical[nr] + stripe_len) - start; 293 } else { 294 len = min_t(u64, stripe_len, 295 cache->key.objectid + 296 cache->key.offset - start); 297 } 298 299 cache->bytes_super += len; 300 ret = add_excluded_extent(root, start, len); 301 if (ret) { 
302 kfree(logical); 303 return ret; 304 } 305 } 306 307 kfree(logical); 308 } 309 return 0; 310 } 311 312 static struct btrfs_caching_control * 313 get_caching_control(struct btrfs_block_group_cache *cache) 314 { 315 struct btrfs_caching_control *ctl; 316 317 spin_lock(&cache->lock); 318 if (cache->cached != BTRFS_CACHE_STARTED) { 319 spin_unlock(&cache->lock); 320 return NULL; 321 } 322 323 /* We're loading it the fast way, so we don't have a caching_ctl. */ 324 if (!cache->caching_ctl) { 325 spin_unlock(&cache->lock); 326 return NULL; 327 } 328 329 ctl = cache->caching_ctl; 330 atomic_inc(&ctl->count); 331 spin_unlock(&cache->lock); 332 return ctl; 333 } 334 335 static void put_caching_control(struct btrfs_caching_control *ctl) 336 { 337 if (atomic_dec_and_test(&ctl->count)) 338 kfree(ctl); 339 } 340 341 /* 342 * this is only called by cache_block_group, since we could have freed extents 343 * we need to check the pinned_extents for any extents that can't be used yet 344 * since their free space will be released as soon as the transaction commits. 345 */ 346 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 347 struct btrfs_fs_info *info, u64 start, u64 end) 348 { 349 u64 extent_start, extent_end, size, total_added = 0; 350 int ret; 351 352 while (start < end) { 353 ret = find_first_extent_bit(info->pinned_extents, start, 354 &extent_start, &extent_end, 355 EXTENT_DIRTY | EXTENT_UPTODATE, 356 NULL); 357 if (ret) 358 break; 359 360 if (extent_start <= start) { 361 start = extent_end + 1; 362 } else if (extent_start > start && extent_start < end) { 363 size = extent_start - start; 364 total_added += size; 365 ret = btrfs_add_free_space(block_group, start, 366 size); 367 BUG_ON(ret); /* -ENOMEM or logic error */ 368 start = extent_end + 1; 369 } else { 370 break; 371 } 372 } 373 374 if (start < end) { 375 size = end - start; 376 total_added += size; 377 ret = btrfs_add_free_space(block_group, start, size); 378 BUG_ON(ret); /* -ENOMEM or logic error */ 379 } 380 381 return total_added; 382 } 383 384 static noinline void caching_thread(struct btrfs_work *work) 385 { 386 struct btrfs_block_group_cache *block_group; 387 struct btrfs_fs_info *fs_info; 388 struct btrfs_caching_control *caching_ctl; 389 struct btrfs_root *extent_root; 390 struct btrfs_path *path; 391 struct extent_buffer *leaf; 392 struct btrfs_key key; 393 u64 total_found = 0; 394 u64 last = 0; 395 u32 nritems; 396 int ret = -ENOMEM; 397 398 caching_ctl = container_of(work, struct btrfs_caching_control, work); 399 block_group = caching_ctl->block_group; 400 fs_info = block_group->fs_info; 401 extent_root = fs_info->extent_root; 402 403 path = btrfs_alloc_path(); 404 if (!path) 405 goto out; 406 407 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 408 409 /* 410 * We don't want to deadlock with somebody trying to allocate a new 411 * extent for the extent root while also trying to search the extent 412 * root to add free space. 
So we skip locking and search the commit 413 * root, since its read-only 414 */ 415 path->skip_locking = 1; 416 path->search_commit_root = 1; 417 path->reada = 1; 418 419 key.objectid = last; 420 key.offset = 0; 421 key.type = BTRFS_EXTENT_ITEM_KEY; 422 again: 423 mutex_lock(&caching_ctl->mutex); 424 /* need to make sure the commit_root doesn't disappear */ 425 down_read(&fs_info->commit_root_sem); 426 427 next: 428 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 429 if (ret < 0) 430 goto err; 431 432 leaf = path->nodes[0]; 433 nritems = btrfs_header_nritems(leaf); 434 435 while (1) { 436 if (btrfs_fs_closing(fs_info) > 1) { 437 last = (u64)-1; 438 break; 439 } 440 441 if (path->slots[0] < nritems) { 442 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 443 } else { 444 ret = find_next_key(path, 0, &key); 445 if (ret) 446 break; 447 448 if (need_resched() || 449 rwsem_is_contended(&fs_info->commit_root_sem)) { 450 caching_ctl->progress = last; 451 btrfs_release_path(path); 452 up_read(&fs_info->commit_root_sem); 453 mutex_unlock(&caching_ctl->mutex); 454 cond_resched(); 455 goto again; 456 } 457 458 ret = btrfs_next_leaf(extent_root, path); 459 if (ret < 0) 460 goto err; 461 if (ret) 462 break; 463 leaf = path->nodes[0]; 464 nritems = btrfs_header_nritems(leaf); 465 continue; 466 } 467 468 if (key.objectid < last) { 469 key.objectid = last; 470 key.offset = 0; 471 key.type = BTRFS_EXTENT_ITEM_KEY; 472 473 caching_ctl->progress = last; 474 btrfs_release_path(path); 475 goto next; 476 } 477 478 if (key.objectid < block_group->key.objectid) { 479 path->slots[0]++; 480 continue; 481 } 482 483 if (key.objectid >= block_group->key.objectid + 484 block_group->key.offset) 485 break; 486 487 if (key.type == BTRFS_EXTENT_ITEM_KEY || 488 key.type == BTRFS_METADATA_ITEM_KEY) { 489 total_found += add_new_free_space(block_group, 490 fs_info, last, 491 key.objectid); 492 if (key.type == BTRFS_METADATA_ITEM_KEY) 493 last = key.objectid + 494 fs_info->tree_root->leafsize; 495 else 496 last = key.objectid + key.offset; 497 498 if (total_found > (1024 * 1024 * 2)) { 499 total_found = 0; 500 wake_up(&caching_ctl->wait); 501 } 502 } 503 path->slots[0]++; 504 } 505 ret = 0; 506 507 total_found += add_new_free_space(block_group, fs_info, last, 508 block_group->key.objectid + 509 block_group->key.offset); 510 caching_ctl->progress = (u64)-1; 511 512 spin_lock(&block_group->lock); 513 block_group->caching_ctl = NULL; 514 block_group->cached = BTRFS_CACHE_FINISHED; 515 spin_unlock(&block_group->lock); 516 517 err: 518 btrfs_free_path(path); 519 up_read(&fs_info->commit_root_sem); 520 521 free_excluded_extents(extent_root, block_group); 522 523 mutex_unlock(&caching_ctl->mutex); 524 out: 525 if (ret) { 526 spin_lock(&block_group->lock); 527 block_group->caching_ctl = NULL; 528 block_group->cached = BTRFS_CACHE_ERROR; 529 spin_unlock(&block_group->lock); 530 } 531 wake_up(&caching_ctl->wait); 532 533 put_caching_control(caching_ctl); 534 btrfs_put_block_group(block_group); 535 } 536 537 static int cache_block_group(struct btrfs_block_group_cache *cache, 538 int load_cache_only) 539 { 540 DEFINE_WAIT(wait); 541 struct btrfs_fs_info *fs_info = cache->fs_info; 542 struct btrfs_caching_control *caching_ctl; 543 int ret = 0; 544 545 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 546 if (!caching_ctl) 547 return -ENOMEM; 548 549 INIT_LIST_HEAD(&caching_ctl->list); 550 mutex_init(&caching_ctl->mutex); 551 init_waitqueue_head(&caching_ctl->wait); 552 caching_ctl->block_group = cache; 553 
caching_ctl->progress = cache->key.objectid; 554 atomic_set(&caching_ctl->count, 1); 555 btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); 556 557 spin_lock(&cache->lock); 558 /* 559 * This should be a rare occasion, but this could happen I think in the 560 * case where one thread starts to load the space cache info, and then 561 * some other thread starts a transaction commit which tries to do an 562 * allocation while the other thread is still loading the space cache 563 * info. The previous loop should have kept us from choosing this block 564 * group, but if we've moved to the state where we will wait on caching 565 * block groups we need to first check if we're doing a fast load here, 566 * so we can wait for it to finish, otherwise we could end up allocating 567 * from a block group who's cache gets evicted for one reason or 568 * another. 569 */ 570 while (cache->cached == BTRFS_CACHE_FAST) { 571 struct btrfs_caching_control *ctl; 572 573 ctl = cache->caching_ctl; 574 atomic_inc(&ctl->count); 575 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 576 spin_unlock(&cache->lock); 577 578 schedule(); 579 580 finish_wait(&ctl->wait, &wait); 581 put_caching_control(ctl); 582 spin_lock(&cache->lock); 583 } 584 585 if (cache->cached != BTRFS_CACHE_NO) { 586 spin_unlock(&cache->lock); 587 kfree(caching_ctl); 588 return 0; 589 } 590 WARN_ON(cache->caching_ctl); 591 cache->caching_ctl = caching_ctl; 592 cache->cached = BTRFS_CACHE_FAST; 593 spin_unlock(&cache->lock); 594 595 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 596 ret = load_free_space_cache(fs_info, cache); 597 598 spin_lock(&cache->lock); 599 if (ret == 1) { 600 cache->caching_ctl = NULL; 601 cache->cached = BTRFS_CACHE_FINISHED; 602 cache->last_byte_to_unpin = (u64)-1; 603 } else { 604 if (load_cache_only) { 605 cache->caching_ctl = NULL; 606 cache->cached = BTRFS_CACHE_NO; 607 } else { 608 cache->cached = BTRFS_CACHE_STARTED; 609 } 610 } 611 spin_unlock(&cache->lock); 612 wake_up(&caching_ctl->wait); 613 if (ret == 1) { 614 put_caching_control(caching_ctl); 615 free_excluded_extents(fs_info->extent_root, cache); 616 return 0; 617 } 618 } else { 619 /* 620 * We are not going to do the fast caching, set cached to the 621 * appropriate value and wakeup any waiters. 
622 */ 623 spin_lock(&cache->lock); 624 if (load_cache_only) { 625 cache->caching_ctl = NULL; 626 cache->cached = BTRFS_CACHE_NO; 627 } else { 628 cache->cached = BTRFS_CACHE_STARTED; 629 } 630 spin_unlock(&cache->lock); 631 wake_up(&caching_ctl->wait); 632 } 633 634 if (load_cache_only) { 635 put_caching_control(caching_ctl); 636 return 0; 637 } 638 639 down_write(&fs_info->commit_root_sem); 640 atomic_inc(&caching_ctl->count); 641 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 642 up_write(&fs_info->commit_root_sem); 643 644 btrfs_get_block_group(cache); 645 646 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 647 648 return ret; 649 } 650 651 /* 652 * return the block group that starts at or after bytenr 653 */ 654 static struct btrfs_block_group_cache * 655 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 656 { 657 struct btrfs_block_group_cache *cache; 658 659 cache = block_group_cache_tree_search(info, bytenr, 0); 660 661 return cache; 662 } 663 664 /* 665 * return the block group that contains the given bytenr 666 */ 667 struct btrfs_block_group_cache *btrfs_lookup_block_group( 668 struct btrfs_fs_info *info, 669 u64 bytenr) 670 { 671 struct btrfs_block_group_cache *cache; 672 673 cache = block_group_cache_tree_search(info, bytenr, 1); 674 675 return cache; 676 } 677 678 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 679 u64 flags) 680 { 681 struct list_head *head = &info->space_info; 682 struct btrfs_space_info *found; 683 684 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 685 686 rcu_read_lock(); 687 list_for_each_entry_rcu(found, head, list) { 688 if (found->flags & flags) { 689 rcu_read_unlock(); 690 return found; 691 } 692 } 693 rcu_read_unlock(); 694 return NULL; 695 } 696 697 /* 698 * after adding space to the filesystem, we need to clear the full flags 699 * on all the space infos. 700 */ 701 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 702 { 703 struct list_head *head = &info->space_info; 704 struct btrfs_space_info *found; 705 706 rcu_read_lock(); 707 list_for_each_entry_rcu(found, head, list) 708 found->full = 0; 709 rcu_read_unlock(); 710 } 711 712 /* simple helper to search for an existing extent at a given offset */ 713 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) 714 { 715 int ret; 716 struct btrfs_key key; 717 struct btrfs_path *path; 718 719 path = btrfs_alloc_path(); 720 if (!path) 721 return -ENOMEM; 722 723 key.objectid = start; 724 key.offset = len; 725 key.type = BTRFS_EXTENT_ITEM_KEY; 726 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 727 0, 0); 728 if (ret > 0) { 729 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 730 if (key.objectid == start && 731 key.type == BTRFS_METADATA_ITEM_KEY) 732 ret = 0; 733 } 734 btrfs_free_path(path); 735 return ret; 736 } 737 738 /* 739 * helper function to lookup reference count and flags of a tree block. 740 * 741 * the head node for delayed ref is used to store the sum of all the 742 * reference count modifications queued up in the rbtree. the head 743 * node may also store the extent flags to set. This way you can check 744 * to see what the reference count and extent flags would be if all of 745 * the delayed refs are not processed. 
746 */ 747 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 748 struct btrfs_root *root, u64 bytenr, 749 u64 offset, int metadata, u64 *refs, u64 *flags) 750 { 751 struct btrfs_delayed_ref_head *head; 752 struct btrfs_delayed_ref_root *delayed_refs; 753 struct btrfs_path *path; 754 struct btrfs_extent_item *ei; 755 struct extent_buffer *leaf; 756 struct btrfs_key key; 757 u32 item_size; 758 u64 num_refs; 759 u64 extent_flags; 760 int ret; 761 762 /* 763 * If we don't have skinny metadata, don't bother doing anything 764 * different 765 */ 766 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { 767 offset = root->leafsize; 768 metadata = 0; 769 } 770 771 path = btrfs_alloc_path(); 772 if (!path) 773 return -ENOMEM; 774 775 if (!trans) { 776 path->skip_locking = 1; 777 path->search_commit_root = 1; 778 } 779 780 search_again: 781 key.objectid = bytenr; 782 key.offset = offset; 783 if (metadata) 784 key.type = BTRFS_METADATA_ITEM_KEY; 785 else 786 key.type = BTRFS_EXTENT_ITEM_KEY; 787 788 again: 789 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 790 &key, path, 0, 0); 791 if (ret < 0) 792 goto out_free; 793 794 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 795 if (path->slots[0]) { 796 path->slots[0]--; 797 btrfs_item_key_to_cpu(path->nodes[0], &key, 798 path->slots[0]); 799 if (key.objectid == bytenr && 800 key.type == BTRFS_EXTENT_ITEM_KEY && 801 key.offset == root->leafsize) 802 ret = 0; 803 } 804 if (ret) { 805 key.objectid = bytenr; 806 key.type = BTRFS_EXTENT_ITEM_KEY; 807 key.offset = root->leafsize; 808 btrfs_release_path(path); 809 goto again; 810 } 811 } 812 813 if (ret == 0) { 814 leaf = path->nodes[0]; 815 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 816 if (item_size >= sizeof(*ei)) { 817 ei = btrfs_item_ptr(leaf, path->slots[0], 818 struct btrfs_extent_item); 819 num_refs = btrfs_extent_refs(leaf, ei); 820 extent_flags = btrfs_extent_flags(leaf, ei); 821 } else { 822 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 823 struct btrfs_extent_item_v0 *ei0; 824 BUG_ON(item_size != sizeof(*ei0)); 825 ei0 = btrfs_item_ptr(leaf, path->slots[0], 826 struct btrfs_extent_item_v0); 827 num_refs = btrfs_extent_refs_v0(leaf, ei0); 828 /* FIXME: this isn't correct for data */ 829 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 830 #else 831 BUG(); 832 #endif 833 } 834 BUG_ON(num_refs == 0); 835 } else { 836 num_refs = 0; 837 extent_flags = 0; 838 ret = 0; 839 } 840 841 if (!trans) 842 goto out; 843 844 delayed_refs = &trans->transaction->delayed_refs; 845 spin_lock(&delayed_refs->lock); 846 head = btrfs_find_delayed_ref_head(trans, bytenr); 847 if (head) { 848 if (!mutex_trylock(&head->mutex)) { 849 atomic_inc(&head->node.refs); 850 spin_unlock(&delayed_refs->lock); 851 852 btrfs_release_path(path); 853 854 /* 855 * Mutex was contended, block until it's released and try 856 * again 857 */ 858 mutex_lock(&head->mutex); 859 mutex_unlock(&head->mutex); 860 btrfs_put_delayed_ref(&head->node); 861 goto search_again; 862 } 863 spin_lock(&head->lock); 864 if (head->extent_op && head->extent_op->update_flags) 865 extent_flags |= head->extent_op->flags_to_set; 866 else 867 BUG_ON(num_refs == 0); 868 869 num_refs += head->node.ref_mod; 870 spin_unlock(&head->lock); 871 mutex_unlock(&head->mutex); 872 } 873 spin_unlock(&delayed_refs->lock); 874 out: 875 WARN_ON(num_refs == 0); 876 if (refs) 877 *refs = num_refs; 878 if (flags) 879 *flags = extent_flags; 880 out_free: 881 btrfs_free_path(path); 882 return ret; 883 } 884 885 /* 886 * Back reference 
rules. Back refs have three main goals: 887 * 888 * 1) differentiate between all holders of references to an extent so that 889 * when a reference is dropped we can make sure it was a valid reference 890 * before freeing the extent. 891 * 892 * 2) Provide enough information to quickly find the holders of an extent 893 * if we notice a given block is corrupted or bad. 894 * 895 * 3) Make it easy to migrate blocks for FS shrinking or storage pool 896 * maintenance. This is actually the same as #2, but with a slightly 897 * different use case. 898 * 899 * There are two kinds of back refs. The implicit back refs is optimized 900 * for pointers in non-shared tree blocks. For a given pointer in a block, 901 * back refs of this kind provide information about the block's owner tree 902 * and the pointer's key. These information allow us to find the block by 903 * b-tree searching. The full back refs is for pointers in tree blocks not 904 * referenced by their owner trees. The location of tree block is recorded 905 * in the back refs. Actually the full back refs is generic, and can be 906 * used in all cases the implicit back refs is used. The major shortcoming 907 * of the full back refs is its overhead. Every time a tree block gets 908 * COWed, we have to update back refs entry for all pointers in it. 909 * 910 * For a newly allocated tree block, we use implicit back refs for 911 * pointers in it. This means most tree related operations only involve 912 * implicit back refs. For a tree block created in old transaction, the 913 * only way to drop a reference to it is COW it. So we can detect the 914 * event that tree block loses its owner tree's reference and do the 915 * back refs conversion. 916 * 917 * When a tree block is COW'd through a tree, there are four cases: 918 * 919 * The reference count of the block is one and the tree is the block's 920 * owner tree. Nothing to do in this case. 921 * 922 * The reference count of the block is one and the tree is not the 923 * block's owner tree. In this case, full back refs is used for pointers 924 * in the block. Remove these full back refs, add implicit back refs for 925 * every pointers in the new block. 926 * 927 * The reference count of the block is greater than one and the tree is 928 * the block's owner tree. In this case, implicit back refs is used for 929 * pointers in the block. Add full back refs for every pointers in the 930 * block, increase lower level extents' reference counts. The original 931 * implicit back refs are entailed to the new block. 932 * 933 * The reference count of the block is greater than one and the tree is 934 * not the block's owner tree. Add implicit back refs for every pointer in 935 * the new block, increase lower level extents' reference count. 936 * 937 * Back Reference Key composing: 938 * 939 * The key objectid corresponds to the first byte in the extent, 940 * The key type is used to differentiate between types of back refs. 941 * There are different meanings of the key offset for different types 942 * of back refs. 
943 * 944 * File extents can be referenced by: 945 * 946 * - multiple snapshots, subvolumes, or different generations in one subvol 947 * - different files inside a single subvolume 948 * - different offsets inside a file (bookend extents in file.c) 949 * 950 * The extent ref structure for the implicit back refs has fields for: 951 * 952 * - Objectid of the subvolume root 953 * - objectid of the file holding the reference 954 * - original offset in the file 955 * - how many bookend extents 956 * 957 * The key offset for the implicit back refs is hash of the first 958 * three fields. 959 * 960 * The extent ref structure for the full back refs has field for: 961 * 962 * - number of pointers in the tree leaf 963 * 964 * The key offset for the implicit back refs is the first byte of 965 * the tree leaf 966 * 967 * When a file extent is allocated, The implicit back refs is used. 968 * the fields are filled in: 969 * 970 * (root_key.objectid, inode objectid, offset in file, 1) 971 * 972 * When a file extent is removed file truncation, we find the 973 * corresponding implicit back refs and check the following fields: 974 * 975 * (btrfs_header_owner(leaf), inode objectid, offset in file) 976 * 977 * Btree extents can be referenced by: 978 * 979 * - Different subvolumes 980 * 981 * Both the implicit back refs and the full back refs for tree blocks 982 * only consist of key. The key offset for the implicit back refs is 983 * objectid of block's owner tree. The key offset for the full back refs 984 * is the first byte of parent block. 985 * 986 * When implicit back refs is used, information about the lowest key and 987 * level of the tree block are required. These information are stored in 988 * tree block info structure. 989 */ 990 991 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 992 static int convert_extent_item_v0(struct btrfs_trans_handle *trans, 993 struct btrfs_root *root, 994 struct btrfs_path *path, 995 u64 owner, u32 extra_size) 996 { 997 struct btrfs_extent_item *item; 998 struct btrfs_extent_item_v0 *ei0; 999 struct btrfs_extent_ref_v0 *ref0; 1000 struct btrfs_tree_block_info *bi; 1001 struct extent_buffer *leaf; 1002 struct btrfs_key key; 1003 struct btrfs_key found_key; 1004 u32 new_size = sizeof(*item); 1005 u64 refs; 1006 int ret; 1007 1008 leaf = path->nodes[0]; 1009 BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); 1010 1011 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1012 ei0 = btrfs_item_ptr(leaf, path->slots[0], 1013 struct btrfs_extent_item_v0); 1014 refs = btrfs_extent_refs_v0(leaf, ei0); 1015 1016 if (owner == (u64)-1) { 1017 while (1) { 1018 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 1019 ret = btrfs_next_leaf(root, path); 1020 if (ret < 0) 1021 return ret; 1022 BUG_ON(ret > 0); /* Corruption */ 1023 leaf = path->nodes[0]; 1024 } 1025 btrfs_item_key_to_cpu(leaf, &found_key, 1026 path->slots[0]); 1027 BUG_ON(key.objectid != found_key.objectid); 1028 if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { 1029 path->slots[0]++; 1030 continue; 1031 } 1032 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1033 struct btrfs_extent_ref_v0); 1034 owner = btrfs_ref_objectid_v0(leaf, ref0); 1035 break; 1036 } 1037 } 1038 btrfs_release_path(path); 1039 1040 if (owner < BTRFS_FIRST_FREE_OBJECTID) 1041 new_size += sizeof(*bi); 1042 1043 new_size -= sizeof(*ei0); 1044 ret = btrfs_search_slot(trans, root, &key, path, 1045 new_size + extra_size, 1); 1046 if (ret < 0) 1047 return ret; 1048 BUG_ON(ret); /* Corruption */ 1049 1050 btrfs_extend_item(root, path, new_size); 1051 1052 
leaf = path->nodes[0]; 1053 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1054 btrfs_set_extent_refs(leaf, item, refs); 1055 /* FIXME: get real generation */ 1056 btrfs_set_extent_generation(leaf, item, 0); 1057 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1058 btrfs_set_extent_flags(leaf, item, 1059 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1060 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1061 bi = (struct btrfs_tree_block_info *)(item + 1); 1062 /* FIXME: get first key of the block */ 1063 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 1064 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1065 } else { 1066 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1067 } 1068 btrfs_mark_buffer_dirty(leaf); 1069 return 0; 1070 } 1071 #endif 1072 1073 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1074 { 1075 u32 high_crc = ~(u32)0; 1076 u32 low_crc = ~(u32)0; 1077 __le64 lenum; 1078 1079 lenum = cpu_to_le64(root_objectid); 1080 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1081 lenum = cpu_to_le64(owner); 1082 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1083 lenum = cpu_to_le64(offset); 1084 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1085 1086 return ((u64)high_crc << 31) ^ (u64)low_crc; 1087 } 1088 1089 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1090 struct btrfs_extent_data_ref *ref) 1091 { 1092 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1093 btrfs_extent_data_ref_objectid(leaf, ref), 1094 btrfs_extent_data_ref_offset(leaf, ref)); 1095 } 1096 1097 static int match_extent_data_ref(struct extent_buffer *leaf, 1098 struct btrfs_extent_data_ref *ref, 1099 u64 root_objectid, u64 owner, u64 offset) 1100 { 1101 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1102 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1103 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1104 return 0; 1105 return 1; 1106 } 1107 1108 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1109 struct btrfs_root *root, 1110 struct btrfs_path *path, 1111 u64 bytenr, u64 parent, 1112 u64 root_objectid, 1113 u64 owner, u64 offset) 1114 { 1115 struct btrfs_key key; 1116 struct btrfs_extent_data_ref *ref; 1117 struct extent_buffer *leaf; 1118 u32 nritems; 1119 int ret; 1120 int recow; 1121 int err = -ENOENT; 1122 1123 key.objectid = bytenr; 1124 if (parent) { 1125 key.type = BTRFS_SHARED_DATA_REF_KEY; 1126 key.offset = parent; 1127 } else { 1128 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1129 key.offset = hash_extent_data_ref(root_objectid, 1130 owner, offset); 1131 } 1132 again: 1133 recow = 0; 1134 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1135 if (ret < 0) { 1136 err = ret; 1137 goto fail; 1138 } 1139 1140 if (parent) { 1141 if (!ret) 1142 return 0; 1143 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1144 key.type = BTRFS_EXTENT_REF_V0_KEY; 1145 btrfs_release_path(path); 1146 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1147 if (ret < 0) { 1148 err = ret; 1149 goto fail; 1150 } 1151 if (!ret) 1152 return 0; 1153 #endif 1154 goto fail; 1155 } 1156 1157 leaf = path->nodes[0]; 1158 nritems = btrfs_header_nritems(leaf); 1159 while (1) { 1160 if (path->slots[0] >= nritems) { 1161 ret = btrfs_next_leaf(root, path); 1162 if (ret < 0) 1163 err = ret; 1164 if (ret) 1165 goto fail; 1166 1167 leaf = path->nodes[0]; 1168 nritems = btrfs_header_nritems(leaf); 1169 recow = 1; 1170 } 1171 1172 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1173 if 
(key.objectid != bytenr || 1174 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1175 goto fail; 1176 1177 ref = btrfs_item_ptr(leaf, path->slots[0], 1178 struct btrfs_extent_data_ref); 1179 1180 if (match_extent_data_ref(leaf, ref, root_objectid, 1181 owner, offset)) { 1182 if (recow) { 1183 btrfs_release_path(path); 1184 goto again; 1185 } 1186 err = 0; 1187 break; 1188 } 1189 path->slots[0]++; 1190 } 1191 fail: 1192 return err; 1193 } 1194 1195 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1196 struct btrfs_root *root, 1197 struct btrfs_path *path, 1198 u64 bytenr, u64 parent, 1199 u64 root_objectid, u64 owner, 1200 u64 offset, int refs_to_add) 1201 { 1202 struct btrfs_key key; 1203 struct extent_buffer *leaf; 1204 u32 size; 1205 u32 num_refs; 1206 int ret; 1207 1208 key.objectid = bytenr; 1209 if (parent) { 1210 key.type = BTRFS_SHARED_DATA_REF_KEY; 1211 key.offset = parent; 1212 size = sizeof(struct btrfs_shared_data_ref); 1213 } else { 1214 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1215 key.offset = hash_extent_data_ref(root_objectid, 1216 owner, offset); 1217 size = sizeof(struct btrfs_extent_data_ref); 1218 } 1219 1220 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1221 if (ret && ret != -EEXIST) 1222 goto fail; 1223 1224 leaf = path->nodes[0]; 1225 if (parent) { 1226 struct btrfs_shared_data_ref *ref; 1227 ref = btrfs_item_ptr(leaf, path->slots[0], 1228 struct btrfs_shared_data_ref); 1229 if (ret == 0) { 1230 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1231 } else { 1232 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1233 num_refs += refs_to_add; 1234 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1235 } 1236 } else { 1237 struct btrfs_extent_data_ref *ref; 1238 while (ret == -EEXIST) { 1239 ref = btrfs_item_ptr(leaf, path->slots[0], 1240 struct btrfs_extent_data_ref); 1241 if (match_extent_data_ref(leaf, ref, root_objectid, 1242 owner, offset)) 1243 break; 1244 btrfs_release_path(path); 1245 key.offset++; 1246 ret = btrfs_insert_empty_item(trans, root, path, &key, 1247 size); 1248 if (ret && ret != -EEXIST) 1249 goto fail; 1250 1251 leaf = path->nodes[0]; 1252 } 1253 ref = btrfs_item_ptr(leaf, path->slots[0], 1254 struct btrfs_extent_data_ref); 1255 if (ret == 0) { 1256 btrfs_set_extent_data_ref_root(leaf, ref, 1257 root_objectid); 1258 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1259 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1260 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1261 } else { 1262 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1263 num_refs += refs_to_add; 1264 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1265 } 1266 } 1267 btrfs_mark_buffer_dirty(leaf); 1268 ret = 0; 1269 fail: 1270 btrfs_release_path(path); 1271 return ret; 1272 } 1273 1274 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1275 struct btrfs_root *root, 1276 struct btrfs_path *path, 1277 int refs_to_drop, int *last_ref) 1278 { 1279 struct btrfs_key key; 1280 struct btrfs_extent_data_ref *ref1 = NULL; 1281 struct btrfs_shared_data_ref *ref2 = NULL; 1282 struct extent_buffer *leaf; 1283 u32 num_refs = 0; 1284 int ret = 0; 1285 1286 leaf = path->nodes[0]; 1287 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1288 1289 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1290 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1291 struct btrfs_extent_data_ref); 1292 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1293 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1294 ref2 = 
btrfs_item_ptr(leaf, path->slots[0], 1295 struct btrfs_shared_data_ref); 1296 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1297 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1298 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1299 struct btrfs_extent_ref_v0 *ref0; 1300 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1301 struct btrfs_extent_ref_v0); 1302 num_refs = btrfs_ref_count_v0(leaf, ref0); 1303 #endif 1304 } else { 1305 BUG(); 1306 } 1307 1308 BUG_ON(num_refs < refs_to_drop); 1309 num_refs -= refs_to_drop; 1310 1311 if (num_refs == 0) { 1312 ret = btrfs_del_item(trans, root, path); 1313 *last_ref = 1; 1314 } else { 1315 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1316 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1317 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1318 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1319 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1320 else { 1321 struct btrfs_extent_ref_v0 *ref0; 1322 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1323 struct btrfs_extent_ref_v0); 1324 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1325 } 1326 #endif 1327 btrfs_mark_buffer_dirty(leaf); 1328 } 1329 return ret; 1330 } 1331 1332 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1333 struct btrfs_path *path, 1334 struct btrfs_extent_inline_ref *iref) 1335 { 1336 struct btrfs_key key; 1337 struct extent_buffer *leaf; 1338 struct btrfs_extent_data_ref *ref1; 1339 struct btrfs_shared_data_ref *ref2; 1340 u32 num_refs = 0; 1341 1342 leaf = path->nodes[0]; 1343 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1344 if (iref) { 1345 if (btrfs_extent_inline_ref_type(leaf, iref) == 1346 BTRFS_EXTENT_DATA_REF_KEY) { 1347 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1348 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1349 } else { 1350 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1351 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1352 } 1353 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1354 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1355 struct btrfs_extent_data_ref); 1356 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1357 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1358 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1359 struct btrfs_shared_data_ref); 1360 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1361 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1362 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1363 struct btrfs_extent_ref_v0 *ref0; 1364 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1365 struct btrfs_extent_ref_v0); 1366 num_refs = btrfs_ref_count_v0(leaf, ref0); 1367 #endif 1368 } else { 1369 WARN_ON(1); 1370 } 1371 return num_refs; 1372 } 1373 1374 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1375 struct btrfs_root *root, 1376 struct btrfs_path *path, 1377 u64 bytenr, u64 parent, 1378 u64 root_objectid) 1379 { 1380 struct btrfs_key key; 1381 int ret; 1382 1383 key.objectid = bytenr; 1384 if (parent) { 1385 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1386 key.offset = parent; 1387 } else { 1388 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1389 key.offset = root_objectid; 1390 } 1391 1392 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1393 if (ret > 0) 1394 ret = -ENOENT; 1395 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1396 if (ret == -ENOENT && parent) { 1397 btrfs_release_path(path); 1398 key.type = BTRFS_EXTENT_REF_V0_KEY; 1399 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1400 if (ret > 0) 1401 ret = -ENOENT; 1402 } 1403 #endif 1404 return ret; 1405 } 1406 1407 static noinline 
int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1408 struct btrfs_root *root, 1409 struct btrfs_path *path, 1410 u64 bytenr, u64 parent, 1411 u64 root_objectid) 1412 { 1413 struct btrfs_key key; 1414 int ret; 1415 1416 key.objectid = bytenr; 1417 if (parent) { 1418 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1419 key.offset = parent; 1420 } else { 1421 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1422 key.offset = root_objectid; 1423 } 1424 1425 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1426 btrfs_release_path(path); 1427 return ret; 1428 } 1429 1430 static inline int extent_ref_type(u64 parent, u64 owner) 1431 { 1432 int type; 1433 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1434 if (parent > 0) 1435 type = BTRFS_SHARED_BLOCK_REF_KEY; 1436 else 1437 type = BTRFS_TREE_BLOCK_REF_KEY; 1438 } else { 1439 if (parent > 0) 1440 type = BTRFS_SHARED_DATA_REF_KEY; 1441 else 1442 type = BTRFS_EXTENT_DATA_REF_KEY; 1443 } 1444 return type; 1445 } 1446 1447 static int find_next_key(struct btrfs_path *path, int level, 1448 struct btrfs_key *key) 1449 1450 { 1451 for (; level < BTRFS_MAX_LEVEL; level++) { 1452 if (!path->nodes[level]) 1453 break; 1454 if (path->slots[level] + 1 >= 1455 btrfs_header_nritems(path->nodes[level])) 1456 continue; 1457 if (level == 0) 1458 btrfs_item_key_to_cpu(path->nodes[level], key, 1459 path->slots[level] + 1); 1460 else 1461 btrfs_node_key_to_cpu(path->nodes[level], key, 1462 path->slots[level] + 1); 1463 return 0; 1464 } 1465 return 1; 1466 } 1467 1468 /* 1469 * look for inline back ref. if back ref is found, *ref_ret is set 1470 * to the address of inline back ref, and 0 is returned. 1471 * 1472 * if back ref isn't found, *ref_ret is set to the address where it 1473 * should be inserted, and -ENOENT is returned. 1474 * 1475 * if insert is true and there are too many inline back refs, the path 1476 * points to the extent item, and -EAGAIN is returned. 1477 * 1478 * NOTE: inline back refs are ordered in the same way that back ref 1479 * items in the tree are ordered. 1480 */ 1481 static noinline_for_stack 1482 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1483 struct btrfs_root *root, 1484 struct btrfs_path *path, 1485 struct btrfs_extent_inline_ref **ref_ret, 1486 u64 bytenr, u64 num_bytes, 1487 u64 parent, u64 root_objectid, 1488 u64 owner, u64 offset, int insert) 1489 { 1490 struct btrfs_key key; 1491 struct extent_buffer *leaf; 1492 struct btrfs_extent_item *ei; 1493 struct btrfs_extent_inline_ref *iref; 1494 u64 flags; 1495 u64 item_size; 1496 unsigned long ptr; 1497 unsigned long end; 1498 int extra_size; 1499 int type; 1500 int want; 1501 int ret; 1502 int err = 0; 1503 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 1504 SKINNY_METADATA); 1505 1506 key.objectid = bytenr; 1507 key.type = BTRFS_EXTENT_ITEM_KEY; 1508 key.offset = num_bytes; 1509 1510 want = extent_ref_type(parent, owner); 1511 if (insert) { 1512 extra_size = btrfs_extent_inline_ref_size(want); 1513 path->keep_locks = 1; 1514 } else 1515 extra_size = -1; 1516 1517 /* 1518 * Owner is our parent level, so we can just add one to get the level 1519 * for the block we are interested in. 
1520 */ 1521 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1522 key.type = BTRFS_METADATA_ITEM_KEY; 1523 key.offset = owner; 1524 } 1525 1526 again: 1527 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1528 if (ret < 0) { 1529 err = ret; 1530 goto out; 1531 } 1532 1533 /* 1534 * We may be a newly converted file system which still has the old fat 1535 * extent entries for metadata, so try and see if we have one of those. 1536 */ 1537 if (ret > 0 && skinny_metadata) { 1538 skinny_metadata = false; 1539 if (path->slots[0]) { 1540 path->slots[0]--; 1541 btrfs_item_key_to_cpu(path->nodes[0], &key, 1542 path->slots[0]); 1543 if (key.objectid == bytenr && 1544 key.type == BTRFS_EXTENT_ITEM_KEY && 1545 key.offset == num_bytes) 1546 ret = 0; 1547 } 1548 if (ret) { 1549 key.objectid = bytenr; 1550 key.type = BTRFS_EXTENT_ITEM_KEY; 1551 key.offset = num_bytes; 1552 btrfs_release_path(path); 1553 goto again; 1554 } 1555 } 1556 1557 if (ret && !insert) { 1558 err = -ENOENT; 1559 goto out; 1560 } else if (WARN_ON(ret)) { 1561 err = -EIO; 1562 goto out; 1563 } 1564 1565 leaf = path->nodes[0]; 1566 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1567 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1568 if (item_size < sizeof(*ei)) { 1569 if (!insert) { 1570 err = -ENOENT; 1571 goto out; 1572 } 1573 ret = convert_extent_item_v0(trans, root, path, owner, 1574 extra_size); 1575 if (ret < 0) { 1576 err = ret; 1577 goto out; 1578 } 1579 leaf = path->nodes[0]; 1580 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1581 } 1582 #endif 1583 BUG_ON(item_size < sizeof(*ei)); 1584 1585 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1586 flags = btrfs_extent_flags(leaf, ei); 1587 1588 ptr = (unsigned long)(ei + 1); 1589 end = (unsigned long)ei + item_size; 1590 1591 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1592 ptr += sizeof(struct btrfs_tree_block_info); 1593 BUG_ON(ptr > end); 1594 } 1595 1596 err = -ENOENT; 1597 while (1) { 1598 if (ptr >= end) { 1599 WARN_ON(ptr > end); 1600 break; 1601 } 1602 iref = (struct btrfs_extent_inline_ref *)ptr; 1603 type = btrfs_extent_inline_ref_type(leaf, iref); 1604 if (want < type) 1605 break; 1606 if (want > type) { 1607 ptr += btrfs_extent_inline_ref_size(type); 1608 continue; 1609 } 1610 1611 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1612 struct btrfs_extent_data_ref *dref; 1613 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1614 if (match_extent_data_ref(leaf, dref, root_objectid, 1615 owner, offset)) { 1616 err = 0; 1617 break; 1618 } 1619 if (hash_extent_data_ref_item(leaf, dref) < 1620 hash_extent_data_ref(root_objectid, owner, offset)) 1621 break; 1622 } else { 1623 u64 ref_offset; 1624 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1625 if (parent > 0) { 1626 if (parent == ref_offset) { 1627 err = 0; 1628 break; 1629 } 1630 if (ref_offset < parent) 1631 break; 1632 } else { 1633 if (root_objectid == ref_offset) { 1634 err = 0; 1635 break; 1636 } 1637 if (ref_offset < root_objectid) 1638 break; 1639 } 1640 } 1641 ptr += btrfs_extent_inline_ref_size(type); 1642 } 1643 if (err == -ENOENT && insert) { 1644 if (item_size + extra_size >= 1645 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1646 err = -EAGAIN; 1647 goto out; 1648 } 1649 /* 1650 * To add new inline back ref, we have to make sure 1651 * there is no corresponding back ref item. 
1652 * For simplicity, we just do not add new inline back 1653 * ref if there is any kind of item for this block 1654 */ 1655 if (find_next_key(path, 0, &key) == 0 && 1656 key.objectid == bytenr && 1657 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1658 err = -EAGAIN; 1659 goto out; 1660 } 1661 } 1662 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1663 out: 1664 if (insert) { 1665 path->keep_locks = 0; 1666 btrfs_unlock_up_safe(path, 1); 1667 } 1668 return err; 1669 } 1670 1671 /* 1672 * helper to add new inline back ref 1673 */ 1674 static noinline_for_stack 1675 void setup_inline_extent_backref(struct btrfs_root *root, 1676 struct btrfs_path *path, 1677 struct btrfs_extent_inline_ref *iref, 1678 u64 parent, u64 root_objectid, 1679 u64 owner, u64 offset, int refs_to_add, 1680 struct btrfs_delayed_extent_op *extent_op) 1681 { 1682 struct extent_buffer *leaf; 1683 struct btrfs_extent_item *ei; 1684 unsigned long ptr; 1685 unsigned long end; 1686 unsigned long item_offset; 1687 u64 refs; 1688 int size; 1689 int type; 1690 1691 leaf = path->nodes[0]; 1692 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1693 item_offset = (unsigned long)iref - (unsigned long)ei; 1694 1695 type = extent_ref_type(parent, owner); 1696 size = btrfs_extent_inline_ref_size(type); 1697 1698 btrfs_extend_item(root, path, size); 1699 1700 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1701 refs = btrfs_extent_refs(leaf, ei); 1702 refs += refs_to_add; 1703 btrfs_set_extent_refs(leaf, ei, refs); 1704 if (extent_op) 1705 __run_delayed_extent_op(extent_op, leaf, ei); 1706 1707 ptr = (unsigned long)ei + item_offset; 1708 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1709 if (ptr < end - size) 1710 memmove_extent_buffer(leaf, ptr + size, ptr, 1711 end - size - ptr); 1712 1713 iref = (struct btrfs_extent_inline_ref *)ptr; 1714 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1715 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1716 struct btrfs_extent_data_ref *dref; 1717 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1718 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1719 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1720 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1721 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1722 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1723 struct btrfs_shared_data_ref *sref; 1724 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1725 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1726 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1727 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1728 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1729 } else { 1730 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1731 } 1732 btrfs_mark_buffer_dirty(leaf); 1733 } 1734 1735 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1736 struct btrfs_root *root, 1737 struct btrfs_path *path, 1738 struct btrfs_extent_inline_ref **ref_ret, 1739 u64 bytenr, u64 num_bytes, u64 parent, 1740 u64 root_objectid, u64 owner, u64 offset) 1741 { 1742 int ret; 1743 1744 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1745 bytenr, num_bytes, parent, 1746 root_objectid, owner, offset, 0); 1747 if (ret != -ENOENT) 1748 return ret; 1749 1750 btrfs_release_path(path); 1751 *ref_ret = NULL; 1752 1753 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1754 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1755 root_objectid); 1756 } else { 1757 ret 
= lookup_extent_data_ref(trans, root, path, bytenr, parent, 1758 root_objectid, owner, offset); 1759 } 1760 return ret; 1761 } 1762 1763 /* 1764 * helper to update/remove inline back ref 1765 */ 1766 static noinline_for_stack 1767 void update_inline_extent_backref(struct btrfs_root *root, 1768 struct btrfs_path *path, 1769 struct btrfs_extent_inline_ref *iref, 1770 int refs_to_mod, 1771 struct btrfs_delayed_extent_op *extent_op, 1772 int *last_ref) 1773 { 1774 struct extent_buffer *leaf; 1775 struct btrfs_extent_item *ei; 1776 struct btrfs_extent_data_ref *dref = NULL; 1777 struct btrfs_shared_data_ref *sref = NULL; 1778 unsigned long ptr; 1779 unsigned long end; 1780 u32 item_size; 1781 int size; 1782 int type; 1783 u64 refs; 1784 1785 leaf = path->nodes[0]; 1786 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1787 refs = btrfs_extent_refs(leaf, ei); 1788 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1789 refs += refs_to_mod; 1790 btrfs_set_extent_refs(leaf, ei, refs); 1791 if (extent_op) 1792 __run_delayed_extent_op(extent_op, leaf, ei); 1793 1794 type = btrfs_extent_inline_ref_type(leaf, iref); 1795 1796 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1797 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1798 refs = btrfs_extent_data_ref_count(leaf, dref); 1799 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1800 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1801 refs = btrfs_shared_data_ref_count(leaf, sref); 1802 } else { 1803 refs = 1; 1804 BUG_ON(refs_to_mod != -1); 1805 } 1806 1807 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1808 refs += refs_to_mod; 1809 1810 if (refs > 0) { 1811 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1812 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1813 else 1814 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1815 } else { 1816 *last_ref = 1; 1817 size = btrfs_extent_inline_ref_size(type); 1818 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1819 ptr = (unsigned long)iref; 1820 end = (unsigned long)ei + item_size; 1821 if (ptr + size < end) 1822 memmove_extent_buffer(leaf, ptr, ptr + size, 1823 end - ptr - size); 1824 item_size -= size; 1825 btrfs_truncate_item(root, path, item_size, 1); 1826 } 1827 btrfs_mark_buffer_dirty(leaf); 1828 } 1829 1830 static noinline_for_stack 1831 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1832 struct btrfs_root *root, 1833 struct btrfs_path *path, 1834 u64 bytenr, u64 num_bytes, u64 parent, 1835 u64 root_objectid, u64 owner, 1836 u64 offset, int refs_to_add, 1837 struct btrfs_delayed_extent_op *extent_op) 1838 { 1839 struct btrfs_extent_inline_ref *iref; 1840 int ret; 1841 1842 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1843 bytenr, num_bytes, parent, 1844 root_objectid, owner, offset, 1); 1845 if (ret == 0) { 1846 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1847 update_inline_extent_backref(root, path, iref, 1848 refs_to_add, extent_op, NULL); 1849 } else if (ret == -ENOENT) { 1850 setup_inline_extent_backref(root, path, iref, parent, 1851 root_objectid, owner, offset, 1852 refs_to_add, extent_op); 1853 ret = 0; 1854 } 1855 return ret; 1856 } 1857 1858 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1859 struct btrfs_root *root, 1860 struct btrfs_path *path, 1861 u64 bytenr, u64 parent, u64 root_objectid, 1862 u64 owner, u64 offset, int refs_to_add) 1863 { 1864 int ret; 1865 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1866 BUG_ON(refs_to_add != 1); 1867 ret = insert_tree_block_ref(trans, root, path, bytenr, 1868 parent, 
root_objectid); 1869 } else { 1870 ret = insert_extent_data_ref(trans, root, path, bytenr, 1871 parent, root_objectid, 1872 owner, offset, refs_to_add); 1873 } 1874 return ret; 1875 } 1876 1877 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1878 struct btrfs_root *root, 1879 struct btrfs_path *path, 1880 struct btrfs_extent_inline_ref *iref, 1881 int refs_to_drop, int is_data, int *last_ref) 1882 { 1883 int ret = 0; 1884 1885 BUG_ON(!is_data && refs_to_drop != 1); 1886 if (iref) { 1887 update_inline_extent_backref(root, path, iref, 1888 -refs_to_drop, NULL, last_ref); 1889 } else if (is_data) { 1890 ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1891 last_ref); 1892 } else { 1893 *last_ref = 1; 1894 ret = btrfs_del_item(trans, root, path); 1895 } 1896 return ret; 1897 } 1898 1899 static int btrfs_issue_discard(struct block_device *bdev, 1900 u64 start, u64 len) 1901 { 1902 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); 1903 } 1904 1905 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1906 u64 num_bytes, u64 *actual_bytes) 1907 { 1908 int ret; 1909 u64 discarded_bytes = 0; 1910 struct btrfs_bio *bbio = NULL; 1911 1912 1913 /* Tell the block device(s) that the sectors can be discarded */ 1914 ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1915 bytenr, &num_bytes, &bbio, 0); 1916 /* Error condition is -ENOMEM */ 1917 if (!ret) { 1918 struct btrfs_bio_stripe *stripe = bbio->stripes; 1919 int i; 1920 1921 1922 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1923 if (!stripe->dev->can_discard) 1924 continue; 1925 1926 ret = btrfs_issue_discard(stripe->dev->bdev, 1927 stripe->physical, 1928 stripe->length); 1929 if (!ret) 1930 discarded_bytes += stripe->length; 1931 else if (ret != -EOPNOTSUPP) 1932 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 1933 1934 /* 1935 * Just in case we get back EOPNOTSUPP for some reason, 1936 * just ignore the return value so we don't screw up 1937 * people calling discard_extent. 
1938 */ 1939 ret = 0; 1940 } 1941 kfree(bbio); 1942 } 1943 1944 if (actual_bytes) 1945 *actual_bytes = discarded_bytes; 1946 1947 1948 if (ret == -EOPNOTSUPP) 1949 ret = 0; 1950 return ret; 1951 } 1952 1953 /* Can return -ENOMEM */ 1954 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1955 struct btrfs_root *root, 1956 u64 bytenr, u64 num_bytes, u64 parent, 1957 u64 root_objectid, u64 owner, u64 offset, 1958 int no_quota) 1959 { 1960 int ret; 1961 struct btrfs_fs_info *fs_info = root->fs_info; 1962 1963 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1964 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1965 1966 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1967 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1968 num_bytes, 1969 parent, root_objectid, (int)owner, 1970 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1971 } else { 1972 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1973 num_bytes, 1974 parent, root_objectid, owner, offset, 1975 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1976 } 1977 return ret; 1978 } 1979 1980 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1981 struct btrfs_root *root, 1982 u64 bytenr, u64 num_bytes, 1983 u64 parent, u64 root_objectid, 1984 u64 owner, u64 offset, int refs_to_add, 1985 int no_quota, 1986 struct btrfs_delayed_extent_op *extent_op) 1987 { 1988 struct btrfs_fs_info *fs_info = root->fs_info; 1989 struct btrfs_path *path; 1990 struct extent_buffer *leaf; 1991 struct btrfs_extent_item *item; 1992 struct btrfs_key key; 1993 u64 refs; 1994 int ret; 1995 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; 1996 1997 path = btrfs_alloc_path(); 1998 if (!path) 1999 return -ENOMEM; 2000 2001 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) 2002 no_quota = 1; 2003 2004 path->reada = 1; 2005 path->leave_spinning = 1; 2006 /* this will set up the path even if it fails to insert the back ref */ 2007 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, 2008 bytenr, num_bytes, parent, 2009 root_objectid, owner, offset, 2010 refs_to_add, extent_op); 2011 if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) 2012 goto out; 2013 /* 2014 * Ok we were able to insert an inline extent and it appears to be a new 2015 * reference, deal with the qgroup accounting. 2016 */ 2017 if (!ret && !no_quota) { 2018 ASSERT(root->fs_info->quota_enabled); 2019 leaf = path->nodes[0]; 2020 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2021 item = btrfs_item_ptr(leaf, path->slots[0], 2022 struct btrfs_extent_item); 2023 if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) 2024 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2025 btrfs_release_path(path); 2026 2027 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2028 bytenr, num_bytes, type, 0); 2029 goto out; 2030 } 2031 2032 /* 2033 * Ok we had -EAGAIN which means we didn't have space to insert an 2034 * inline extent ref, so just update the reference count and add a 2035 * normal backref.
2036 */ 2037 leaf = path->nodes[0]; 2038 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2039 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2040 refs = btrfs_extent_refs(leaf, item); 2041 if (refs) 2042 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2043 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2044 if (extent_op) 2045 __run_delayed_extent_op(extent_op, leaf, item); 2046 2047 btrfs_mark_buffer_dirty(leaf); 2048 btrfs_release_path(path); 2049 2050 if (!no_quota) { 2051 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2052 bytenr, num_bytes, type, 0); 2053 if (ret) 2054 goto out; 2055 } 2056 2057 path->reada = 1; 2058 path->leave_spinning = 1; 2059 /* now insert the actual backref */ 2060 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2061 path, bytenr, parent, root_objectid, 2062 owner, offset, refs_to_add); 2063 if (ret) 2064 btrfs_abort_transaction(trans, root, ret); 2065 out: 2066 btrfs_free_path(path); 2067 return ret; 2068 } 2069 2070 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2071 struct btrfs_root *root, 2072 struct btrfs_delayed_ref_node *node, 2073 struct btrfs_delayed_extent_op *extent_op, 2074 int insert_reserved) 2075 { 2076 int ret = 0; 2077 struct btrfs_delayed_data_ref *ref; 2078 struct btrfs_key ins; 2079 u64 parent = 0; 2080 u64 ref_root = 0; 2081 u64 flags = 0; 2082 2083 ins.objectid = node->bytenr; 2084 ins.offset = node->num_bytes; 2085 ins.type = BTRFS_EXTENT_ITEM_KEY; 2086 2087 ref = btrfs_delayed_node_to_data_ref(node); 2088 trace_run_delayed_data_ref(node, ref, node->action); 2089 2090 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2091 parent = ref->parent; 2092 ref_root = ref->root; 2093 2094 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2095 if (extent_op) 2096 flags |= extent_op->flags_to_set; 2097 ret = alloc_reserved_file_extent(trans, root, 2098 parent, ref_root, flags, 2099 ref->objectid, ref->offset, 2100 &ins, node->ref_mod); 2101 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2102 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2103 node->num_bytes, parent, 2104 ref_root, ref->objectid, 2105 ref->offset, node->ref_mod, 2106 node->no_quota, extent_op); 2107 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2108 ret = __btrfs_free_extent(trans, root, node->bytenr, 2109 node->num_bytes, parent, 2110 ref_root, ref->objectid, 2111 ref->offset, node->ref_mod, 2112 extent_op, node->no_quota); 2113 } else { 2114 BUG(); 2115 } 2116 return ret; 2117 } 2118 2119 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2120 struct extent_buffer *leaf, 2121 struct btrfs_extent_item *ei) 2122 { 2123 u64 flags = btrfs_extent_flags(leaf, ei); 2124 if (extent_op->update_flags) { 2125 flags |= extent_op->flags_to_set; 2126 btrfs_set_extent_flags(leaf, ei, flags); 2127 } 2128 2129 if (extent_op->update_key) { 2130 struct btrfs_tree_block_info *bi; 2131 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2132 bi = (struct btrfs_tree_block_info *)(ei + 1); 2133 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2134 } 2135 } 2136 2137 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2138 struct btrfs_root *root, 2139 struct btrfs_delayed_ref_node *node, 2140 struct btrfs_delayed_extent_op *extent_op) 2141 { 2142 struct btrfs_key key; 2143 struct btrfs_path *path; 2144 struct btrfs_extent_item *ei; 2145 struct extent_buffer *leaf; 2146 u32 item_size; 2147 int ret; 2148 int err = 0; 2149 int metadata = !extent_op->is_data; 2150 
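	/*
	 * Apply a deferred flags/key update directly to the extent item.
	 * With the SKINNY_METADATA feature the item is keyed as
	 * (bytenr, BTRFS_METADATA_ITEM_KEY, level); the search below falls
	 * back to the classic (bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes)
	 * key if that lookup misses.
	 */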
2151 if (trans->aborted) 2152 return 0; 2153 2154 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2155 metadata = 0; 2156 2157 path = btrfs_alloc_path(); 2158 if (!path) 2159 return -ENOMEM; 2160 2161 key.objectid = node->bytenr; 2162 2163 if (metadata) { 2164 key.type = BTRFS_METADATA_ITEM_KEY; 2165 key.offset = extent_op->level; 2166 } else { 2167 key.type = BTRFS_EXTENT_ITEM_KEY; 2168 key.offset = node->num_bytes; 2169 } 2170 2171 again: 2172 path->reada = 1; 2173 path->leave_spinning = 1; 2174 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2175 path, 0, 1); 2176 if (ret < 0) { 2177 err = ret; 2178 goto out; 2179 } 2180 if (ret > 0) { 2181 if (metadata) { 2182 if (path->slots[0] > 0) { 2183 path->slots[0]--; 2184 btrfs_item_key_to_cpu(path->nodes[0], &key, 2185 path->slots[0]); 2186 if (key.objectid == node->bytenr && 2187 key.type == BTRFS_EXTENT_ITEM_KEY && 2188 key.offset == node->num_bytes) 2189 ret = 0; 2190 } 2191 if (ret > 0) { 2192 btrfs_release_path(path); 2193 metadata = 0; 2194 2195 key.objectid = node->bytenr; 2196 key.offset = node->num_bytes; 2197 key.type = BTRFS_EXTENT_ITEM_KEY; 2198 goto again; 2199 } 2200 } else { 2201 err = -EIO; 2202 goto out; 2203 } 2204 } 2205 2206 leaf = path->nodes[0]; 2207 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2208 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2209 if (item_size < sizeof(*ei)) { 2210 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2211 path, (u64)-1, 0); 2212 if (ret < 0) { 2213 err = ret; 2214 goto out; 2215 } 2216 leaf = path->nodes[0]; 2217 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2218 } 2219 #endif 2220 BUG_ON(item_size < sizeof(*ei)); 2221 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2222 __run_delayed_extent_op(extent_op, leaf, ei); 2223 2224 btrfs_mark_buffer_dirty(leaf); 2225 out: 2226 btrfs_free_path(path); 2227 return err; 2228 } 2229 2230 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2231 struct btrfs_root *root, 2232 struct btrfs_delayed_ref_node *node, 2233 struct btrfs_delayed_extent_op *extent_op, 2234 int insert_reserved) 2235 { 2236 int ret = 0; 2237 struct btrfs_delayed_tree_ref *ref; 2238 struct btrfs_key ins; 2239 u64 parent = 0; 2240 u64 ref_root = 0; 2241 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2242 SKINNY_METADATA); 2243 2244 ref = btrfs_delayed_node_to_tree_ref(node); 2245 trace_run_delayed_tree_ref(node, ref, node->action); 2246 2247 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2248 parent = ref->parent; 2249 ref_root = ref->root; 2250 2251 ins.objectid = node->bytenr; 2252 if (skinny_metadata) { 2253 ins.offset = ref->level; 2254 ins.type = BTRFS_METADATA_ITEM_KEY; 2255 } else { 2256 ins.offset = node->num_bytes; 2257 ins.type = BTRFS_EXTENT_ITEM_KEY; 2258 } 2259 2260 BUG_ON(node->ref_mod != 1); 2261 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2262 BUG_ON(!extent_op || !extent_op->update_flags); 2263 ret = alloc_reserved_tree_block(trans, root, 2264 parent, ref_root, 2265 extent_op->flags_to_set, 2266 &extent_op->key, 2267 ref->level, &ins, 2268 node->no_quota); 2269 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2270 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2271 node->num_bytes, parent, ref_root, 2272 ref->level, 0, 1, node->no_quota, 2273 extent_op); 2274 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2275 ret = __btrfs_free_extent(trans, root, node->bytenr, 2276 node->num_bytes, parent, ref_root, 2277 ref->level, 0, 1, 
extent_op, 2278 node->no_quota); 2279 } else { 2280 BUG(); 2281 } 2282 return ret; 2283 } 2284 2285 /* helper function to actually process a single delayed ref entry */ 2286 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2287 struct btrfs_root *root, 2288 struct btrfs_delayed_ref_node *node, 2289 struct btrfs_delayed_extent_op *extent_op, 2290 int insert_reserved) 2291 { 2292 int ret = 0; 2293 2294 if (trans->aborted) { 2295 if (insert_reserved) 2296 btrfs_pin_extent(root, node->bytenr, 2297 node->num_bytes, 1); 2298 return 0; 2299 } 2300 2301 if (btrfs_delayed_ref_is_head(node)) { 2302 struct btrfs_delayed_ref_head *head; 2303 /* 2304 * we've hit the end of the chain and we were supposed 2305 * to insert this extent into the tree. But, it got 2306 * deleted before we ever needed to insert it, so all 2307 * we have to do is clean up the accounting 2308 */ 2309 BUG_ON(extent_op); 2310 head = btrfs_delayed_node_to_head(node); 2311 trace_run_delayed_ref_head(node, head, node->action); 2312 2313 if (insert_reserved) { 2314 btrfs_pin_extent(root, node->bytenr, 2315 node->num_bytes, 1); 2316 if (head->is_data) { 2317 ret = btrfs_del_csums(trans, root, 2318 node->bytenr, 2319 node->num_bytes); 2320 } 2321 } 2322 return ret; 2323 } 2324 2325 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2326 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2327 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2328 insert_reserved); 2329 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2330 node->type == BTRFS_SHARED_DATA_REF_KEY) 2331 ret = run_delayed_data_ref(trans, root, node, extent_op, 2332 insert_reserved); 2333 else 2334 BUG(); 2335 return ret; 2336 } 2337 2338 static noinline struct btrfs_delayed_ref_node * 2339 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2340 { 2341 struct rb_node *node; 2342 struct btrfs_delayed_ref_node *ref, *last = NULL; 2343 2344 /* 2345 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2346 * this prevents the ref count from going down to zero when 2347 * there are still pending delayed refs. 2348 */ 2349 node = rb_first(&head->ref_root); 2350 while (node) { 2351 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2352 rb_node); 2353 if (ref->action == BTRFS_ADD_DELAYED_REF) 2354 return ref; 2355 else if (last == NULL) 2356 last = ref; 2357 node = rb_next(node); 2358 } 2359 return last; 2360 } 2361 2362 /* 2363 * Returns 0 on success or if called with an already aborted transaction. 2364 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
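 *
 * 'nr' is an upper bound on how many delayed ref heads/entries are
 * processed in one pass; btrfs_run_delayed_refs() below derives it from
 * the number of queued entries, or passes (unsigned long)-1 when it wants
 * everything flushed.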
2365 */ 2366 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2367 struct btrfs_root *root, 2368 unsigned long nr) 2369 { 2370 struct btrfs_delayed_ref_root *delayed_refs; 2371 struct btrfs_delayed_ref_node *ref; 2372 struct btrfs_delayed_ref_head *locked_ref = NULL; 2373 struct btrfs_delayed_extent_op *extent_op; 2374 struct btrfs_fs_info *fs_info = root->fs_info; 2375 ktime_t start = ktime_get(); 2376 int ret; 2377 unsigned long count = 0; 2378 unsigned long actual_count = 0; 2379 int must_insert_reserved = 0; 2380 2381 delayed_refs = &trans->transaction->delayed_refs; 2382 while (1) { 2383 if (!locked_ref) { 2384 if (count >= nr) 2385 break; 2386 2387 spin_lock(&delayed_refs->lock); 2388 locked_ref = btrfs_select_ref_head(trans); 2389 if (!locked_ref) { 2390 spin_unlock(&delayed_refs->lock); 2391 break; 2392 } 2393 2394 /* grab the lock that says we are going to process 2395 * all the refs for this head */ 2396 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2397 spin_unlock(&delayed_refs->lock); 2398 /* 2399 * we may have dropped the spin lock to get the head 2400 * mutex lock, and that might have given someone else 2401 * time to free the head. If that's true, it has been 2402 * removed from our list and we can move on. 2403 */ 2404 if (ret == -EAGAIN) { 2405 locked_ref = NULL; 2406 count++; 2407 continue; 2408 } 2409 } 2410 2411 /* 2412 * We need to try and merge add/drops of the same ref since we 2413 * can run into issues with relocate dropping the implicit ref 2414 * and then it being added back again before the drop can 2415 * finish. If we merged anything we need to re-loop so we can 2416 * get a good ref. 2417 */ 2418 spin_lock(&locked_ref->lock); 2419 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2420 locked_ref); 2421 2422 /* 2423 * locked_ref is the head node, so we have to go one 2424 * node back for any delayed ref updates 2425 */ 2426 ref = select_delayed_ref(locked_ref); 2427 2428 if (ref && ref->seq && 2429 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2430 spin_unlock(&locked_ref->lock); 2431 btrfs_delayed_ref_unlock(locked_ref); 2432 spin_lock(&delayed_refs->lock); 2433 locked_ref->processing = 0; 2434 delayed_refs->num_heads_ready++; 2435 spin_unlock(&delayed_refs->lock); 2436 locked_ref = NULL; 2437 cond_resched(); 2438 count++; 2439 continue; 2440 } 2441 2442 /* 2443 * record the must insert reserved flag before we 2444 * drop the spin lock. 2445 */ 2446 must_insert_reserved = locked_ref->must_insert_reserved; 2447 locked_ref->must_insert_reserved = 0; 2448 2449 extent_op = locked_ref->extent_op; 2450 locked_ref->extent_op = NULL; 2451 2452 if (!ref) { 2453 2454 2455 /* All delayed refs have been processed, Go ahead 2456 * and send the head node to run_one_delayed_ref, 2457 * so that any accounting fixes can happen 2458 */ 2459 ref = &locked_ref->node; 2460 2461 if (extent_op && must_insert_reserved) { 2462 btrfs_free_delayed_extent_op(extent_op); 2463 extent_op = NULL; 2464 } 2465 2466 if (extent_op) { 2467 spin_unlock(&locked_ref->lock); 2468 ret = run_delayed_extent_op(trans, root, 2469 ref, extent_op); 2470 btrfs_free_delayed_extent_op(extent_op); 2471 2472 if (ret) { 2473 /* 2474 * Need to reset must_insert_reserved if 2475 * there was an error so the abort stuff 2476 * can cleanup the reserved space 2477 * properly. 
2478 */ 2479 if (must_insert_reserved) 2480 locked_ref->must_insert_reserved = 1; 2481 locked_ref->processing = 0; 2482 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2483 btrfs_delayed_ref_unlock(locked_ref); 2484 return ret; 2485 } 2486 continue; 2487 } 2488 2489 /* 2490 * Need to drop our head ref lock and re-acquire the 2491 * delayed ref lock and then re-check to make sure 2492 * nobody got added. 2493 */ 2494 spin_unlock(&locked_ref->lock); 2495 spin_lock(&delayed_refs->lock); 2496 spin_lock(&locked_ref->lock); 2497 if (rb_first(&locked_ref->ref_root) || 2498 locked_ref->extent_op) { 2499 spin_unlock(&locked_ref->lock); 2500 spin_unlock(&delayed_refs->lock); 2501 continue; 2502 } 2503 ref->in_tree = 0; 2504 delayed_refs->num_heads--; 2505 rb_erase(&locked_ref->href_node, 2506 &delayed_refs->href_root); 2507 spin_unlock(&delayed_refs->lock); 2508 } else { 2509 actual_count++; 2510 ref->in_tree = 0; 2511 rb_erase(&ref->rb_node, &locked_ref->ref_root); 2512 } 2513 atomic_dec(&delayed_refs->num_entries); 2514 2515 if (!btrfs_delayed_ref_is_head(ref)) { 2516 /* 2517 * when we play the delayed ref, also correct the 2518 * ref_mod on the head 2519 */ 2520 switch (ref->action) { 2521 case BTRFS_ADD_DELAYED_REF: 2522 case BTRFS_ADD_DELAYED_EXTENT: 2523 locked_ref->node.ref_mod -= ref->ref_mod; 2524 break; 2525 case BTRFS_DROP_DELAYED_REF: 2526 locked_ref->node.ref_mod += ref->ref_mod; 2527 break; 2528 default: 2529 WARN_ON(1); 2530 } 2531 } 2532 spin_unlock(&locked_ref->lock); 2533 2534 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2535 must_insert_reserved); 2536 2537 btrfs_free_delayed_extent_op(extent_op); 2538 if (ret) { 2539 locked_ref->processing = 0; 2540 btrfs_delayed_ref_unlock(locked_ref); 2541 btrfs_put_delayed_ref(ref); 2542 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2543 return ret; 2544 } 2545 2546 /* 2547 * If this node is a head, that means all the refs in this head 2548 * have been dealt with, and we will pick the next head to deal 2549 * with, so we must unlock the head and drop it from the cluster 2550 * list before we release it. 2551 */ 2552 if (btrfs_delayed_ref_is_head(ref)) { 2553 btrfs_delayed_ref_unlock(locked_ref); 2554 locked_ref = NULL; 2555 } 2556 btrfs_put_delayed_ref(ref); 2557 count++; 2558 cond_resched(); 2559 } 2560 2561 /* 2562 * We don't want to include ref heads since we can have empty ref heads 2563 * and those will drastically skew our runtime down since we just do 2564 * accounting, no actual extent tree updates. 2565 */ 2566 if (actual_count > 0) { 2567 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2568 u64 avg; 2569 2570 /* 2571 * We weigh the current average higher than our current runtime 2572 * to avoid large swings in the average. 2573 */ 2574 spin_lock(&delayed_refs->lock); 2575 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2576 avg = div64_u64(avg, 4); 2577 fs_info->avg_delayed_ref_runtime = avg; 2578 spin_unlock(&delayed_refs->lock); 2579 } 2580 return 0; 2581 } 2582 2583 #ifdef SCRAMBLE_DELAYED_REFS 2584 /* 2585 * Normally delayed refs get processed in ascending bytenr order. This 2586 * correlates in most cases to the order added.
To expose dependencies on this 2587 * order, we start to process the tree in the middle instead of the beginning 2588 */ 2589 static u64 find_middle(struct rb_root *root) 2590 { 2591 struct rb_node *n = root->rb_node; 2592 struct btrfs_delayed_ref_node *entry; 2593 int alt = 1; 2594 u64 middle; 2595 u64 first = 0, last = 0; 2596 2597 n = rb_first(root); 2598 if (n) { 2599 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2600 first = entry->bytenr; 2601 } 2602 n = rb_last(root); 2603 if (n) { 2604 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2605 last = entry->bytenr; 2606 } 2607 n = root->rb_node; 2608 2609 while (n) { 2610 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2611 WARN_ON(!entry->in_tree); 2612 2613 middle = entry->bytenr; 2614 2615 if (alt) 2616 n = n->rb_left; 2617 else 2618 n = n->rb_right; 2619 2620 alt = 1 - alt; 2621 } 2622 return middle; 2623 } 2624 #endif 2625 2626 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2627 { 2628 u64 num_bytes; 2629 2630 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2631 sizeof(struct btrfs_extent_inline_ref)); 2632 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2633 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2634 2635 /* 2636 * We don't ever fill up leaves all the way so multiply by 2 just to be 2637 * closer to what we're really going to want to use. 2638 */ 2639 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2640 } 2641 2642 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2643 struct btrfs_root *root) 2644 { 2645 struct btrfs_block_rsv *global_rsv; 2646 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2647 u64 num_bytes; 2648 int ret = 0; 2649 2650 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2651 num_heads = heads_to_leaves(root, num_heads); 2652 if (num_heads > 1) 2653 num_bytes += (num_heads - 1) * root->leafsize; 2654 num_bytes <<= 1; 2655 global_rsv = &root->fs_info->global_block_rsv; 2656 2657 /* 2658 * If we can't allocate any more chunks let's make sure we have _lots_ of 2659 * wiggle room since running delayed refs can create more delayed refs.
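 * The estimate has already been doubled once above, and it is doubled a
 * second time below when the space_info is full, so this only reports
 * pressure once the global reserve can no longer cover several times the
 * expected leaf churn (an informal reading of the arithmetic, not a hard
 * guarantee).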
2660 */ 2661 if (global_rsv->space_info->full) 2662 num_bytes <<= 1; 2663 2664 spin_lock(&global_rsv->lock); 2665 if (global_rsv->reserved <= num_bytes) 2666 ret = 1; 2667 spin_unlock(&global_rsv->lock); 2668 return ret; 2669 } 2670 2671 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2672 struct btrfs_root *root) 2673 { 2674 struct btrfs_fs_info *fs_info = root->fs_info; 2675 u64 num_entries = 2676 atomic_read(&trans->transaction->delayed_refs.num_entries); 2677 u64 avg_runtime; 2678 u64 val; 2679 2680 smp_mb(); 2681 avg_runtime = fs_info->avg_delayed_ref_runtime; 2682 val = num_entries * avg_runtime; 2683 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2684 return 1; 2685 if (val >= NSEC_PER_SEC / 2) 2686 return 2; 2687 2688 return btrfs_check_space_for_delayed_refs(trans, root); 2689 } 2690 2691 struct async_delayed_refs { 2692 struct btrfs_root *root; 2693 int count; 2694 int error; 2695 int sync; 2696 struct completion wait; 2697 struct btrfs_work work; 2698 }; 2699 2700 static void delayed_ref_async_start(struct btrfs_work *work) 2701 { 2702 struct async_delayed_refs *async; 2703 struct btrfs_trans_handle *trans; 2704 int ret; 2705 2706 async = container_of(work, struct async_delayed_refs, work); 2707 2708 trans = btrfs_join_transaction(async->root); 2709 if (IS_ERR(trans)) { 2710 async->error = PTR_ERR(trans); 2711 goto done; 2712 } 2713 2714 /* 2715 * trans->sync means that when we call end_transaction, we won't 2716 * wait on delayed refs 2717 */ 2718 trans->sync = true; 2719 ret = btrfs_run_delayed_refs(trans, async->root, async->count); 2720 if (ret) 2721 async->error = ret; 2722 2723 ret = btrfs_end_transaction(trans, async->root); 2724 if (ret && !async->error) 2725 async->error = ret; 2726 done: 2727 if (async->sync) 2728 complete(&async->wait); 2729 else 2730 kfree(async); 2731 } 2732 2733 int btrfs_async_run_delayed_refs(struct btrfs_root *root, 2734 unsigned long count, int wait) 2735 { 2736 struct async_delayed_refs *async; 2737 int ret; 2738 2739 async = kmalloc(sizeof(*async), GFP_NOFS); 2740 if (!async) 2741 return -ENOMEM; 2742 2743 async->root = root->fs_info->tree_root; 2744 async->count = count; 2745 async->error = 0; 2746 if (wait) 2747 async->sync = 1; 2748 else 2749 async->sync = 0; 2750 init_completion(&async->wait); 2751 2752 btrfs_init_work(&async->work, delayed_ref_async_start, 2753 NULL, NULL); 2754 2755 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2756 2757 if (wait) { 2758 wait_for_completion(&async->wait); 2759 ret = async->error; 2760 kfree(async); 2761 return ret; 2762 } 2763 return 0; 2764 } 2765 2766 /* 2767 * this starts processing the delayed reference count updates and 2768 * extent insertions we have queued up so far. count can be 2769 * 0, which means to process everything in the tree at the start 2770 * of the run (but not newly added entries), or it can be some target 2771 * number you'd like to process.
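 *
 * For example, btrfs_write_dirty_block_groups() below flushes everything
 * currently queued with:
 *
 *	err = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 *	if (err)		/* File system offline */
 *		goto out;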
2772 * 2773 * Returns 0 on success or if called with an aborted transaction 2774 * Returns <0 on error and aborts the transaction 2775 */ 2776 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2777 struct btrfs_root *root, unsigned long count) 2778 { 2779 struct rb_node *node; 2780 struct btrfs_delayed_ref_root *delayed_refs; 2781 struct btrfs_delayed_ref_head *head; 2782 int ret; 2783 int run_all = count == (unsigned long)-1; 2784 int run_most = 0; 2785 2786 /* We'll clean this up in btrfs_cleanup_transaction */ 2787 if (trans->aborted) 2788 return 0; 2789 2790 if (root == root->fs_info->extent_root) 2791 root = root->fs_info->tree_root; 2792 2793 delayed_refs = &trans->transaction->delayed_refs; 2794 if (count == 0) { 2795 count = atomic_read(&delayed_refs->num_entries) * 2; 2796 run_most = 1; 2797 } 2798 2799 again: 2800 #ifdef SCRAMBLE_DELAYED_REFS 2801 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2802 #endif 2803 ret = __btrfs_run_delayed_refs(trans, root, count); 2804 if (ret < 0) { 2805 btrfs_abort_transaction(trans, root, ret); 2806 return ret; 2807 } 2808 2809 if (run_all) { 2810 if (!list_empty(&trans->new_bgs)) 2811 btrfs_create_pending_block_groups(trans, root); 2812 2813 spin_lock(&delayed_refs->lock); 2814 node = rb_first(&delayed_refs->href_root); 2815 if (!node) { 2816 spin_unlock(&delayed_refs->lock); 2817 goto out; 2818 } 2819 count = (unsigned long)-1; 2820 2821 while (node) { 2822 head = rb_entry(node, struct btrfs_delayed_ref_head, 2823 href_node); 2824 if (btrfs_delayed_ref_is_head(&head->node)) { 2825 struct btrfs_delayed_ref_node *ref; 2826 2827 ref = &head->node; 2828 atomic_inc(&ref->refs); 2829 2830 spin_unlock(&delayed_refs->lock); 2831 /* 2832 * Mutex was contended, block until it's 2833 * released and try again 2834 */ 2835 mutex_lock(&head->mutex); 2836 mutex_unlock(&head->mutex); 2837 2838 btrfs_put_delayed_ref(ref); 2839 cond_resched(); 2840 goto again; 2841 } else { 2842 WARN_ON(1); 2843 } 2844 node = rb_next(node); 2845 } 2846 spin_unlock(&delayed_refs->lock); 2847 cond_resched(); 2848 goto again; 2849 } 2850 out: 2851 ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); 2852 if (ret) 2853 return ret; 2854 assert_qgroups_uptodate(trans); 2855 return 0; 2856 } 2857 2858 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2859 struct btrfs_root *root, 2860 u64 bytenr, u64 num_bytes, u64 flags, 2861 int level, int is_data) 2862 { 2863 struct btrfs_delayed_extent_op *extent_op; 2864 int ret; 2865 2866 extent_op = btrfs_alloc_delayed_extent_op(); 2867 if (!extent_op) 2868 return -ENOMEM; 2869 2870 extent_op->flags_to_set = flags; 2871 extent_op->update_flags = 1; 2872 extent_op->update_key = 0; 2873 extent_op->is_data = is_data ? 
1 : 0; 2874 extent_op->level = level; 2875 2876 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2877 num_bytes, extent_op); 2878 if (ret) 2879 btrfs_free_delayed_extent_op(extent_op); 2880 return ret; 2881 } 2882 2883 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2884 struct btrfs_root *root, 2885 struct btrfs_path *path, 2886 u64 objectid, u64 offset, u64 bytenr) 2887 { 2888 struct btrfs_delayed_ref_head *head; 2889 struct btrfs_delayed_ref_node *ref; 2890 struct btrfs_delayed_data_ref *data_ref; 2891 struct btrfs_delayed_ref_root *delayed_refs; 2892 struct rb_node *node; 2893 int ret = 0; 2894 2895 delayed_refs = &trans->transaction->delayed_refs; 2896 spin_lock(&delayed_refs->lock); 2897 head = btrfs_find_delayed_ref_head(trans, bytenr); 2898 if (!head) { 2899 spin_unlock(&delayed_refs->lock); 2900 return 0; 2901 } 2902 2903 if (!mutex_trylock(&head->mutex)) { 2904 atomic_inc(&head->node.refs); 2905 spin_unlock(&delayed_refs->lock); 2906 2907 btrfs_release_path(path); 2908 2909 /* 2910 * Mutex was contended, block until it's released and let 2911 * caller try again 2912 */ 2913 mutex_lock(&head->mutex); 2914 mutex_unlock(&head->mutex); 2915 btrfs_put_delayed_ref(&head->node); 2916 return -EAGAIN; 2917 } 2918 spin_unlock(&delayed_refs->lock); 2919 2920 spin_lock(&head->lock); 2921 node = rb_first(&head->ref_root); 2922 while (node) { 2923 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2924 node = rb_next(node); 2925 2926 /* If it's a shared ref we know a cross reference exists */ 2927 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2928 ret = 1; 2929 break; 2930 } 2931 2932 data_ref = btrfs_delayed_node_to_data_ref(ref); 2933 2934 /* 2935 * If our ref doesn't match the one we're currently looking at 2936 * then we have a cross reference. 
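 * (e.g. the same data extent referenced from a snapshot's root, or from a
 * different objectid/offset after a clone/reflink).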
2937 */ 2938 if (data_ref->root != root->root_key.objectid || 2939 data_ref->objectid != objectid || 2940 data_ref->offset != offset) { 2941 ret = 1; 2942 break; 2943 } 2944 } 2945 spin_unlock(&head->lock); 2946 mutex_unlock(&head->mutex); 2947 return ret; 2948 } 2949 2950 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2951 struct btrfs_root *root, 2952 struct btrfs_path *path, 2953 u64 objectid, u64 offset, u64 bytenr) 2954 { 2955 struct btrfs_root *extent_root = root->fs_info->extent_root; 2956 struct extent_buffer *leaf; 2957 struct btrfs_extent_data_ref *ref; 2958 struct btrfs_extent_inline_ref *iref; 2959 struct btrfs_extent_item *ei; 2960 struct btrfs_key key; 2961 u32 item_size; 2962 int ret; 2963 2964 key.objectid = bytenr; 2965 key.offset = (u64)-1; 2966 key.type = BTRFS_EXTENT_ITEM_KEY; 2967 2968 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2969 if (ret < 0) 2970 goto out; 2971 BUG_ON(ret == 0); /* Corruption */ 2972 2973 ret = -ENOENT; 2974 if (path->slots[0] == 0) 2975 goto out; 2976 2977 path->slots[0]--; 2978 leaf = path->nodes[0]; 2979 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2980 2981 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2982 goto out; 2983 2984 ret = 1; 2985 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2986 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2987 if (item_size < sizeof(*ei)) { 2988 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2989 goto out; 2990 } 2991 #endif 2992 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2993 2994 if (item_size != sizeof(*ei) + 2995 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2996 goto out; 2997 2998 if (btrfs_extent_generation(leaf, ei) <= 2999 btrfs_root_last_snapshot(&root->root_item)) 3000 goto out; 3001 3002 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3003 if (btrfs_extent_inline_ref_type(leaf, iref) != 3004 BTRFS_EXTENT_DATA_REF_KEY) 3005 goto out; 3006 3007 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3008 if (btrfs_extent_refs(leaf, ei) != 3009 btrfs_extent_data_ref_count(leaf, ref) || 3010 btrfs_extent_data_ref_root(leaf, ref) != 3011 root->root_key.objectid || 3012 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3013 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3014 goto out; 3015 3016 ret = 0; 3017 out: 3018 return ret; 3019 } 3020 3021 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3022 struct btrfs_root *root, 3023 u64 objectid, u64 offset, u64 bytenr) 3024 { 3025 struct btrfs_path *path; 3026 int ret; 3027 int ret2; 3028 3029 path = btrfs_alloc_path(); 3030 if (!path) 3031 return -ENOENT; 3032 3033 do { 3034 ret = check_committed_ref(trans, root, path, objectid, 3035 offset, bytenr); 3036 if (ret && ret != -ENOENT) 3037 goto out; 3038 3039 ret2 = check_delayed_ref(trans, root, path, objectid, 3040 offset, bytenr); 3041 } while (ret2 == -EAGAIN); 3042 3043 if (ret2 && ret2 != -ENOENT) { 3044 ret = ret2; 3045 goto out; 3046 } 3047 3048 if (ret != -ENOENT || ret2 != -ENOENT) 3049 ret = 0; 3050 out: 3051 btrfs_free_path(path); 3052 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3053 WARN_ON(ret > 0); 3054 return ret; 3055 } 3056 3057 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3058 struct btrfs_root *root, 3059 struct extent_buffer *buf, 3060 int full_backref, int inc, int no_quota) 3061 { 3062 u64 bytenr; 3063 u64 num_bytes; 3064 u64 parent; 3065 u64 ref_root; 3066 u32 nritems; 3067 struct btrfs_key key; 3068 struct 
btrfs_file_extent_item *fi; 3069 int i; 3070 int level; 3071 int ret = 0; 3072 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3073 u64, u64, u64, u64, u64, u64, int); 3074 3075 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3076 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) 3077 return 0; 3078 #endif 3079 ref_root = btrfs_header_owner(buf); 3080 nritems = btrfs_header_nritems(buf); 3081 level = btrfs_header_level(buf); 3082 3083 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3084 return 0; 3085 3086 if (inc) 3087 process_func = btrfs_inc_extent_ref; 3088 else 3089 process_func = btrfs_free_extent; 3090 3091 if (full_backref) 3092 parent = buf->start; 3093 else 3094 parent = 0; 3095 3096 for (i = 0; i < nritems; i++) { 3097 if (level == 0) { 3098 btrfs_item_key_to_cpu(buf, &key, i); 3099 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3100 continue; 3101 fi = btrfs_item_ptr(buf, i, 3102 struct btrfs_file_extent_item); 3103 if (btrfs_file_extent_type(buf, fi) == 3104 BTRFS_FILE_EXTENT_INLINE) 3105 continue; 3106 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3107 if (bytenr == 0) 3108 continue; 3109 3110 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3111 key.offset -= btrfs_file_extent_offset(buf, fi); 3112 ret = process_func(trans, root, bytenr, num_bytes, 3113 parent, ref_root, key.objectid, 3114 key.offset, no_quota); 3115 if (ret) 3116 goto fail; 3117 } else { 3118 bytenr = btrfs_node_blockptr(buf, i); 3119 num_bytes = btrfs_level_size(root, level - 1); 3120 ret = process_func(trans, root, bytenr, num_bytes, 3121 parent, ref_root, level - 1, 0, 3122 no_quota); 3123 if (ret) 3124 goto fail; 3125 } 3126 } 3127 return 0; 3128 fail: 3129 return ret; 3130 } 3131 3132 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3133 struct extent_buffer *buf, int full_backref, int no_quota) 3134 { 3135 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); 3136 } 3137 3138 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3139 struct extent_buffer *buf, int full_backref, int no_quota) 3140 { 3141 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); 3142 } 3143 3144 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3145 struct btrfs_root *root, 3146 struct btrfs_path *path, 3147 struct btrfs_block_group_cache *cache) 3148 { 3149 int ret; 3150 struct btrfs_root *extent_root = root->fs_info->extent_root; 3151 unsigned long bi; 3152 struct extent_buffer *leaf; 3153 3154 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3155 if (ret < 0) 3156 goto fail; 3157 BUG_ON(ret); /* Corruption */ 3158 3159 leaf = path->nodes[0]; 3160 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3161 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3162 btrfs_mark_buffer_dirty(leaf); 3163 btrfs_release_path(path); 3164 fail: 3165 if (ret) { 3166 btrfs_abort_transaction(trans, root, ret); 3167 return ret; 3168 } 3169 return 0; 3170 3171 } 3172 3173 static struct btrfs_block_group_cache * 3174 next_block_group(struct btrfs_root *root, 3175 struct btrfs_block_group_cache *cache) 3176 { 3177 struct rb_node *node; 3178 spin_lock(&root->fs_info->block_group_cache_lock); 3179 node = rb_next(&cache->cache_node); 3180 btrfs_put_block_group(cache); 3181 if (node) { 3182 cache = rb_entry(node, struct btrfs_block_group_cache, 3183 cache_node); 3184 btrfs_get_block_group(cache); 3185 } else 3186 cache = NULL; 3187 
spin_unlock(&root->fs_info->block_group_cache_lock); 3188 return cache; 3189 } 3190 3191 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3192 struct btrfs_trans_handle *trans, 3193 struct btrfs_path *path) 3194 { 3195 struct btrfs_root *root = block_group->fs_info->tree_root; 3196 struct inode *inode = NULL; 3197 u64 alloc_hint = 0; 3198 int dcs = BTRFS_DC_ERROR; 3199 int num_pages = 0; 3200 int retries = 0; 3201 int ret = 0; 3202 3203 /* 3204 * If this block group is smaller than 100 megs don't bother caching the 3205 * block group. 3206 */ 3207 if (block_group->key.offset < (100 * 1024 * 1024)) { 3208 spin_lock(&block_group->lock); 3209 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3210 spin_unlock(&block_group->lock); 3211 return 0; 3212 } 3213 3214 again: 3215 inode = lookup_free_space_inode(root, block_group, path); 3216 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3217 ret = PTR_ERR(inode); 3218 btrfs_release_path(path); 3219 goto out; 3220 } 3221 3222 if (IS_ERR(inode)) { 3223 BUG_ON(retries); 3224 retries++; 3225 3226 if (block_group->ro) 3227 goto out_free; 3228 3229 ret = create_free_space_inode(root, trans, block_group, path); 3230 if (ret) 3231 goto out_free; 3232 goto again; 3233 } 3234 3235 /* We've already setup this transaction, go ahead and exit */ 3236 if (block_group->cache_generation == trans->transid && 3237 i_size_read(inode)) { 3238 dcs = BTRFS_DC_SETUP; 3239 goto out_put; 3240 } 3241 3242 /* 3243 * We want to set the generation to 0, that way if anything goes wrong 3244 * from here on out we know not to trust this cache when we load up next 3245 * time. 3246 */ 3247 BTRFS_I(inode)->generation = 0; 3248 ret = btrfs_update_inode(trans, root, inode); 3249 WARN_ON(ret); 3250 3251 if (i_size_read(inode) > 0) { 3252 ret = btrfs_check_trunc_cache_free_space(root, 3253 &root->fs_info->global_block_rsv); 3254 if (ret) 3255 goto out_put; 3256 3257 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3258 if (ret) 3259 goto out_put; 3260 } 3261 3262 spin_lock(&block_group->lock); 3263 if (block_group->cached != BTRFS_CACHE_FINISHED || 3264 !btrfs_test_opt(root, SPACE_CACHE) || 3265 block_group->delalloc_bytes) { 3266 /* 3267 * don't bother trying to write stuff out _if_ 3268 * a) we're not cached, 3269 * b) we're with nospace_cache mount option. 3270 */ 3271 dcs = BTRFS_DC_WRITTEN; 3272 spin_unlock(&block_group->lock); 3273 goto out_put; 3274 } 3275 spin_unlock(&block_group->lock); 3276 3277 /* 3278 * Try to preallocate enough space based on how big the block group is. 3279 * Keep in mind this has to include any pinned space which could end up 3280 * taking up quite a bit since it's not folded into the other space 3281 * cache. 
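 * The sizing below works out to 16 pages per 256MB of block group, e.g. a
 * 1GB block group preallocates 64 pages (256K with 4K pages), and even the
 * smallest group that gets this far preallocates one 16-page unit.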
3282 */ 3283 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3284 if (!num_pages) 3285 num_pages = 1; 3286 3287 num_pages *= 16; 3288 num_pages *= PAGE_CACHE_SIZE; 3289 3290 ret = btrfs_check_data_free_space(inode, num_pages); 3291 if (ret) 3292 goto out_put; 3293 3294 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3295 num_pages, num_pages, 3296 &alloc_hint); 3297 if (!ret) 3298 dcs = BTRFS_DC_SETUP; 3299 btrfs_free_reserved_data_space(inode, num_pages); 3300 3301 out_put: 3302 iput(inode); 3303 out_free: 3304 btrfs_release_path(path); 3305 out: 3306 spin_lock(&block_group->lock); 3307 if (!ret && dcs == BTRFS_DC_SETUP) 3308 block_group->cache_generation = trans->transid; 3309 block_group->disk_cache_state = dcs; 3310 spin_unlock(&block_group->lock); 3311 3312 return ret; 3313 } 3314 3315 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3316 struct btrfs_root *root) 3317 { 3318 struct btrfs_block_group_cache *cache; 3319 int err = 0; 3320 struct btrfs_path *path; 3321 u64 last = 0; 3322 3323 path = btrfs_alloc_path(); 3324 if (!path) 3325 return -ENOMEM; 3326 3327 again: 3328 while (1) { 3329 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3330 while (cache) { 3331 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3332 break; 3333 cache = next_block_group(root, cache); 3334 } 3335 if (!cache) { 3336 if (last == 0) 3337 break; 3338 last = 0; 3339 continue; 3340 } 3341 err = cache_save_setup(cache, trans, path); 3342 last = cache->key.objectid + cache->key.offset; 3343 btrfs_put_block_group(cache); 3344 } 3345 3346 while (1) { 3347 if (last == 0) { 3348 err = btrfs_run_delayed_refs(trans, root, 3349 (unsigned long)-1); 3350 if (err) /* File system offline */ 3351 goto out; 3352 } 3353 3354 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3355 while (cache) { 3356 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 3357 btrfs_put_block_group(cache); 3358 goto again; 3359 } 3360 3361 if (cache->dirty) 3362 break; 3363 cache = next_block_group(root, cache); 3364 } 3365 if (!cache) { 3366 if (last == 0) 3367 break; 3368 last = 0; 3369 continue; 3370 } 3371 3372 if (cache->disk_cache_state == BTRFS_DC_SETUP) 3373 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 3374 cache->dirty = 0; 3375 last = cache->key.objectid + cache->key.offset; 3376 3377 err = write_one_cache_group(trans, root, path, cache); 3378 btrfs_put_block_group(cache); 3379 if (err) /* File system offline */ 3380 goto out; 3381 } 3382 3383 while (1) { 3384 /* 3385 * I don't think this is needed since we're just marking our 3386 * preallocated extent as written, but just in case it can't 3387 * hurt. 3388 */ 3389 if (last == 0) { 3390 err = btrfs_run_delayed_refs(trans, root, 3391 (unsigned long)-1); 3392 if (err) /* File system offline */ 3393 goto out; 3394 } 3395 3396 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3397 while (cache) { 3398 /* 3399 * Really this shouldn't happen, but it could if we 3400 * couldn't write the entire preallocated extent and 3401 * splitting the extent resulted in a new block. 
3402 */ 3403 if (cache->dirty) { 3404 btrfs_put_block_group(cache); 3405 goto again; 3406 } 3407 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3408 break; 3409 cache = next_block_group(root, cache); 3410 } 3411 if (!cache) { 3412 if (last == 0) 3413 break; 3414 last = 0; 3415 continue; 3416 } 3417 3418 err = btrfs_write_out_cache(root, trans, cache, path); 3419 3420 /* 3421 * If we didn't have an error then the cache state is still 3422 * NEED_WRITE, so we can set it to WRITTEN. 3423 */ 3424 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3425 cache->disk_cache_state = BTRFS_DC_WRITTEN; 3426 last = cache->key.objectid + cache->key.offset; 3427 btrfs_put_block_group(cache); 3428 } 3429 out: 3430 3431 btrfs_free_path(path); 3432 return err; 3433 } 3434 3435 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3436 { 3437 struct btrfs_block_group_cache *block_group; 3438 int readonly = 0; 3439 3440 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3441 if (!block_group || block_group->ro) 3442 readonly = 1; 3443 if (block_group) 3444 btrfs_put_block_group(block_group); 3445 return readonly; 3446 } 3447 3448 static const char *alloc_name(u64 flags) 3449 { 3450 switch (flags) { 3451 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3452 return "mixed"; 3453 case BTRFS_BLOCK_GROUP_METADATA: 3454 return "metadata"; 3455 case BTRFS_BLOCK_GROUP_DATA: 3456 return "data"; 3457 case BTRFS_BLOCK_GROUP_SYSTEM: 3458 return "system"; 3459 default: 3460 WARN_ON(1); 3461 return "invalid-combination"; 3462 }; 3463 } 3464 3465 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3466 u64 total_bytes, u64 bytes_used, 3467 struct btrfs_space_info **space_info) 3468 { 3469 struct btrfs_space_info *found; 3470 int i; 3471 int factor; 3472 int ret; 3473 3474 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3475 BTRFS_BLOCK_GROUP_RAID10)) 3476 factor = 2; 3477 else 3478 factor = 1; 3479 3480 found = __find_space_info(info, flags); 3481 if (found) { 3482 spin_lock(&found->lock); 3483 found->total_bytes += total_bytes; 3484 found->disk_total += total_bytes * factor; 3485 found->bytes_used += bytes_used; 3486 found->disk_used += bytes_used * factor; 3487 found->full = 0; 3488 spin_unlock(&found->lock); 3489 *space_info = found; 3490 return 0; 3491 } 3492 found = kzalloc(sizeof(*found), GFP_NOFS); 3493 if (!found) 3494 return -ENOMEM; 3495 3496 ret = percpu_counter_init(&found->total_bytes_pinned, 0); 3497 if (ret) { 3498 kfree(found); 3499 return ret; 3500 } 3501 3502 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3503 INIT_LIST_HEAD(&found->block_groups[i]); 3504 init_rwsem(&found->groups_sem); 3505 spin_lock_init(&found->lock); 3506 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3507 found->total_bytes = total_bytes; 3508 found->disk_total = total_bytes * factor; 3509 found->bytes_used = bytes_used; 3510 found->disk_used = bytes_used * factor; 3511 found->bytes_pinned = 0; 3512 found->bytes_reserved = 0; 3513 found->bytes_readonly = 0; 3514 found->bytes_may_use = 0; 3515 found->full = 0; 3516 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3517 found->chunk_alloc = 0; 3518 found->flush = 0; 3519 init_waitqueue_head(&found->wait); 3520 3521 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3522 info->space_info_kobj, "%s", 3523 alloc_name(found->flags)); 3524 if (ret) { 3525 kfree(found); 3526 return ret; 3527 } 3528 3529 *space_info = found; 3530 list_add_rcu(&found->list, &info->space_info); 3531 if (flags & BTRFS_BLOCK_GROUP_DATA) 
3532 info->data_sinfo = found; 3533 3534 return ret; 3535 } 3536 3537 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3538 { 3539 u64 extra_flags = chunk_to_extended(flags) & 3540 BTRFS_EXTENDED_PROFILE_MASK; 3541 3542 write_seqlock(&fs_info->profiles_lock); 3543 if (flags & BTRFS_BLOCK_GROUP_DATA) 3544 fs_info->avail_data_alloc_bits |= extra_flags; 3545 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3546 fs_info->avail_metadata_alloc_bits |= extra_flags; 3547 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3548 fs_info->avail_system_alloc_bits |= extra_flags; 3549 write_sequnlock(&fs_info->profiles_lock); 3550 } 3551 3552 /* 3553 * returns target flags in extended format or 0 if restripe for this 3554 * chunk_type is not in progress 3555 * 3556 * should be called with either volume_mutex or balance_lock held 3557 */ 3558 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3559 { 3560 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3561 u64 target = 0; 3562 3563 if (!bctl) 3564 return 0; 3565 3566 if (flags & BTRFS_BLOCK_GROUP_DATA && 3567 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3568 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3569 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3570 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3571 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3572 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3573 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3574 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3575 } 3576 3577 return target; 3578 } 3579 3580 /* 3581 * @flags: available profiles in extended format (see ctree.h) 3582 * 3583 * Returns reduced profile in chunk format. If profile changing is in 3584 * progress (either running or paused) picks the target profile (if it's 3585 * already available), otherwise falls back to plain reducing. 3586 */ 3587 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3588 { 3589 /* 3590 * we add in the count of missing devices because we want 3591 * to make sure that any RAID levels on a degraded FS 3592 * continue to be honored. 
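 * (e.g. a two-device RAID1 filesystem mounted degraded with one device
 * missing still counts num_devices == 2, so the RAID1 bit is not masked
 * out below).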
3593 */ 3594 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3595 root->fs_info->fs_devices->missing_devices; 3596 u64 target; 3597 u64 tmp; 3598 3599 /* 3600 * see if restripe for this chunk_type is in progress, if so 3601 * try to reduce to the target profile 3602 */ 3603 spin_lock(&root->fs_info->balance_lock); 3604 target = get_restripe_target(root->fs_info, flags); 3605 if (target) { 3606 /* pick target profile only if it's already available */ 3607 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3608 spin_unlock(&root->fs_info->balance_lock); 3609 return extended_to_chunk(target); 3610 } 3611 } 3612 spin_unlock(&root->fs_info->balance_lock); 3613 3614 /* First, mask out the RAID levels which aren't possible */ 3615 if (num_devices == 1) 3616 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3617 BTRFS_BLOCK_GROUP_RAID5); 3618 if (num_devices < 3) 3619 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3620 if (num_devices < 4) 3621 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3622 3623 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3624 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3625 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3626 flags &= ~tmp; 3627 3628 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3629 tmp = BTRFS_BLOCK_GROUP_RAID6; 3630 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3631 tmp = BTRFS_BLOCK_GROUP_RAID5; 3632 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3633 tmp = BTRFS_BLOCK_GROUP_RAID10; 3634 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3635 tmp = BTRFS_BLOCK_GROUP_RAID1; 3636 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3637 tmp = BTRFS_BLOCK_GROUP_RAID0; 3638 3639 return extended_to_chunk(flags | tmp); 3640 } 3641 3642 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 3643 { 3644 unsigned seq; 3645 u64 flags; 3646 3647 do { 3648 flags = orig_flags; 3649 seq = read_seqbegin(&root->fs_info->profiles_lock); 3650 3651 if (flags & BTRFS_BLOCK_GROUP_DATA) 3652 flags |= root->fs_info->avail_data_alloc_bits; 3653 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3654 flags |= root->fs_info->avail_system_alloc_bits; 3655 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3656 flags |= root->fs_info->avail_metadata_alloc_bits; 3657 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3658 3659 return btrfs_reduce_alloc_profile(root, flags); 3660 } 3661 3662 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3663 { 3664 u64 flags; 3665 u64 ret; 3666 3667 if (data) 3668 flags = BTRFS_BLOCK_GROUP_DATA; 3669 else if (root == root->fs_info->chunk_root) 3670 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3671 else 3672 flags = BTRFS_BLOCK_GROUP_METADATA; 3673 3674 ret = get_alloc_profile(root, flags); 3675 return ret; 3676 } 3677 3678 /* 3679 * This will check the space that the inode allocates from to make sure we have 3680 * enough space for bytes. 
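 *
 * A successful call adds 'bytes' (rounded up to sectorsize) to
 * bytes_may_use; callers that end up not using the space are expected to
 * drop it again with btrfs_free_reserved_data_space(), the way
 * cache_save_setup() above pairs the two calls around its preallocation.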
3681 */ 3682 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3683 { 3684 struct btrfs_space_info *data_sinfo; 3685 struct btrfs_root *root = BTRFS_I(inode)->root; 3686 struct btrfs_fs_info *fs_info = root->fs_info; 3687 u64 used; 3688 int ret = 0, committed = 0, alloc_chunk = 1; 3689 3690 /* make sure bytes are sectorsize aligned */ 3691 bytes = ALIGN(bytes, root->sectorsize); 3692 3693 if (btrfs_is_free_space_inode(inode)) { 3694 committed = 1; 3695 ASSERT(current->journal_info); 3696 } 3697 3698 data_sinfo = fs_info->data_sinfo; 3699 if (!data_sinfo) 3700 goto alloc; 3701 3702 again: 3703 /* make sure we have enough space to handle the data first */ 3704 spin_lock(&data_sinfo->lock); 3705 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3706 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3707 data_sinfo->bytes_may_use; 3708 3709 if (used + bytes > data_sinfo->total_bytes) { 3710 struct btrfs_trans_handle *trans; 3711 3712 /* 3713 * if we don't have enough free bytes in this space then we need 3714 * to alloc a new chunk. 3715 */ 3716 if (!data_sinfo->full && alloc_chunk) { 3717 u64 alloc_target; 3718 3719 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3720 spin_unlock(&data_sinfo->lock); 3721 alloc: 3722 alloc_target = btrfs_get_alloc_profile(root, 1); 3723 /* 3724 * It is ugly that we don't call nolock join 3725 * transaction for the free space inode case here. 3726 * But it is safe because we only do the data space 3727 * reservation for the free space cache in the 3728 * transaction context, the common join transaction 3729 * just increase the counter of the current transaction 3730 * handler, doesn't try to acquire the trans_lock of 3731 * the fs. 3732 */ 3733 trans = btrfs_join_transaction(root); 3734 if (IS_ERR(trans)) 3735 return PTR_ERR(trans); 3736 3737 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3738 alloc_target, 3739 CHUNK_ALLOC_NO_FORCE); 3740 btrfs_end_transaction(trans, root); 3741 if (ret < 0) { 3742 if (ret != -ENOSPC) 3743 return ret; 3744 else 3745 goto commit_trans; 3746 } 3747 3748 if (!data_sinfo) 3749 data_sinfo = fs_info->data_sinfo; 3750 3751 goto again; 3752 } 3753 3754 /* 3755 * If we don't have enough pinned space to deal with this 3756 * allocation don't bother committing the transaction. 3757 */ 3758 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3759 bytes) < 0) 3760 committed = 1; 3761 spin_unlock(&data_sinfo->lock); 3762 3763 /* commit the current transaction and try again */ 3764 commit_trans: 3765 if (!committed && 3766 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3767 committed = 1; 3768 3769 trans = btrfs_join_transaction(root); 3770 if (IS_ERR(trans)) 3771 return PTR_ERR(trans); 3772 ret = btrfs_commit_transaction(trans, root); 3773 if (ret) 3774 return ret; 3775 goto again; 3776 } 3777 3778 trace_btrfs_space_reservation(root->fs_info, 3779 "space_info:enospc", 3780 data_sinfo->flags, bytes, 1); 3781 return -ENOSPC; 3782 } 3783 data_sinfo->bytes_may_use += bytes; 3784 trace_btrfs_space_reservation(root->fs_info, "space_info", 3785 data_sinfo->flags, bytes, 1); 3786 spin_unlock(&data_sinfo->lock); 3787 3788 return 0; 3789 } 3790 3791 /* 3792 * Called if we need to clear a data reservation for this inode. 
3793 */ 3794 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3795 { 3796 struct btrfs_root *root = BTRFS_I(inode)->root; 3797 struct btrfs_space_info *data_sinfo; 3798 3799 /* make sure bytes are sectorsize aligned */ 3800 bytes = ALIGN(bytes, root->sectorsize); 3801 3802 data_sinfo = root->fs_info->data_sinfo; 3803 spin_lock(&data_sinfo->lock); 3804 WARN_ON(data_sinfo->bytes_may_use < bytes); 3805 data_sinfo->bytes_may_use -= bytes; 3806 trace_btrfs_space_reservation(root->fs_info, "space_info", 3807 data_sinfo->flags, bytes, 0); 3808 spin_unlock(&data_sinfo->lock); 3809 } 3810 3811 static void force_metadata_allocation(struct btrfs_fs_info *info) 3812 { 3813 struct list_head *head = &info->space_info; 3814 struct btrfs_space_info *found; 3815 3816 rcu_read_lock(); 3817 list_for_each_entry_rcu(found, head, list) { 3818 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3819 found->force_alloc = CHUNK_ALLOC_FORCE; 3820 } 3821 rcu_read_unlock(); 3822 } 3823 3824 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3825 { 3826 return (global->size << 1); 3827 } 3828 3829 static int should_alloc_chunk(struct btrfs_root *root, 3830 struct btrfs_space_info *sinfo, int force) 3831 { 3832 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3833 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3834 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3835 u64 thresh; 3836 3837 if (force == CHUNK_ALLOC_FORCE) 3838 return 1; 3839 3840 /* 3841 * We need to take into account the global rsv because for all intents 3842 * and purposes it's used space. Don't worry about locking the 3843 * global_rsv, it doesn't change except when the transaction commits. 3844 */ 3845 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3846 num_allocated += calc_global_rsv_need_space(global_rsv); 3847 3848 /* 3849 * in limited mode, we want to have some free space up to 3850 * about 1% of the FS size. 
3851 */ 3852 if (force == CHUNK_ALLOC_LIMITED) { 3853 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3854 thresh = max_t(u64, 64 * 1024 * 1024, 3855 div_factor_fine(thresh, 1)); 3856 3857 if (num_bytes - num_allocated < thresh) 3858 return 1; 3859 } 3860 3861 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3862 return 0; 3863 return 1; 3864 } 3865 3866 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3867 { 3868 u64 num_dev; 3869 3870 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3871 BTRFS_BLOCK_GROUP_RAID0 | 3872 BTRFS_BLOCK_GROUP_RAID5 | 3873 BTRFS_BLOCK_GROUP_RAID6)) 3874 num_dev = root->fs_info->fs_devices->rw_devices; 3875 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3876 num_dev = 2; 3877 else 3878 num_dev = 1; /* DUP or single */ 3879 3880 /* metadata for updaing devices and chunk tree */ 3881 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3882 } 3883 3884 static void check_system_chunk(struct btrfs_trans_handle *trans, 3885 struct btrfs_root *root, u64 type) 3886 { 3887 struct btrfs_space_info *info; 3888 u64 left; 3889 u64 thresh; 3890 3891 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3892 spin_lock(&info->lock); 3893 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3894 info->bytes_reserved - info->bytes_readonly; 3895 spin_unlock(&info->lock); 3896 3897 thresh = get_system_chunk_thresh(root, type); 3898 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3899 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3900 left, thresh, type); 3901 dump_space_info(info, 0, 0); 3902 } 3903 3904 if (left < thresh) { 3905 u64 flags; 3906 3907 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3908 btrfs_alloc_chunk(trans, root, flags); 3909 } 3910 } 3911 3912 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3913 struct btrfs_root *extent_root, u64 flags, int force) 3914 { 3915 struct btrfs_space_info *space_info; 3916 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3917 int wait_for_alloc = 0; 3918 int ret = 0; 3919 3920 /* Don't re-enter if we're already allocating a chunk */ 3921 if (trans->allocating_chunk) 3922 return -ENOSPC; 3923 3924 space_info = __find_space_info(extent_root->fs_info, flags); 3925 if (!space_info) { 3926 ret = update_space_info(extent_root->fs_info, flags, 3927 0, 0, &space_info); 3928 BUG_ON(ret); /* -ENOMEM */ 3929 } 3930 BUG_ON(!space_info); /* Logic error */ 3931 3932 again: 3933 spin_lock(&space_info->lock); 3934 if (force < space_info->force_alloc) 3935 force = space_info->force_alloc; 3936 if (space_info->full) { 3937 if (should_alloc_chunk(extent_root, space_info, force)) 3938 ret = -ENOSPC; 3939 else 3940 ret = 0; 3941 spin_unlock(&space_info->lock); 3942 return ret; 3943 } 3944 3945 if (!should_alloc_chunk(extent_root, space_info, force)) { 3946 spin_unlock(&space_info->lock); 3947 return 0; 3948 } else if (space_info->chunk_alloc) { 3949 wait_for_alloc = 1; 3950 } else { 3951 space_info->chunk_alloc = 1; 3952 } 3953 3954 spin_unlock(&space_info->lock); 3955 3956 mutex_lock(&fs_info->chunk_mutex); 3957 3958 /* 3959 * The chunk_mutex is held throughout the entirety of a chunk 3960 * allocation, so once we've acquired the chunk_mutex we know that the 3961 * other guy is done and we need to recheck and see if we should 3962 * allocate. 
3963 */ 3964 if (wait_for_alloc) { 3965 mutex_unlock(&fs_info->chunk_mutex); 3966 wait_for_alloc = 0; 3967 goto again; 3968 } 3969 3970 trans->allocating_chunk = true; 3971 3972 /* 3973 * If we have mixed data/metadata chunks we want to make sure we keep 3974 * allocating mixed chunks instead of individual chunks. 3975 */ 3976 if (btrfs_mixed_space_info(space_info)) 3977 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3978 3979 /* 3980 * if we're doing a data chunk, go ahead and make sure that 3981 * we keep a reasonable number of metadata chunks allocated in the 3982 * FS as well. 3983 */ 3984 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3985 fs_info->data_chunk_allocations++; 3986 if (!(fs_info->data_chunk_allocations % 3987 fs_info->metadata_ratio)) 3988 force_metadata_allocation(fs_info); 3989 } 3990 3991 /* 3992 * Check if we have enough space in SYSTEM chunk because we may need 3993 * to update devices. 3994 */ 3995 check_system_chunk(trans, extent_root, flags); 3996 3997 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3998 trans->allocating_chunk = false; 3999 4000 spin_lock(&space_info->lock); 4001 if (ret < 0 && ret != -ENOSPC) 4002 goto out; 4003 if (ret) 4004 space_info->full = 1; 4005 else 4006 ret = 1; 4007 4008 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4009 out: 4010 space_info->chunk_alloc = 0; 4011 spin_unlock(&space_info->lock); 4012 mutex_unlock(&fs_info->chunk_mutex); 4013 return ret; 4014 } 4015 4016 static int can_overcommit(struct btrfs_root *root, 4017 struct btrfs_space_info *space_info, u64 bytes, 4018 enum btrfs_reserve_flush_enum flush) 4019 { 4020 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4021 u64 profile = btrfs_get_alloc_profile(root, 0); 4022 u64 space_size; 4023 u64 avail; 4024 u64 used; 4025 4026 used = space_info->bytes_used + space_info->bytes_reserved + 4027 space_info->bytes_pinned + space_info->bytes_readonly; 4028 4029 /* 4030 * We only want to allow over committing if we have lots of actual space 4031 * free, but if we don't have enough space to handle the global reserve 4032 * space then we could end up having a real enospc problem when trying 4033 * to allocate a chunk or some other such important allocation. 4034 */ 4035 spin_lock(&global_rsv->lock); 4036 space_size = calc_global_rsv_need_space(global_rsv); 4037 spin_unlock(&global_rsv->lock); 4038 if (used + space_size >= space_info->total_bytes) 4039 return 0; 4040 4041 used += space_info->bytes_may_use; 4042 4043 spin_lock(&root->fs_info->free_chunk_lock); 4044 avail = root->fs_info->free_chunk_space; 4045 spin_unlock(&root->fs_info->free_chunk_lock); 4046 4047 /* 4048 * If we have dup, raid1 or raid10 then only half of the free 4049 * space is actually useable. For raid56, the space info used 4050 * doesn't include the parity drive, so we don't have to 4051 * change the math 4052 */ 4053 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4054 BTRFS_BLOCK_GROUP_RAID1 | 4055 BTRFS_BLOCK_GROUP_RAID10)) 4056 avail >>= 1; 4057 4058 /* 4059 * If we aren't flushing all things, let us overcommit up to 4060 * 1/2th of the space. If we can flush, don't let us overcommit 4061 * too much, let it overcommit up to 1/8 of the space. 
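 * For example (illustrative numbers): with 8GB of avail left after the
 * raid adjustment above, a BTRFS_RESERVE_FLUSH_ALL caller may overcommit
 * by up to 1GB (avail >> 3), while any other caller may overcommit by up
 * to 4GB (avail >> 1).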
4062 */ 4063 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4064 avail >>= 3; 4065 else 4066 avail >>= 1; 4067 4068 if (used + bytes < space_info->total_bytes + avail) 4069 return 1; 4070 return 0; 4071 } 4072 4073 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4074 unsigned long nr_pages, int nr_items) 4075 { 4076 struct super_block *sb = root->fs_info->sb; 4077 4078 if (down_read_trylock(&sb->s_umount)) { 4079 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4080 up_read(&sb->s_umount); 4081 } else { 4082 /* 4083 * We needn't worry about the filesystem going from r/w to r/o even 4084 * though we don't acquire the ->s_umount mutex, because the filesystem 4085 * should guarantee that the delalloc inode list is empty after 4086 * the filesystem becomes read-only (all dirty pages are written to 4087 * the disk). 4088 */ 4089 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4090 if (!current->journal_info) 4091 btrfs_wait_ordered_roots(root->fs_info, nr_items); 4092 } 4093 } 4094 4095 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4096 { 4097 u64 bytes; 4098 int nr; 4099 4100 bytes = btrfs_calc_trans_metadata_size(root, 1); 4101 nr = (int)div64_u64(to_reclaim, bytes); 4102 if (!nr) 4103 nr = 1; 4104 return nr; 4105 } 4106 4107 #define EXTENT_SIZE_PER_ITEM (256 * 1024) 4108 4109 /* 4110 * shrink metadata reservation for delalloc 4111 */ 4112 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4113 bool wait_ordered) 4114 { 4115 struct btrfs_block_rsv *block_rsv; 4116 struct btrfs_space_info *space_info; 4117 struct btrfs_trans_handle *trans; 4118 u64 delalloc_bytes; 4119 u64 max_reclaim; 4120 long time_left; 4121 unsigned long nr_pages; 4122 int loops; 4123 int items; 4124 enum btrfs_reserve_flush_enum flush; 4125 4126 /* Calculate the number of items we need to flush for this space reservation */ 4127 items = calc_reclaim_items_nr(root, to_reclaim); 4128 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4129 4130 trans = (struct btrfs_trans_handle *)current->journal_info; 4131 block_rsv = &root->fs_info->delalloc_block_rsv; 4132 space_info = block_rsv->space_info; 4133 4134 delalloc_bytes = percpu_counter_sum_positive( 4135 &root->fs_info->delalloc_bytes); 4136 if (delalloc_bytes == 0) { 4137 if (trans) 4138 return; 4139 if (wait_ordered) 4140 btrfs_wait_ordered_roots(root->fs_info, items); 4141 return; 4142 } 4143 4144 loops = 0; 4145 while (delalloc_bytes && loops < 3) { 4146 max_reclaim = min(delalloc_bytes, to_reclaim); 4147 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4148 btrfs_writeback_inodes_sb_nr(root, nr_pages, items); 4149 /* 4150 * We need to wait for the async pages to actually start before 4151 * we do anything.
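 * Below, max_reclaim is reused as the wait target: if more async
 * delalloc pages are in flight than the nr_pages we just queued, we
 * wait until the in-flight count drops by nr_pages; otherwise we wait
 * for it to reach zero (or skip waiting entirely if nothing is in
 * flight).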
4152 */ 4153 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); 4154 if (!max_reclaim) 4155 goto skip_async; 4156 4157 if (max_reclaim <= nr_pages) 4158 max_reclaim = 0; 4159 else 4160 max_reclaim -= nr_pages; 4161 4162 wait_event(root->fs_info->async_submit_wait, 4163 atomic_read(&root->fs_info->async_delalloc_pages) <= 4164 (int)max_reclaim); 4165 skip_async: 4166 if (!trans) 4167 flush = BTRFS_RESERVE_FLUSH_ALL; 4168 else 4169 flush = BTRFS_RESERVE_NO_FLUSH; 4170 spin_lock(&space_info->lock); 4171 if (can_overcommit(root, space_info, orig, flush)) { 4172 spin_unlock(&space_info->lock); 4173 break; 4174 } 4175 spin_unlock(&space_info->lock); 4176 4177 loops++; 4178 if (wait_ordered && !trans) { 4179 btrfs_wait_ordered_roots(root->fs_info, items); 4180 } else { 4181 time_left = schedule_timeout_killable(1); 4182 if (time_left) 4183 break; 4184 } 4185 delalloc_bytes = percpu_counter_sum_positive( 4186 &root->fs_info->delalloc_bytes); 4187 } 4188 } 4189 4190 /** 4191 * may_commit_transaction - possibly commit the transaction if it's OK to 4192 * @root - the root we're allocating for 4193 * @bytes - the number of bytes we want to reserve 4194 * @force - force the commit 4195 * 4196 * This will check to make sure that committing the transaction will actually 4197 * get us somewhere and then commit the transaction if it does. Otherwise it 4198 * will return -ENOSPC. 4199 */ 4200 static int may_commit_transaction(struct btrfs_root *root, 4201 struct btrfs_space_info *space_info, 4202 u64 bytes, int force) 4203 { 4204 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 4205 struct btrfs_trans_handle *trans; 4206 4207 trans = (struct btrfs_trans_handle *)current->journal_info; 4208 if (trans) 4209 return -EAGAIN; 4210 4211 if (force) 4212 goto commit; 4213 4214 /* See if there is enough pinned space to make this reservation */ 4215 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4216 bytes) >= 0) 4217 goto commit; 4218 4219 /* 4220 * See if there is some space in the delayed insertion reservation for 4221 * this reservation.
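 * The delayed insertion reservation (delayed_rsv) backs delayed inode
 * and item updates; running those items at commit time returns that
 * reservation to this space_info, so its size is factored into the
 * check below.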
4222 */ 4223 if (space_info != delayed_rsv->space_info) 4224 return -ENOSPC; 4225 4226 spin_lock(&delayed_rsv->lock); 4227 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4228 bytes - delayed_rsv->size) >= 0) { 4229 spin_unlock(&delayed_rsv->lock); 4230 return -ENOSPC; 4231 } 4232 spin_unlock(&delayed_rsv->lock); 4233 4234 commit: 4235 trans = btrfs_join_transaction(root); 4236 if (IS_ERR(trans)) 4237 return -ENOSPC; 4238 4239 return btrfs_commit_transaction(trans, root); 4240 } 4241 4242 enum flush_state { 4243 FLUSH_DELAYED_ITEMS_NR = 1, 4244 FLUSH_DELAYED_ITEMS = 2, 4245 FLUSH_DELALLOC = 3, 4246 FLUSH_DELALLOC_WAIT = 4, 4247 ALLOC_CHUNK = 5, 4248 COMMIT_TRANS = 6, 4249 }; 4250 4251 static int flush_space(struct btrfs_root *root, 4252 struct btrfs_space_info *space_info, u64 num_bytes, 4253 u64 orig_bytes, int state) 4254 { 4255 struct btrfs_trans_handle *trans; 4256 int nr; 4257 int ret = 0; 4258 4259 switch (state) { 4260 case FLUSH_DELAYED_ITEMS_NR: 4261 case FLUSH_DELAYED_ITEMS: 4262 if (state == FLUSH_DELAYED_ITEMS_NR) 4263 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4264 else 4265 nr = -1; 4266 4267 trans = btrfs_join_transaction(root); 4268 if (IS_ERR(trans)) { 4269 ret = PTR_ERR(trans); 4270 break; 4271 } 4272 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4273 btrfs_end_transaction(trans, root); 4274 break; 4275 case FLUSH_DELALLOC: 4276 case FLUSH_DELALLOC_WAIT: 4277 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4278 state == FLUSH_DELALLOC_WAIT); 4279 break; 4280 case ALLOC_CHUNK: 4281 trans = btrfs_join_transaction(root); 4282 if (IS_ERR(trans)) { 4283 ret = PTR_ERR(trans); 4284 break; 4285 } 4286 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4287 btrfs_get_alloc_profile(root, 0), 4288 CHUNK_ALLOC_NO_FORCE); 4289 btrfs_end_transaction(trans, root); 4290 if (ret == -ENOSPC) 4291 ret = 0; 4292 break; 4293 case COMMIT_TRANS: 4294 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4295 break; 4296 default: 4297 ret = -ENOSPC; 4298 break; 4299 } 4300 4301 return ret; 4302 } 4303 4304 static inline u64 4305 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4306 struct btrfs_space_info *space_info) 4307 { 4308 u64 used; 4309 u64 expected; 4310 u64 to_reclaim; 4311 4312 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, 4313 16 * 1024 * 1024); 4314 spin_lock(&space_info->lock); 4315 if (can_overcommit(root, space_info, to_reclaim, 4316 BTRFS_RESERVE_FLUSH_ALL)) { 4317 to_reclaim = 0; 4318 goto out; 4319 } 4320 4321 used = space_info->bytes_used + space_info->bytes_reserved + 4322 space_info->bytes_pinned + space_info->bytes_readonly + 4323 space_info->bytes_may_use; 4324 if (can_overcommit(root, space_info, 1024 * 1024, 4325 BTRFS_RESERVE_FLUSH_ALL)) 4326 expected = div_factor_fine(space_info->total_bytes, 95); 4327 else 4328 expected = div_factor_fine(space_info->total_bytes, 90); 4329 4330 if (used > expected) 4331 to_reclaim = used - expected; 4332 else 4333 to_reclaim = 0; 4334 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4335 space_info->bytes_reserved); 4336 out: 4337 spin_unlock(&space_info->lock); 4338 4339 return to_reclaim; 4340 } 4341 4342 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4343 struct btrfs_fs_info *fs_info, u64 used) 4344 { 4345 return (used >= div_factor_fine(space_info->total_bytes, 98) && 4346 !btrfs_fs_closing(fs_info) && 4347 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4348 } 4349 4350 static int btrfs_need_do_async_reclaim(struct 
btrfs_space_info *space_info, 4351 struct btrfs_fs_info *fs_info) 4352 { 4353 u64 used; 4354 4355 spin_lock(&space_info->lock); 4356 used = space_info->bytes_used + space_info->bytes_reserved + 4357 space_info->bytes_pinned + space_info->bytes_readonly + 4358 space_info->bytes_may_use; 4359 if (need_do_async_reclaim(space_info, fs_info, used)) { 4360 spin_unlock(&space_info->lock); 4361 return 1; 4362 } 4363 spin_unlock(&space_info->lock); 4364 4365 return 0; 4366 } 4367 4368 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4369 { 4370 struct btrfs_fs_info *fs_info; 4371 struct btrfs_space_info *space_info; 4372 u64 to_reclaim; 4373 int flush_state; 4374 4375 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4376 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4377 4378 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4379 space_info); 4380 if (!to_reclaim) 4381 return; 4382 4383 flush_state = FLUSH_DELAYED_ITEMS_NR; 4384 do { 4385 flush_space(fs_info->fs_root, space_info, to_reclaim, 4386 to_reclaim, flush_state); 4387 flush_state++; 4388 if (!btrfs_need_do_async_reclaim(space_info, fs_info)) 4389 return; 4390 } while (flush_state <= COMMIT_TRANS); 4391 4392 if (btrfs_need_do_async_reclaim(space_info, fs_info)) 4393 queue_work(system_unbound_wq, work); 4394 } 4395 4396 void btrfs_init_async_reclaim_work(struct work_struct *work) 4397 { 4398 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 4399 } 4400 4401 /** 4402 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4403 * @root - the root we're allocating for 4404 * @block_rsv - the block_rsv we're allocating for 4405 * @orig_bytes - the number of bytes we want 4406 * @flush - whether or not we can flush to make our reservation 4407 * 4408 * This will reserve orig_bytes number of bytes from the space info associated 4409 * with the block_rsv. If there is not enough space it will make an attempt to 4410 * flush out space to make room. It will do this by flushing delalloc if 4411 * possible or committing the transaction. If @flush is BTRFS_RESERVE_NO_FLUSH then no attempts to 4412 * regain reservations will be made and this will fail if there is not enough 4413 * space already. 4414 */ 4415 static int reserve_metadata_bytes(struct btrfs_root *root, 4416 struct btrfs_block_rsv *block_rsv, 4417 u64 orig_bytes, 4418 enum btrfs_reserve_flush_enum flush) 4419 { 4420 struct btrfs_space_info *space_info = block_rsv->space_info; 4421 u64 used; 4422 u64 num_bytes = orig_bytes; 4423 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4424 int ret = 0; 4425 bool flushing = false; 4426 4427 again: 4428 ret = 0; 4429 spin_lock(&space_info->lock); 4430 /* 4431 * We only want to wait if somebody other than us is flushing and we 4432 * are actually allowed to flush all things. 4433 */ 4434 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4435 space_info->flush) { 4436 spin_unlock(&space_info->lock); 4437 /* 4438 * If we have a trans handle we can't wait because the flusher 4439 * may have to commit the transaction, which would mean we would 4440 * deadlock since we are waiting for the flusher to finish, but 4441 * hold the current transaction open.
4442 */ 4443 if (current->journal_info) 4444 return -EAGAIN; 4445 ret = wait_event_killable(space_info->wait, !space_info->flush); 4446 /* Must have been killed, return */ 4447 if (ret) 4448 return -EINTR; 4449 4450 spin_lock(&space_info->lock); 4451 } 4452 4453 ret = -ENOSPC; 4454 used = space_info->bytes_used + space_info->bytes_reserved + 4455 space_info->bytes_pinned + space_info->bytes_readonly + 4456 space_info->bytes_may_use; 4457 4458 /* 4459 * The idea here is that if we've not already over-reserved the block group 4460 * then we can go ahead and save our reservation first and then start 4461 * flushing if we need to. Otherwise, if we've already overcommitted, 4462 * let's start flushing stuff first and then come back and try to make 4463 * our reservation. 4464 */ 4465 if (used <= space_info->total_bytes) { 4466 if (used + orig_bytes <= space_info->total_bytes) { 4467 space_info->bytes_may_use += orig_bytes; 4468 trace_btrfs_space_reservation(root->fs_info, 4469 "space_info", space_info->flags, orig_bytes, 1); 4470 ret = 0; 4471 } else { 4472 /* 4473 * OK, set num_bytes to orig_bytes since we aren't 4474 * overcommitted; this way we only try to reclaim what 4475 * we need. 4476 */ 4477 num_bytes = orig_bytes; 4478 } 4479 } else { 4480 /* 4481 * OK, we're overcommitted; set num_bytes to the overcommitted 4482 * amount plus the number of bytes that we need for this 4483 * reservation. 4484 */ 4485 num_bytes = used - space_info->total_bytes + 4486 (orig_bytes * 2); 4487 } 4488 4489 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4490 space_info->bytes_may_use += orig_bytes; 4491 trace_btrfs_space_reservation(root->fs_info, "space_info", 4492 space_info->flags, orig_bytes, 4493 1); 4494 ret = 0; 4495 } 4496 4497 /* 4498 * Couldn't make our reservation, save our place so while we're trying 4499 * to reclaim space we can actually use it instead of somebody else 4500 * stealing it from us. 4501 * 4502 * We make the other tasks wait for the flush only when we can flush 4503 * all things. 4504 */ 4505 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4506 flushing = true; 4507 space_info->flush = 1; 4508 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 4509 used += orig_bytes; 4510 if (need_do_async_reclaim(space_info, root->fs_info, used) && 4511 !work_busy(&root->fs_info->async_reclaim_work)) 4512 queue_work(system_unbound_wq, 4513 &root->fs_info->async_reclaim_work); 4514 } 4515 spin_unlock(&space_info->lock); 4516 4517 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4518 goto out; 4519 4520 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4521 flush_state); 4522 flush_state++; 4523 4524 /* 4525 * If we are FLUSH_LIMIT, we cannot flush delalloc or we could 4526 * deadlock. So skip the delalloc flush states.
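 * With BTRFS_RESERVE_FLUSH_LIMIT the states that actually run are
 * therefore FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS and ALLOC_CHUNK;
 * the retry checks further down also stop before COMMIT_TRANS in that
 * case.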
4527 */ 4528 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4529 (flush_state == FLUSH_DELALLOC || 4530 flush_state == FLUSH_DELALLOC_WAIT)) 4531 flush_state = ALLOC_CHUNK; 4532 4533 if (!ret) 4534 goto again; 4535 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4536 flush_state < COMMIT_TRANS) 4537 goto again; 4538 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4539 flush_state <= COMMIT_TRANS) 4540 goto again; 4541 4542 out: 4543 if (ret == -ENOSPC && 4544 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4545 struct btrfs_block_rsv *global_rsv = 4546 &root->fs_info->global_block_rsv; 4547 4548 if (block_rsv != global_rsv && 4549 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4550 ret = 0; 4551 } 4552 if (ret == -ENOSPC) 4553 trace_btrfs_space_reservation(root->fs_info, 4554 "space_info:enospc", 4555 space_info->flags, orig_bytes, 1); 4556 if (flushing) { 4557 spin_lock(&space_info->lock); 4558 space_info->flush = 0; 4559 wake_up_all(&space_info->wait); 4560 spin_unlock(&space_info->lock); 4561 } 4562 return ret; 4563 } 4564 4565 static struct btrfs_block_rsv *get_block_rsv( 4566 const struct btrfs_trans_handle *trans, 4567 const struct btrfs_root *root) 4568 { 4569 struct btrfs_block_rsv *block_rsv = NULL; 4570 4571 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4572 block_rsv = trans->block_rsv; 4573 4574 if (root == root->fs_info->csum_root && trans->adding_csums) 4575 block_rsv = trans->block_rsv; 4576 4577 if (root == root->fs_info->uuid_root) 4578 block_rsv = trans->block_rsv; 4579 4580 if (!block_rsv) 4581 block_rsv = root->block_rsv; 4582 4583 if (!block_rsv) 4584 block_rsv = &root->fs_info->empty_block_rsv; 4585 4586 return block_rsv; 4587 } 4588 4589 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4590 u64 num_bytes) 4591 { 4592 int ret = -ENOSPC; 4593 spin_lock(&block_rsv->lock); 4594 if (block_rsv->reserved >= num_bytes) { 4595 block_rsv->reserved -= num_bytes; 4596 if (block_rsv->reserved < block_rsv->size) 4597 block_rsv->full = 0; 4598 ret = 0; 4599 } 4600 spin_unlock(&block_rsv->lock); 4601 return ret; 4602 } 4603 4604 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4605 u64 num_bytes, int update_size) 4606 { 4607 spin_lock(&block_rsv->lock); 4608 block_rsv->reserved += num_bytes; 4609 if (update_size) 4610 block_rsv->size += num_bytes; 4611 else if (block_rsv->reserved >= block_rsv->size) 4612 block_rsv->full = 1; 4613 spin_unlock(&block_rsv->lock); 4614 } 4615 4616 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 4617 struct btrfs_block_rsv *dest, u64 num_bytes, 4618 int min_factor) 4619 { 4620 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4621 u64 min_bytes; 4622 4623 if (global_rsv->space_info != dest->space_info) 4624 return -ENOSPC; 4625 4626 spin_lock(&global_rsv->lock); 4627 min_bytes = div_factor(global_rsv->size, min_factor); 4628 if (global_rsv->reserved < min_bytes + num_bytes) { 4629 spin_unlock(&global_rsv->lock); 4630 return -ENOSPC; 4631 } 4632 global_rsv->reserved -= num_bytes; 4633 if (global_rsv->reserved < global_rsv->size) 4634 global_rsv->full = 0; 4635 spin_unlock(&global_rsv->lock); 4636 4637 block_rsv_add_bytes(dest, num_bytes, 1); 4638 return 0; 4639 } 4640 4641 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4642 struct btrfs_block_rsv *block_rsv, 4643 struct btrfs_block_rsv *dest, u64 num_bytes) 4644 { 4645 struct btrfs_space_info *space_info = block_rsv->space_info; 4646 4647 spin_lock(&block_rsv->lock); 4648 if (num_bytes == (u64)-1) 4649 num_bytes 
= block_rsv->size; 4650 block_rsv->size -= num_bytes; 4651 if (block_rsv->reserved >= block_rsv->size) { 4652 num_bytes = block_rsv->reserved - block_rsv->size; 4653 block_rsv->reserved = block_rsv->size; 4654 block_rsv->full = 1; 4655 } else { 4656 num_bytes = 0; 4657 } 4658 spin_unlock(&block_rsv->lock); 4659 4660 if (num_bytes > 0) { 4661 if (dest) { 4662 spin_lock(&dest->lock); 4663 if (!dest->full) { 4664 u64 bytes_to_add; 4665 4666 bytes_to_add = dest->size - dest->reserved; 4667 bytes_to_add = min(num_bytes, bytes_to_add); 4668 dest->reserved += bytes_to_add; 4669 if (dest->reserved >= dest->size) 4670 dest->full = 1; 4671 num_bytes -= bytes_to_add; 4672 } 4673 spin_unlock(&dest->lock); 4674 } 4675 if (num_bytes) { 4676 spin_lock(&space_info->lock); 4677 space_info->bytes_may_use -= num_bytes; 4678 trace_btrfs_space_reservation(fs_info, "space_info", 4679 space_info->flags, num_bytes, 0); 4680 spin_unlock(&space_info->lock); 4681 } 4682 } 4683 } 4684 4685 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4686 struct btrfs_block_rsv *dst, u64 num_bytes) 4687 { 4688 int ret; 4689 4690 ret = block_rsv_use_bytes(src, num_bytes); 4691 if (ret) 4692 return ret; 4693 4694 block_rsv_add_bytes(dst, num_bytes, 1); 4695 return 0; 4696 } 4697 4698 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4699 { 4700 memset(rsv, 0, sizeof(*rsv)); 4701 spin_lock_init(&rsv->lock); 4702 rsv->type = type; 4703 } 4704 4705 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4706 unsigned short type) 4707 { 4708 struct btrfs_block_rsv *block_rsv; 4709 struct btrfs_fs_info *fs_info = root->fs_info; 4710 4711 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4712 if (!block_rsv) 4713 return NULL; 4714 4715 btrfs_init_block_rsv(block_rsv, type); 4716 block_rsv->space_info = __find_space_info(fs_info, 4717 BTRFS_BLOCK_GROUP_METADATA); 4718 return block_rsv; 4719 } 4720 4721 void btrfs_free_block_rsv(struct btrfs_root *root, 4722 struct btrfs_block_rsv *rsv) 4723 { 4724 if (!rsv) 4725 return; 4726 btrfs_block_rsv_release(root, rsv, (u64)-1); 4727 kfree(rsv); 4728 } 4729 4730 int btrfs_block_rsv_add(struct btrfs_root *root, 4731 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4732 enum btrfs_reserve_flush_enum flush) 4733 { 4734 int ret; 4735 4736 if (num_bytes == 0) 4737 return 0; 4738 4739 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4740 if (!ret) { 4741 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4742 return 0; 4743 } 4744 4745 return ret; 4746 } 4747 4748 int btrfs_block_rsv_check(struct btrfs_root *root, 4749 struct btrfs_block_rsv *block_rsv, int min_factor) 4750 { 4751 u64 num_bytes = 0; 4752 int ret = -ENOSPC; 4753 4754 if (!block_rsv) 4755 return 0; 4756 4757 spin_lock(&block_rsv->lock); 4758 num_bytes = div_factor(block_rsv->size, min_factor); 4759 if (block_rsv->reserved >= num_bytes) 4760 ret = 0; 4761 spin_unlock(&block_rsv->lock); 4762 4763 return ret; 4764 } 4765 4766 int btrfs_block_rsv_refill(struct btrfs_root *root, 4767 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4768 enum btrfs_reserve_flush_enum flush) 4769 { 4770 u64 num_bytes = 0; 4771 int ret = -ENOSPC; 4772 4773 if (!block_rsv) 4774 return 0; 4775 4776 spin_lock(&block_rsv->lock); 4777 num_bytes = min_reserved; 4778 if (block_rsv->reserved >= num_bytes) 4779 ret = 0; 4780 else 4781 num_bytes -= block_rsv->reserved; 4782 spin_unlock(&block_rsv->lock); 4783 4784 if (!ret) 4785 return 0; 4786 4787 ret = reserve_metadata_bytes(root, block_rsv, 
num_bytes, flush); 4788 if (!ret) { 4789 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4790 return 0; 4791 } 4792 4793 return ret; 4794 } 4795 4796 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4797 struct btrfs_block_rsv *dst_rsv, 4798 u64 num_bytes) 4799 { 4800 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4801 } 4802 4803 void btrfs_block_rsv_release(struct btrfs_root *root, 4804 struct btrfs_block_rsv *block_rsv, 4805 u64 num_bytes) 4806 { 4807 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4808 if (global_rsv == block_rsv || 4809 block_rsv->space_info != global_rsv->space_info) 4810 global_rsv = NULL; 4811 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4812 num_bytes); 4813 } 4814 4815 /* 4816 * helper to calculate size of global block reservation. 4817 * the desired value is sum of space used by extent tree, 4818 * checksum tree and root tree 4819 */ 4820 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4821 { 4822 struct btrfs_space_info *sinfo; 4823 u64 num_bytes; 4824 u64 meta_used; 4825 u64 data_used; 4826 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4827 4828 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4829 spin_lock(&sinfo->lock); 4830 data_used = sinfo->bytes_used; 4831 spin_unlock(&sinfo->lock); 4832 4833 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4834 spin_lock(&sinfo->lock); 4835 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4836 data_used = 0; 4837 meta_used = sinfo->bytes_used; 4838 spin_unlock(&sinfo->lock); 4839 4840 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4841 csum_size * 2; 4842 num_bytes += div64_u64(data_used + meta_used, 50); 4843 4844 if (num_bytes * 3 > meta_used) 4845 num_bytes = div64_u64(meta_used, 3); 4846 4847 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4848 } 4849 4850 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4851 { 4852 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4853 struct btrfs_space_info *sinfo = block_rsv->space_info; 4854 u64 num_bytes; 4855 4856 num_bytes = calc_global_metadata_size(fs_info); 4857 4858 spin_lock(&sinfo->lock); 4859 spin_lock(&block_rsv->lock); 4860 4861 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4862 4863 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4864 sinfo->bytes_reserved + sinfo->bytes_readonly + 4865 sinfo->bytes_may_use; 4866 4867 if (sinfo->total_bytes > num_bytes) { 4868 num_bytes = sinfo->total_bytes - num_bytes; 4869 block_rsv->reserved += num_bytes; 4870 sinfo->bytes_may_use += num_bytes; 4871 trace_btrfs_space_reservation(fs_info, "space_info", 4872 sinfo->flags, num_bytes, 1); 4873 } 4874 4875 if (block_rsv->reserved >= block_rsv->size) { 4876 num_bytes = block_rsv->reserved - block_rsv->size; 4877 sinfo->bytes_may_use -= num_bytes; 4878 trace_btrfs_space_reservation(fs_info, "space_info", 4879 sinfo->flags, num_bytes, 0); 4880 block_rsv->reserved = block_rsv->size; 4881 block_rsv->full = 1; 4882 } 4883 4884 spin_unlock(&block_rsv->lock); 4885 spin_unlock(&sinfo->lock); 4886 } 4887 4888 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4889 { 4890 struct btrfs_space_info *space_info; 4891 4892 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4893 fs_info->chunk_block_rsv.space_info = space_info; 4894 4895 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4896 fs_info->global_block_rsv.space_info = space_info; 4897 
fs_info->delalloc_block_rsv.space_info = space_info; 4898 fs_info->trans_block_rsv.space_info = space_info; 4899 fs_info->empty_block_rsv.space_info = space_info; 4900 fs_info->delayed_block_rsv.space_info = space_info; 4901 4902 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4903 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4904 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4905 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4906 if (fs_info->quota_root) 4907 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 4908 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4909 4910 update_global_block_rsv(fs_info); 4911 } 4912 4913 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4914 { 4915 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4916 (u64)-1); 4917 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4918 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4919 WARN_ON(fs_info->trans_block_rsv.size > 0); 4920 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4921 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4922 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4923 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4924 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4925 } 4926 4927 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4928 struct btrfs_root *root) 4929 { 4930 if (!trans->block_rsv) 4931 return; 4932 4933 if (!trans->bytes_reserved) 4934 return; 4935 4936 trace_btrfs_space_reservation(root->fs_info, "transaction", 4937 trans->transid, trans->bytes_reserved, 0); 4938 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4939 trans->bytes_reserved = 0; 4940 } 4941 4942 /* Can only return 0 or -ENOSPC */ 4943 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4944 struct inode *inode) 4945 { 4946 struct btrfs_root *root = BTRFS_I(inode)->root; 4947 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4948 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4949 4950 /* 4951 * We need to hold space in order to delete our orphan item once we've 4952 * added it, so this takes the reservation so we can release it later 4953 * when we are truly done with the orphan item. 4954 */ 4955 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4956 trace_btrfs_space_reservation(root->fs_info, "orphan", 4957 btrfs_ino(inode), num_bytes, 1); 4958 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4959 } 4960 4961 void btrfs_orphan_release_metadata(struct inode *inode) 4962 { 4963 struct btrfs_root *root = BTRFS_I(inode)->root; 4964 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4965 trace_btrfs_space_reservation(root->fs_info, "orphan", 4966 btrfs_ino(inode), num_bytes, 0); 4967 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4968 } 4969 4970 /* 4971 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4972 * root: the root of the parent directory 4973 * rsv: block reservation 4974 * items: the number of items that we need do reservation 4975 * qgroup_reserved: used to return the reserved size in qgroup 4976 * 4977 * This function is used to reserve the space for snapshot/subvolume 4978 * creation and deletion. Those operations are different with the 4979 * common file/directory operations, they change two fs/file trees 4980 * and root tree, the number of items that the qgroup reserves is 4981 * different with the free space reservation. 
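 * For example, when quotas are enabled the qgroup side below reserves
 * 3 * leafsize (one tree block for the parent inode and two for the new
 * directory entries), while the block_rsv side reserves
 * btrfs_calc_trans_metadata_size(root, items) bytes and may fall back
 * to the global reserve if use_global_rsv is set.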
So we cannot use 4982 * the space reservation mechanism in start_transaction(). 4983 */ 4984 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4985 struct btrfs_block_rsv *rsv, 4986 int items, 4987 u64 *qgroup_reserved, 4988 bool use_global_rsv) 4989 { 4990 u64 num_bytes; 4991 int ret; 4992 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4993 4994 if (root->fs_info->quota_enabled) { 4995 /* One for parent inode, two for dir entries */ 4996 num_bytes = 3 * root->leafsize; 4997 ret = btrfs_qgroup_reserve(root, num_bytes); 4998 if (ret) 4999 return ret; 5000 } else { 5001 num_bytes = 0; 5002 } 5003 5004 *qgroup_reserved = num_bytes; 5005 5006 num_bytes = btrfs_calc_trans_metadata_size(root, items); 5007 rsv->space_info = __find_space_info(root->fs_info, 5008 BTRFS_BLOCK_GROUP_METADATA); 5009 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5010 BTRFS_RESERVE_FLUSH_ALL); 5011 5012 if (ret == -ENOSPC && use_global_rsv) 5013 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); 5014 5015 if (ret) { 5016 if (*qgroup_reserved) 5017 btrfs_qgroup_free(root, *qgroup_reserved); 5018 } 5019 5020 return ret; 5021 } 5022 5023 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 5024 struct btrfs_block_rsv *rsv, 5025 u64 qgroup_reserved) 5026 { 5027 btrfs_block_rsv_release(root, rsv, (u64)-1); 5028 if (qgroup_reserved) 5029 btrfs_qgroup_free(root, qgroup_reserved); 5030 } 5031 5032 /** 5033 * drop_outstanding_extent - drop an outstanding extent 5034 * @inode: the inode we're dropping the extent for 5035 * 5036 * This is called when we are freeing up an outstanding extent, either called 5037 * after an error or after an extent is written. This will return the number of 5038 * reserved extents that need to be freed. This must be called with 5039 * BTRFS_I(inode)->lock held. 5040 */ 5041 static unsigned drop_outstanding_extent(struct inode *inode) 5042 { 5043 unsigned drop_inode_space = 0; 5044 unsigned dropped_extents = 0; 5045 5046 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 5047 BTRFS_I(inode)->outstanding_extents--; 5048 5049 if (BTRFS_I(inode)->outstanding_extents == 0 && 5050 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5051 &BTRFS_I(inode)->runtime_flags)) 5052 drop_inode_space = 1; 5053 5054 /* 5055 * If we have the same or a greater number of outstanding extents than we have 5056 * reserved then we need to leave the reserved extents count alone. 5057 */ 5058 if (BTRFS_I(inode)->outstanding_extents >= 5059 BTRFS_I(inode)->reserved_extents) 5060 return drop_inode_space; 5061 5062 dropped_extents = BTRFS_I(inode)->reserved_extents - 5063 BTRFS_I(inode)->outstanding_extents; 5064 BTRFS_I(inode)->reserved_extents -= dropped_extents; 5065 return dropped_extents + drop_inode_space; 5066 } 5067 5068 /** 5069 * calc_csum_metadata_size - return the amount of metadata space that must be 5070 * reserved/free'd for the given bytes. 5071 * @inode: the inode we're manipulating 5072 * @num_bytes: the number of bytes in question 5073 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5074 * 5075 * This adjusts the number of csum_bytes in the inode and then returns the 5076 * correct amount of metadata that must either be reserved or freed. We 5077 * calculate how many checksums we can fit into one leaf and then divide the 5078 * number of bytes that will need to be checksummed by this value to figure out 5079 * how many checksums will be required.
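 * A sketch with assumed numbers: if one leaf holds checksums for N
 * sectors, then growing csum_bytes from 2N to 2N + 1 sectors' worth of
 * data crosses into a third leaf and we return
 * btrfs_calc_trans_metadata_size(root, 1) more to reserve; shrinking
 * back across that boundary returns the same amount to be freed.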
If we are adding bytes then the number 5080 * may go up and we will return the number of additional bytes that must be 5081 * reserved. If it is going down we will return the number of bytes that must 5082 * be freed. 5083 * 5084 * This must be called with BTRFS_I(inode)->lock held. 5085 */ 5086 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5087 int reserve) 5088 { 5089 struct btrfs_root *root = BTRFS_I(inode)->root; 5090 u64 csum_size; 5091 int num_csums_per_leaf; 5092 int num_csums; 5093 int old_csums; 5094 5095 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5096 BTRFS_I(inode)->csum_bytes == 0) 5097 return 0; 5098 5099 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5100 if (reserve) 5101 BTRFS_I(inode)->csum_bytes += num_bytes; 5102 else 5103 BTRFS_I(inode)->csum_bytes -= num_bytes; 5104 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 5105 num_csums_per_leaf = (int)div64_u64(csum_size, 5106 sizeof(struct btrfs_csum_item) + 5107 sizeof(struct btrfs_disk_key)); 5108 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5109 num_csums = num_csums + num_csums_per_leaf - 1; 5110 num_csums = num_csums / num_csums_per_leaf; 5111 5112 old_csums = old_csums + num_csums_per_leaf - 1; 5113 old_csums = old_csums / num_csums_per_leaf; 5114 5115 /* No change, no need to reserve more */ 5116 if (old_csums == num_csums) 5117 return 0; 5118 5119 if (reserve) 5120 return btrfs_calc_trans_metadata_size(root, 5121 num_csums - old_csums); 5122 5123 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 5124 } 5125 5126 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5127 { 5128 struct btrfs_root *root = BTRFS_I(inode)->root; 5129 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 5130 u64 to_reserve = 0; 5131 u64 csum_bytes; 5132 unsigned nr_extents = 0; 5133 int extra_reserve = 0; 5134 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5135 int ret = 0; 5136 bool delalloc_lock = true; 5137 u64 to_free = 0; 5138 unsigned dropped; 5139 5140 /* If we are a free space inode we need to not flush since we will be in 5141 * the middle of a transaction commit. We also don't need the delalloc 5142 * mutex since we won't race with anybody. We need this mostly to make 5143 * lockdep shut its filthy mouth. 5144 */ 5145 if (btrfs_is_free_space_inode(inode)) { 5146 flush = BTRFS_RESERVE_NO_FLUSH; 5147 delalloc_lock = false; 5148 } 5149 5150 if (flush != BTRFS_RESERVE_NO_FLUSH && 5151 btrfs_transaction_in_commit(root->fs_info)) 5152 schedule_timeout(1); 5153 5154 if (delalloc_lock) 5155 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5156 5157 num_bytes = ALIGN(num_bytes, root->sectorsize); 5158 5159 spin_lock(&BTRFS_I(inode)->lock); 5160 BTRFS_I(inode)->outstanding_extents++; 5161 5162 if (BTRFS_I(inode)->outstanding_extents > 5163 BTRFS_I(inode)->reserved_extents) 5164 nr_extents = BTRFS_I(inode)->outstanding_extents - 5165 BTRFS_I(inode)->reserved_extents; 5166 5167 /* 5168 * Add an item to reserve for updating the inode when we complete the 5169 * delalloc io. 
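 * That inode-update item only needs to be reserved once per inode with
 * outstanding delalloc: the BTRFS_INODE_DELALLOC_META_RESERVED bit
 * tested below records that it is already accounted for, so nr_extents
 * is only bumped for it the first time through (and again after
 * drop_outstanding_extent() clears the bit).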
5170 */ 5171 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5172 &BTRFS_I(inode)->runtime_flags)) { 5173 nr_extents++; 5174 extra_reserve = 1; 5175 } 5176 5177 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 5178 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5179 csum_bytes = BTRFS_I(inode)->csum_bytes; 5180 spin_unlock(&BTRFS_I(inode)->lock); 5181 5182 if (root->fs_info->quota_enabled) { 5183 ret = btrfs_qgroup_reserve(root, num_bytes + 5184 nr_extents * root->leafsize); 5185 if (ret) 5186 goto out_fail; 5187 } 5188 5189 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5190 if (unlikely(ret)) { 5191 if (root->fs_info->quota_enabled) 5192 btrfs_qgroup_free(root, num_bytes + 5193 nr_extents * root->leafsize); 5194 goto out_fail; 5195 } 5196 5197 spin_lock(&BTRFS_I(inode)->lock); 5198 if (extra_reserve) { 5199 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5200 &BTRFS_I(inode)->runtime_flags); 5201 nr_extents--; 5202 } 5203 BTRFS_I(inode)->reserved_extents += nr_extents; 5204 spin_unlock(&BTRFS_I(inode)->lock); 5205 5206 if (delalloc_lock) 5207 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5208 5209 if (to_reserve) 5210 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5211 btrfs_ino(inode), to_reserve, 1); 5212 block_rsv_add_bytes(block_rsv, to_reserve, 1); 5213 5214 return 0; 5215 5216 out_fail: 5217 spin_lock(&BTRFS_I(inode)->lock); 5218 dropped = drop_outstanding_extent(inode); 5219 /* 5220 * If the inodes csum_bytes is the same as the original 5221 * csum_bytes then we know we haven't raced with any free()ers 5222 * so we can just reduce our inodes csum bytes and carry on. 5223 */ 5224 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 5225 calc_csum_metadata_size(inode, num_bytes, 0); 5226 } else { 5227 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 5228 u64 bytes; 5229 5230 /* 5231 * This is tricky, but first we need to figure out how much we 5232 * free'd from any free-ers that occured during this 5233 * reservation, so we reset ->csum_bytes to the csum_bytes 5234 * before we dropped our lock, and then call the free for the 5235 * number of bytes that were freed while we were trying our 5236 * reservation. 5237 */ 5238 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 5239 BTRFS_I(inode)->csum_bytes = csum_bytes; 5240 to_free = calc_csum_metadata_size(inode, bytes, 0); 5241 5242 5243 /* 5244 * Now we need to see how much we would have freed had we not 5245 * been making this reservation and our ->csum_bytes were not 5246 * artificially inflated. 5247 */ 5248 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 5249 bytes = csum_bytes - orig_csum_bytes; 5250 bytes = calc_csum_metadata_size(inode, bytes, 0); 5251 5252 /* 5253 * Now reset ->csum_bytes to what it should be. If bytes is 5254 * more than to_free then we would have free'd more space had we 5255 * not had an artificially high ->csum_bytes, so we need to free 5256 * the remainder. If bytes is the same or less then we don't 5257 * need to do anything, the other free-ers did the correct 5258 * thing. 
5259 */ 5260 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 5261 if (bytes > to_free) 5262 to_free = bytes - to_free; 5263 else 5264 to_free = 0; 5265 } 5266 spin_unlock(&BTRFS_I(inode)->lock); 5267 if (dropped) 5268 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5269 5270 if (to_free) { 5271 btrfs_block_rsv_release(root, block_rsv, to_free); 5272 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5273 btrfs_ino(inode), to_free, 0); 5274 } 5275 if (delalloc_lock) 5276 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5277 return ret; 5278 } 5279 5280 /** 5281 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5282 * @inode: the inode to release the reservation for 5283 * @num_bytes: the number of bytes we're releasing 5284 * 5285 * This will release the metadata reservation for an inode. This can be called 5286 * once we complete IO for a given set of bytes to release their metadata 5287 * reservations. 5288 */ 5289 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 5290 { 5291 struct btrfs_root *root = BTRFS_I(inode)->root; 5292 u64 to_free = 0; 5293 unsigned dropped; 5294 5295 num_bytes = ALIGN(num_bytes, root->sectorsize); 5296 spin_lock(&BTRFS_I(inode)->lock); 5297 dropped = drop_outstanding_extent(inode); 5298 5299 if (num_bytes) 5300 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5301 spin_unlock(&BTRFS_I(inode)->lock); 5302 if (dropped > 0) 5303 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5304 5305 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5306 btrfs_ino(inode), to_free, 0); 5307 if (root->fs_info->quota_enabled) { 5308 btrfs_qgroup_free(root, num_bytes + 5309 dropped * root->leafsize); 5310 } 5311 5312 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5313 to_free); 5314 } 5315 5316 /** 5317 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 5318 * @inode: inode we're writing to 5319 * @num_bytes: the number of bytes we want to allocate 5320 * 5321 * This will do the following things 5322 * 5323 * o reserve space in the data space info for num_bytes 5324 * o reserve space in the metadata space info based on number of outstanding 5325 * extents and how much csums will be needed 5326 * o add to the inodes ->delalloc_bytes 5327 * o add it to the fs_info's delalloc inodes list. 5328 * 5329 * This will return 0 for success and -ENOSPC if there is no space left. 5330 */ 5331 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 5332 { 5333 int ret; 5334 5335 ret = btrfs_check_data_free_space(inode, num_bytes); 5336 if (ret) 5337 return ret; 5338 5339 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 5340 if (ret) { 5341 btrfs_free_reserved_data_space(inode, num_bytes); 5342 return ret; 5343 } 5344 5345 return 0; 5346 } 5347 5348 /** 5349 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5350 * @inode: inode we're releasing space for 5351 * @num_bytes: the number of bytes we want to free up 5352 * 5353 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5354 * called in the case that we don't need the metadata AND data reservations 5355 * anymore. So if there is an error or we insert an inline extent. 5356 * 5357 * This function will release the metadata space that was not used and will 5358 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5359 * list if there are no delalloc bytes left. 
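 * A minimal caller sketch (an assumed pattern, not taken from this file):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, len);
 *	if (ret)
 *		return ret;
 *	...dirty the pages...
 *	if (failed || inserted_inline_extent)
 *		btrfs_delalloc_release_space(inode, len);
 *
 * where "failed" and "inserted_inline_extent" stand for whatever error
 * or inline-extent condition the caller hit.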
5360 */ 5361 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5362 { 5363 btrfs_delalloc_release_metadata(inode, num_bytes); 5364 btrfs_free_reserved_data_space(inode, num_bytes); 5365 } 5366 5367 static int update_block_group(struct btrfs_root *root, 5368 u64 bytenr, u64 num_bytes, int alloc) 5369 { 5370 struct btrfs_block_group_cache *cache = NULL; 5371 struct btrfs_fs_info *info = root->fs_info; 5372 u64 total = num_bytes; 5373 u64 old_val; 5374 u64 byte_in_group; 5375 int factor; 5376 5377 /* block accounting for super block */ 5378 spin_lock(&info->delalloc_root_lock); 5379 old_val = btrfs_super_bytes_used(info->super_copy); 5380 if (alloc) 5381 old_val += num_bytes; 5382 else 5383 old_val -= num_bytes; 5384 btrfs_set_super_bytes_used(info->super_copy, old_val); 5385 spin_unlock(&info->delalloc_root_lock); 5386 5387 while (total) { 5388 cache = btrfs_lookup_block_group(info, bytenr); 5389 if (!cache) 5390 return -ENOENT; 5391 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5392 BTRFS_BLOCK_GROUP_RAID1 | 5393 BTRFS_BLOCK_GROUP_RAID10)) 5394 factor = 2; 5395 else 5396 factor = 1; 5397 /* 5398 * If this block group has free space cache written out, we 5399 * need to make sure to load it if we are removing space. This 5400 * is because we need the unpinning stage to actually add the 5401 * space back to the block group, otherwise we will leak space. 5402 */ 5403 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5404 cache_block_group(cache, 1); 5405 5406 byte_in_group = bytenr - cache->key.objectid; 5407 WARN_ON(byte_in_group > cache->key.offset); 5408 5409 spin_lock(&cache->space_info->lock); 5410 spin_lock(&cache->lock); 5411 5412 if (btrfs_test_opt(root, SPACE_CACHE) && 5413 cache->disk_cache_state < BTRFS_DC_CLEAR) 5414 cache->disk_cache_state = BTRFS_DC_CLEAR; 5415 5416 cache->dirty = 1; 5417 old_val = btrfs_block_group_used(&cache->item); 5418 num_bytes = min(total, cache->key.offset - byte_in_group); 5419 if (alloc) { 5420 old_val += num_bytes; 5421 btrfs_set_block_group_used(&cache->item, old_val); 5422 cache->reserved -= num_bytes; 5423 cache->space_info->bytes_reserved -= num_bytes; 5424 cache->space_info->bytes_used += num_bytes; 5425 cache->space_info->disk_used += num_bytes * factor; 5426 spin_unlock(&cache->lock); 5427 spin_unlock(&cache->space_info->lock); 5428 } else { 5429 old_val -= num_bytes; 5430 btrfs_set_block_group_used(&cache->item, old_val); 5431 cache->pinned += num_bytes; 5432 cache->space_info->bytes_pinned += num_bytes; 5433 cache->space_info->bytes_used -= num_bytes; 5434 cache->space_info->disk_used -= num_bytes * factor; 5435 spin_unlock(&cache->lock); 5436 spin_unlock(&cache->space_info->lock); 5437 5438 set_extent_dirty(info->pinned_extents, 5439 bytenr, bytenr + num_bytes - 1, 5440 GFP_NOFS | __GFP_NOFAIL); 5441 } 5442 btrfs_put_block_group(cache); 5443 total -= num_bytes; 5444 bytenr += num_bytes; 5445 } 5446 return 0; 5447 } 5448 5449 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5450 { 5451 struct btrfs_block_group_cache *cache; 5452 u64 bytenr; 5453 5454 spin_lock(&root->fs_info->block_group_cache_lock); 5455 bytenr = root->fs_info->first_logical_byte; 5456 spin_unlock(&root->fs_info->block_group_cache_lock); 5457 5458 if (bytenr < (u64)-1) 5459 return bytenr; 5460 5461 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5462 if (!cache) 5463 return 0; 5464 5465 bytenr = cache->key.objectid; 5466 btrfs_put_block_group(cache); 5467 5468 return bytenr; 5469 } 5470 5471 static int 
pin_down_extent(struct btrfs_root *root, 5472 struct btrfs_block_group_cache *cache, 5473 u64 bytenr, u64 num_bytes, int reserved) 5474 { 5475 spin_lock(&cache->space_info->lock); 5476 spin_lock(&cache->lock); 5477 cache->pinned += num_bytes; 5478 cache->space_info->bytes_pinned += num_bytes; 5479 if (reserved) { 5480 cache->reserved -= num_bytes; 5481 cache->space_info->bytes_reserved -= num_bytes; 5482 } 5483 spin_unlock(&cache->lock); 5484 spin_unlock(&cache->space_info->lock); 5485 5486 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5487 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5488 if (reserved) 5489 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); 5490 return 0; 5491 } 5492 5493 /* 5494 * this function must be called within transaction 5495 */ 5496 int btrfs_pin_extent(struct btrfs_root *root, 5497 u64 bytenr, u64 num_bytes, int reserved) 5498 { 5499 struct btrfs_block_group_cache *cache; 5500 5501 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5502 BUG_ON(!cache); /* Logic error */ 5503 5504 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5505 5506 btrfs_put_block_group(cache); 5507 return 0; 5508 } 5509 5510 /* 5511 * this function must be called within transaction 5512 */ 5513 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5514 u64 bytenr, u64 num_bytes) 5515 { 5516 struct btrfs_block_group_cache *cache; 5517 int ret; 5518 5519 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5520 if (!cache) 5521 return -EINVAL; 5522 5523 /* 5524 * pull in the free space cache (if any) so that our pin 5525 * removes the free space from the cache. We have load_only set 5526 * to one because the slow code to read in the free extents does check 5527 * the pinned extents. 5528 */ 5529 cache_block_group(cache, 1); 5530 5531 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5532 5533 /* remove us from the free space cache (if we're there at all) */ 5534 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5535 btrfs_put_block_group(cache); 5536 return ret; 5537 } 5538 5539 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 5540 { 5541 int ret; 5542 struct btrfs_block_group_cache *block_group; 5543 struct btrfs_caching_control *caching_ctl; 5544 5545 block_group = btrfs_lookup_block_group(root->fs_info, start); 5546 if (!block_group) 5547 return -EINVAL; 5548 5549 cache_block_group(block_group, 0); 5550 caching_ctl = get_caching_control(block_group); 5551 5552 if (!caching_ctl) { 5553 /* Logic error */ 5554 BUG_ON(!block_group_cache_done(block_group)); 5555 ret = btrfs_remove_free_space(block_group, start, num_bytes); 5556 } else { 5557 mutex_lock(&caching_ctl->mutex); 5558 5559 if (start >= caching_ctl->progress) { 5560 ret = add_excluded_extent(root, start, num_bytes); 5561 } else if (start + num_bytes <= caching_ctl->progress) { 5562 ret = btrfs_remove_free_space(block_group, 5563 start, num_bytes); 5564 } else { 5565 num_bytes = caching_ctl->progress - start; 5566 ret = btrfs_remove_free_space(block_group, 5567 start, num_bytes); 5568 if (ret) 5569 goto out_lock; 5570 5571 num_bytes = (start + num_bytes) - 5572 caching_ctl->progress; 5573 start = caching_ctl->progress; 5574 ret = add_excluded_extent(root, start, num_bytes); 5575 } 5576 out_lock: 5577 mutex_unlock(&caching_ctl->mutex); 5578 put_caching_control(caching_ctl); 5579 } 5580 btrfs_put_block_group(block_group); 5581 return ret; 5582 } 5583 5584 int btrfs_exclude_logged_extents(struct btrfs_root *log, 5585 struct extent_buffer *eb) 
5586 { 5587 struct btrfs_file_extent_item *item; 5588 struct btrfs_key key; 5589 int found_type; 5590 int i; 5591 5592 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 5593 return 0; 5594 5595 for (i = 0; i < btrfs_header_nritems(eb); i++) { 5596 btrfs_item_key_to_cpu(eb, &key, i); 5597 if (key.type != BTRFS_EXTENT_DATA_KEY) 5598 continue; 5599 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 5600 found_type = btrfs_file_extent_type(eb, item); 5601 if (found_type == BTRFS_FILE_EXTENT_INLINE) 5602 continue; 5603 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 5604 continue; 5605 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 5606 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 5607 __exclude_logged_extent(log, key.objectid, key.offset); 5608 } 5609 5610 return 0; 5611 } 5612 5613 /** 5614 * btrfs_update_reserved_bytes - update the block_group and space info counters 5615 * @cache: The cache we are manipulating 5616 * @num_bytes: The number of bytes in question 5617 * @reserve: One of the reservation enums 5618 * @delalloc: The blocks are allocated for the delalloc write 5619 * 5620 * This is called by the allocator when it reserves space, or by somebody who is 5621 * freeing space that was never actually used on disk. For example if you 5622 * reserve some space for a new leaf in transaction A and before transaction A 5623 * commits you free that leaf, you call this with reserve set to 0 in order to 5624 * clear the reservation. 5625 * 5626 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5627 * ENOSPC accounting. For data we handle the reservation through clearing the 5628 * delalloc bits in the io_tree. We have to do this since we could end up 5629 * allocating less disk space for the amount of data we have reserved in the 5630 * case of compression. 5631 * 5632 * If this is a reservation and the block group has become read only we cannot 5633 * make the reservation and return -EAGAIN, otherwise this function always 5634 * succeeds. 
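 * When @delalloc is set the same bytes are also added to (or removed
 * from) cache->delalloc_bytes below, so the block group keeps a running
 * count of how much of its reservation came from delalloc writes.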
5635 */ 5636 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5637 u64 num_bytes, int reserve, int delalloc) 5638 { 5639 struct btrfs_space_info *space_info = cache->space_info; 5640 int ret = 0; 5641 5642 spin_lock(&space_info->lock); 5643 spin_lock(&cache->lock); 5644 if (reserve != RESERVE_FREE) { 5645 if (cache->ro) { 5646 ret = -EAGAIN; 5647 } else { 5648 cache->reserved += num_bytes; 5649 space_info->bytes_reserved += num_bytes; 5650 if (reserve == RESERVE_ALLOC) { 5651 trace_btrfs_space_reservation(cache->fs_info, 5652 "space_info", space_info->flags, 5653 num_bytes, 0); 5654 space_info->bytes_may_use -= num_bytes; 5655 } 5656 5657 if (delalloc) 5658 cache->delalloc_bytes += num_bytes; 5659 } 5660 } else { 5661 if (cache->ro) 5662 space_info->bytes_readonly += num_bytes; 5663 cache->reserved -= num_bytes; 5664 space_info->bytes_reserved -= num_bytes; 5665 5666 if (delalloc) 5667 cache->delalloc_bytes -= num_bytes; 5668 } 5669 spin_unlock(&cache->lock); 5670 spin_unlock(&space_info->lock); 5671 return ret; 5672 } 5673 5674 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5675 struct btrfs_root *root) 5676 { 5677 struct btrfs_fs_info *fs_info = root->fs_info; 5678 struct btrfs_caching_control *next; 5679 struct btrfs_caching_control *caching_ctl; 5680 struct btrfs_block_group_cache *cache; 5681 struct btrfs_space_info *space_info; 5682 5683 down_write(&fs_info->commit_root_sem); 5684 5685 list_for_each_entry_safe(caching_ctl, next, 5686 &fs_info->caching_block_groups, list) { 5687 cache = caching_ctl->block_group; 5688 if (block_group_cache_done(cache)) { 5689 cache->last_byte_to_unpin = (u64)-1; 5690 list_del_init(&caching_ctl->list); 5691 put_caching_control(caching_ctl); 5692 } else { 5693 cache->last_byte_to_unpin = caching_ctl->progress; 5694 } 5695 } 5696 5697 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5698 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5699 else 5700 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5701 5702 up_write(&fs_info->commit_root_sem); 5703 5704 list_for_each_entry_rcu(space_info, &fs_info->space_info, list) 5705 percpu_counter_set(&space_info->total_bytes_pinned, 0); 5706 5707 update_global_block_rsv(fs_info); 5708 } 5709 5710 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5711 { 5712 struct btrfs_fs_info *fs_info = root->fs_info; 5713 struct btrfs_block_group_cache *cache = NULL; 5714 struct btrfs_space_info *space_info; 5715 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5716 u64 len; 5717 bool readonly; 5718 5719 while (start <= end) { 5720 readonly = false; 5721 if (!cache || 5722 start >= cache->key.objectid + cache->key.offset) { 5723 if (cache) 5724 btrfs_put_block_group(cache); 5725 cache = btrfs_lookup_block_group(fs_info, start); 5726 BUG_ON(!cache); /* Logic error */ 5727 } 5728 5729 len = cache->key.objectid + cache->key.offset - start; 5730 len = min(len, end + 1 - start); 5731 5732 if (start < cache->last_byte_to_unpin) { 5733 len = min(len, cache->last_byte_to_unpin - start); 5734 btrfs_add_free_space(cache, start, len); 5735 } 5736 5737 start += len; 5738 space_info = cache->space_info; 5739 5740 spin_lock(&space_info->lock); 5741 spin_lock(&cache->lock); 5742 cache->pinned -= len; 5743 space_info->bytes_pinned -= len; 5744 if (cache->ro) { 5745 space_info->bytes_readonly += len; 5746 readonly = true; 5747 } 5748 spin_unlock(&cache->lock); 5749 if (!readonly && global_rsv->space_info == space_info) { 5750 
spin_lock(&global_rsv->lock); 5751 if (!global_rsv->full) { 5752 len = min(len, global_rsv->size - 5753 global_rsv->reserved); 5754 global_rsv->reserved += len; 5755 space_info->bytes_may_use += len; 5756 if (global_rsv->reserved >= global_rsv->size) 5757 global_rsv->full = 1; 5758 } 5759 spin_unlock(&global_rsv->lock); 5760 } 5761 spin_unlock(&space_info->lock); 5762 } 5763 5764 if (cache) 5765 btrfs_put_block_group(cache); 5766 return 0; 5767 } 5768 5769 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5770 struct btrfs_root *root) 5771 { 5772 struct btrfs_fs_info *fs_info = root->fs_info; 5773 struct extent_io_tree *unpin; 5774 u64 start; 5775 u64 end; 5776 int ret; 5777 5778 if (trans->aborted) 5779 return 0; 5780 5781 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5782 unpin = &fs_info->freed_extents[1]; 5783 else 5784 unpin = &fs_info->freed_extents[0]; 5785 5786 while (1) { 5787 ret = find_first_extent_bit(unpin, 0, &start, &end, 5788 EXTENT_DIRTY, NULL); 5789 if (ret) 5790 break; 5791 5792 if (btrfs_test_opt(root, DISCARD)) 5793 ret = btrfs_discard_extent(root, start, 5794 end + 1 - start, NULL); 5795 5796 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5797 unpin_extent_range(root, start, end); 5798 cond_resched(); 5799 } 5800 5801 return 0; 5802 } 5803 5804 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 5805 u64 owner, u64 root_objectid) 5806 { 5807 struct btrfs_space_info *space_info; 5808 u64 flags; 5809 5810 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5811 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 5812 flags = BTRFS_BLOCK_GROUP_SYSTEM; 5813 else 5814 flags = BTRFS_BLOCK_GROUP_METADATA; 5815 } else { 5816 flags = BTRFS_BLOCK_GROUP_DATA; 5817 } 5818 5819 space_info = __find_space_info(fs_info, flags); 5820 BUG_ON(!space_info); /* Logic bug */ 5821 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 5822 } 5823 5824 5825 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5826 struct btrfs_root *root, 5827 u64 bytenr, u64 num_bytes, u64 parent, 5828 u64 root_objectid, u64 owner_objectid, 5829 u64 owner_offset, int refs_to_drop, 5830 struct btrfs_delayed_extent_op *extent_op, 5831 int no_quota) 5832 { 5833 struct btrfs_key key; 5834 struct btrfs_path *path; 5835 struct btrfs_fs_info *info = root->fs_info; 5836 struct btrfs_root *extent_root = info->extent_root; 5837 struct extent_buffer *leaf; 5838 struct btrfs_extent_item *ei; 5839 struct btrfs_extent_inline_ref *iref; 5840 int ret; 5841 int is_data; 5842 int extent_slot = 0; 5843 int found_extent = 0; 5844 int num_to_del = 1; 5845 u32 item_size; 5846 u64 refs; 5847 int last_ref = 0; 5848 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; 5849 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5850 SKINNY_METADATA); 5851 5852 if (!info->quota_enabled || !is_fstree(root_objectid)) 5853 no_quota = 1; 5854 5855 path = btrfs_alloc_path(); 5856 if (!path) 5857 return -ENOMEM; 5858 5859 path->reada = 1; 5860 path->leave_spinning = 1; 5861 5862 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 5863 BUG_ON(!is_data && refs_to_drop != 1); 5864 5865 if (is_data) 5866 skinny_metadata = 0; 5867 5868 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5869 bytenr, num_bytes, parent, 5870 root_objectid, owner_objectid, 5871 owner_offset); 5872 if (ret == 0) { 5873 extent_slot = path->slots[0]; 5874 while (extent_slot >= 0) { 5875 btrfs_item_key_to_cpu(path->nodes[0], &key, 5876 extent_slot); 5877 if (key.objectid != bytenr) 
5878 break; 5879 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5880 key.offset == num_bytes) { 5881 found_extent = 1; 5882 break; 5883 } 5884 if (key.type == BTRFS_METADATA_ITEM_KEY && 5885 key.offset == owner_objectid) { 5886 found_extent = 1; 5887 break; 5888 } 5889 if (path->slots[0] - extent_slot > 5) 5890 break; 5891 extent_slot--; 5892 } 5893 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5894 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5895 if (found_extent && item_size < sizeof(*ei)) 5896 found_extent = 0; 5897 #endif 5898 if (!found_extent) { 5899 BUG_ON(iref); 5900 ret = remove_extent_backref(trans, extent_root, path, 5901 NULL, refs_to_drop, 5902 is_data, &last_ref); 5903 if (ret) { 5904 btrfs_abort_transaction(trans, extent_root, ret); 5905 goto out; 5906 } 5907 btrfs_release_path(path); 5908 path->leave_spinning = 1; 5909 5910 key.objectid = bytenr; 5911 key.type = BTRFS_EXTENT_ITEM_KEY; 5912 key.offset = num_bytes; 5913 5914 if (!is_data && skinny_metadata) { 5915 key.type = BTRFS_METADATA_ITEM_KEY; 5916 key.offset = owner_objectid; 5917 } 5918 5919 ret = btrfs_search_slot(trans, extent_root, 5920 &key, path, -1, 1); 5921 if (ret > 0 && skinny_metadata && path->slots[0]) { 5922 /* 5923 * Couldn't find our skinny metadata item, 5924 * see if we have ye olde extent item. 5925 */ 5926 path->slots[0]--; 5927 btrfs_item_key_to_cpu(path->nodes[0], &key, 5928 path->slots[0]); 5929 if (key.objectid == bytenr && 5930 key.type == BTRFS_EXTENT_ITEM_KEY && 5931 key.offset == num_bytes) 5932 ret = 0; 5933 } 5934 5935 if (ret > 0 && skinny_metadata) { 5936 skinny_metadata = false; 5937 key.objectid = bytenr; 5938 key.type = BTRFS_EXTENT_ITEM_KEY; 5939 key.offset = num_bytes; 5940 btrfs_release_path(path); 5941 ret = btrfs_search_slot(trans, extent_root, 5942 &key, path, -1, 1); 5943 } 5944 5945 if (ret) { 5946 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5947 ret, bytenr); 5948 if (ret > 0) 5949 btrfs_print_leaf(extent_root, 5950 path->nodes[0]); 5951 } 5952 if (ret < 0) { 5953 btrfs_abort_transaction(trans, extent_root, ret); 5954 goto out; 5955 } 5956 extent_slot = path->slots[0]; 5957 } 5958 } else if (WARN_ON(ret == -ENOENT)) { 5959 btrfs_print_leaf(extent_root, path->nodes[0]); 5960 btrfs_err(info, 5961 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5962 bytenr, parent, root_objectid, owner_objectid, 5963 owner_offset); 5964 btrfs_abort_transaction(trans, extent_root, ret); 5965 goto out; 5966 } else { 5967 btrfs_abort_transaction(trans, extent_root, ret); 5968 goto out; 5969 } 5970 5971 leaf = path->nodes[0]; 5972 item_size = btrfs_item_size_nr(leaf, extent_slot); 5973 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5974 if (item_size < sizeof(*ei)) { 5975 BUG_ON(found_extent || extent_slot != path->slots[0]); 5976 ret = convert_extent_item_v0(trans, extent_root, path, 5977 owner_objectid, 0); 5978 if (ret < 0) { 5979 btrfs_abort_transaction(trans, extent_root, ret); 5980 goto out; 5981 } 5982 5983 btrfs_release_path(path); 5984 path->leave_spinning = 1; 5985 5986 key.objectid = bytenr; 5987 key.type = BTRFS_EXTENT_ITEM_KEY; 5988 key.offset = num_bytes; 5989 5990 ret = btrfs_search_slot(trans, extent_root, &key, path, 5991 -1, 1); 5992 if (ret) { 5993 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5994 ret, bytenr); 5995 btrfs_print_leaf(extent_root, path->nodes[0]); 5996 } 5997 if (ret < 0) { 5998 btrfs_abort_transaction(trans, extent_root, ret); 5999 goto out; 6000 } 6001 6002 extent_slot = path->slots[0]; 6003 
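		/*
		 * the v0 item was converted and the path re-searched
		 * above, so re-read the leaf and item size before
		 * using them
		 */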
leaf = path->nodes[0]; 6004 item_size = btrfs_item_size_nr(leaf, extent_slot); 6005 } 6006 #endif 6007 BUG_ON(item_size < sizeof(*ei)); 6008 ei = btrfs_item_ptr(leaf, extent_slot, 6009 struct btrfs_extent_item); 6010 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6011 key.type == BTRFS_EXTENT_ITEM_KEY) { 6012 struct btrfs_tree_block_info *bi; 6013 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6014 bi = (struct btrfs_tree_block_info *)(ei + 1); 6015 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6016 } 6017 6018 refs = btrfs_extent_refs(leaf, ei); 6019 if (refs < refs_to_drop) { 6020 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 6021 "for bytenr %Lu", refs_to_drop, refs, bytenr); 6022 ret = -EINVAL; 6023 btrfs_abort_transaction(trans, extent_root, ret); 6024 goto out; 6025 } 6026 refs -= refs_to_drop; 6027 6028 if (refs > 0) { 6029 type = BTRFS_QGROUP_OPER_SUB_SHARED; 6030 if (extent_op) 6031 __run_delayed_extent_op(extent_op, leaf, ei); 6032 /* 6033 * In the case of inline back ref, reference count will 6034 * be updated by remove_extent_backref 6035 */ 6036 if (iref) { 6037 BUG_ON(!found_extent); 6038 } else { 6039 btrfs_set_extent_refs(leaf, ei, refs); 6040 btrfs_mark_buffer_dirty(leaf); 6041 } 6042 if (found_extent) { 6043 ret = remove_extent_backref(trans, extent_root, path, 6044 iref, refs_to_drop, 6045 is_data, &last_ref); 6046 if (ret) { 6047 btrfs_abort_transaction(trans, extent_root, ret); 6048 goto out; 6049 } 6050 } 6051 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 6052 root_objectid); 6053 } else { 6054 if (found_extent) { 6055 BUG_ON(is_data && refs_to_drop != 6056 extent_data_ref_count(root, path, iref)); 6057 if (iref) { 6058 BUG_ON(path->slots[0] != extent_slot); 6059 } else { 6060 BUG_ON(path->slots[0] != extent_slot + 1); 6061 path->slots[0] = extent_slot; 6062 num_to_del = 2; 6063 } 6064 } 6065 6066 last_ref = 1; 6067 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6068 num_to_del); 6069 if (ret) { 6070 btrfs_abort_transaction(trans, extent_root, ret); 6071 goto out; 6072 } 6073 btrfs_release_path(path); 6074 6075 if (is_data) { 6076 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 6077 if (ret) { 6078 btrfs_abort_transaction(trans, extent_root, ret); 6079 goto out; 6080 } 6081 } 6082 6083 ret = update_block_group(root, bytenr, num_bytes, 0); 6084 if (ret) { 6085 btrfs_abort_transaction(trans, extent_root, ret); 6086 goto out; 6087 } 6088 } 6089 btrfs_release_path(path); 6090 6091 /* Deal with the quota accounting */ 6092 if (!ret && last_ref && !no_quota) { 6093 int mod_seq = 0; 6094 6095 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && 6096 type == BTRFS_QGROUP_OPER_SUB_SHARED) 6097 mod_seq = 1; 6098 6099 ret = btrfs_qgroup_record_ref(trans, info, root_objectid, 6100 bytenr, num_bytes, type, 6101 mod_seq); 6102 } 6103 out: 6104 btrfs_free_path(path); 6105 return ret; 6106 } 6107 6108 /* 6109 * when we free an block, it is possible (and likely) that we free the last 6110 * delayed ref for that extent as well. This searches the delayed ref tree for 6111 * a given extent, and if there are no other delayed refs to be processed, it 6112 * removes it from the tree. 
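 *
 * returns 1 if the last delayed ref head was removed and it still had
 * must_insert_reserved set, in which case the caller still has to
 * release (or pin) the reserved extent itself; returns 0 otherwise.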
6113 */ 6114 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 6115 struct btrfs_root *root, u64 bytenr) 6116 { 6117 struct btrfs_delayed_ref_head *head; 6118 struct btrfs_delayed_ref_root *delayed_refs; 6119 int ret = 0; 6120 6121 delayed_refs = &trans->transaction->delayed_refs; 6122 spin_lock(&delayed_refs->lock); 6123 head = btrfs_find_delayed_ref_head(trans, bytenr); 6124 if (!head) 6125 goto out_delayed_unlock; 6126 6127 spin_lock(&head->lock); 6128 if (rb_first(&head->ref_root)) 6129 goto out; 6130 6131 if (head->extent_op) { 6132 if (!head->must_insert_reserved) 6133 goto out; 6134 btrfs_free_delayed_extent_op(head->extent_op); 6135 head->extent_op = NULL; 6136 } 6137 6138 /* 6139 * waiting for the lock here would deadlock. If someone else has it 6140 * locked they are already in the process of dropping it anyway 6141 */ 6142 if (!mutex_trylock(&head->mutex)) 6143 goto out; 6144 6145 /* 6146 * at this point we have a head with no other entries. Go 6147 * ahead and process it. 6148 */ 6149 head->node.in_tree = 0; 6150 rb_erase(&head->href_node, &delayed_refs->href_root); 6151 6152 atomic_dec(&delayed_refs->num_entries); 6153 6154 /* 6155 * we don't take a ref on the node because we're removing it from the 6156 * tree, so we just steal the ref the tree was holding. 6157 */ 6158 delayed_refs->num_heads--; 6159 if (head->processing == 0) 6160 delayed_refs->num_heads_ready--; 6161 head->processing = 0; 6162 spin_unlock(&head->lock); 6163 spin_unlock(&delayed_refs->lock); 6164 6165 BUG_ON(head->extent_op); 6166 if (head->must_insert_reserved) 6167 ret = 1; 6168 6169 mutex_unlock(&head->mutex); 6170 btrfs_put_delayed_ref(&head->node); 6171 return ret; 6172 out: 6173 spin_unlock(&head->lock); 6174 6175 out_delayed_unlock: 6176 spin_unlock(&delayed_refs->lock); 6177 return 0; 6178 } 6179 6180 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 6181 struct btrfs_root *root, 6182 struct extent_buffer *buf, 6183 u64 parent, int last_ref) 6184 { 6185 struct btrfs_block_group_cache *cache = NULL; 6186 int pin = 1; 6187 int ret; 6188 6189 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6190 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6191 buf->start, buf->len, 6192 parent, root->root_key.objectid, 6193 btrfs_header_level(buf), 6194 BTRFS_DROP_DELAYED_REF, NULL, 0); 6195 BUG_ON(ret); /* -ENOMEM */ 6196 } 6197 6198 if (!last_ref) 6199 return; 6200 6201 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 6202 6203 if (btrfs_header_generation(buf) == trans->transid) { 6204 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6205 ret = check_ref_cleanup(trans, root, buf->start); 6206 if (!ret) 6207 goto out; 6208 } 6209 6210 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6211 pin_down_extent(root, cache, buf->start, buf->len, 1); 6212 goto out; 6213 } 6214 6215 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6216 6217 btrfs_add_free_space(cache, buf->start, buf->len); 6218 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 6219 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6220 pin = 0; 6221 } 6222 out: 6223 if (pin) 6224 add_pinned_bytes(root->fs_info, buf->len, 6225 btrfs_header_level(buf), 6226 root->root_key.objectid); 6227 6228 /* 6229 * Deleting the buffer, clear the corrupt flag since it doesn't matter 6230 * anymore. 
6231 */ 6232 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6233 btrfs_put_block_group(cache); 6234 } 6235 6236 /* Can return -ENOMEM */ 6237 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6238 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6239 u64 owner, u64 offset, int no_quota) 6240 { 6241 int ret; 6242 struct btrfs_fs_info *fs_info = root->fs_info; 6243 6244 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6245 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) 6246 return 0; 6247 #endif 6248 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6249 6250 /* 6251 * tree log blocks never actually go into the extent allocation 6252 * tree, just update pinning info and exit early. 6253 */ 6254 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 6255 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 6256 /* unlocks the pinned mutex */ 6257 btrfs_pin_extent(root, bytenr, num_bytes, 1); 6258 ret = 0; 6259 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6260 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6261 num_bytes, 6262 parent, root_objectid, (int)owner, 6263 BTRFS_DROP_DELAYED_REF, NULL, no_quota); 6264 } else { 6265 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6266 num_bytes, 6267 parent, root_objectid, owner, 6268 offset, BTRFS_DROP_DELAYED_REF, 6269 NULL, no_quota); 6270 } 6271 return ret; 6272 } 6273 6274 static u64 stripe_align(struct btrfs_root *root, 6275 struct btrfs_block_group_cache *cache, 6276 u64 val, u64 num_bytes) 6277 { 6278 u64 ret = ALIGN(val, root->stripesize); 6279 return ret; 6280 } 6281 6282 /* 6283 * when we wait for progress in the block group caching, its because 6284 * our allocation attempt failed at least once. So, we must sleep 6285 * and let some progress happen before we try again. 6286 * 6287 * This function will sleep at least once waiting for new free space to 6288 * show up, and then it will check the block group free space numbers 6289 * for our min num_bytes. Another option is to have it go ahead 6290 * and look in the rbtree for a free extent of a given size, but this 6291 * is a good start. 6292 * 6293 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 6294 * any of the information in this block group. 6295 */ 6296 static noinline void 6297 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 6298 u64 num_bytes) 6299 { 6300 struct btrfs_caching_control *caching_ctl; 6301 6302 caching_ctl = get_caching_control(cache); 6303 if (!caching_ctl) 6304 return; 6305 6306 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 6307 (cache->free_space_ctl->free_space >= num_bytes)); 6308 6309 put_caching_control(caching_ctl); 6310 } 6311 6312 static noinline int 6313 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 6314 { 6315 struct btrfs_caching_control *caching_ctl; 6316 int ret = 0; 6317 6318 caching_ctl = get_caching_control(cache); 6319 if (!caching_ctl) 6320 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 6321 6322 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 6323 if (cache->cached == BTRFS_CACHE_ERROR) 6324 ret = -EIO; 6325 put_caching_control(caching_ctl); 6326 return ret; 6327 } 6328 6329 int __get_raid_index(u64 flags) 6330 { 6331 if (flags & BTRFS_BLOCK_GROUP_RAID10) 6332 return BTRFS_RAID_RAID10; 6333 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 6334 return BTRFS_RAID_RAID1; 6335 else if (flags & BTRFS_BLOCK_GROUP_DUP) 6336 return BTRFS_RAID_DUP; 6337 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 6338 return BTRFS_RAID_RAID0; 6339 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 6340 return BTRFS_RAID_RAID5; 6341 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 6342 return BTRFS_RAID_RAID6; 6343 6344 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 6345 } 6346 6347 int get_block_group_index(struct btrfs_block_group_cache *cache) 6348 { 6349 return __get_raid_index(cache->flags); 6350 } 6351 6352 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 6353 [BTRFS_RAID_RAID10] = "raid10", 6354 [BTRFS_RAID_RAID1] = "raid1", 6355 [BTRFS_RAID_DUP] = "dup", 6356 [BTRFS_RAID_RAID0] = "raid0", 6357 [BTRFS_RAID_SINGLE] = "single", 6358 [BTRFS_RAID_RAID5] = "raid5", 6359 [BTRFS_RAID_RAID6] = "raid6", 6360 }; 6361 6362 static const char *get_raid_name(enum btrfs_raid_types type) 6363 { 6364 if (type >= BTRFS_NR_RAID_TYPES) 6365 return NULL; 6366 6367 return btrfs_raid_type_names[type]; 6368 } 6369 6370 enum btrfs_loop_type { 6371 LOOP_CACHING_NOWAIT = 0, 6372 LOOP_CACHING_WAIT = 1, 6373 LOOP_ALLOC_CHUNK = 2, 6374 LOOP_NO_EMPTY_SIZE = 3, 6375 }; 6376 6377 static inline void 6378 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 6379 int delalloc) 6380 { 6381 if (delalloc) 6382 down_read(&cache->data_rwsem); 6383 } 6384 6385 static inline void 6386 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 6387 int delalloc) 6388 { 6389 btrfs_get_block_group(cache); 6390 if (delalloc) 6391 down_read(&cache->data_rwsem); 6392 } 6393 6394 static struct btrfs_block_group_cache * 6395 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 6396 struct btrfs_free_cluster *cluster, 6397 int delalloc) 6398 { 6399 struct btrfs_block_group_cache *used_bg; 6400 bool locked = false; 6401 again: 6402 spin_lock(&cluster->refill_lock); 6403 if (locked) { 6404 if (used_bg == cluster->block_group) 6405 return used_bg; 6406 6407 up_read(&used_bg->data_rwsem); 6408 btrfs_put_block_group(used_bg); 6409 } 6410 6411 used_bg = cluster->block_group; 6412 if (!used_bg) 6413 return NULL; 6414 6415 if (used_bg == block_group) 6416 return used_bg; 6417 6418 btrfs_get_block_group(used_bg); 6419 6420 if (!delalloc) 6421 return used_bg; 6422 6423 if (down_read_trylock(&used_bg->data_rwsem)) 6424 return used_bg; 6425 6426 spin_unlock(&cluster->refill_lock); 6427 down_read(&used_bg->data_rwsem); 6428 locked = true; 6429 goto again; 6430 } 6431 6432 static inline void 6433 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 6434 int delalloc) 6435 { 6436 if (delalloc) 6437 up_read(&cache->data_rwsem); 6438 btrfs_put_block_group(cache); 6439 } 6440 6441 /* 6442 * walks the btree of allocated extents and find a hole of a given size. 6443 * The key ins is changed to record the hole: 6444 * ins->objectid == start position 6445 * ins->flags = BTRFS_EXTENT_ITEM_KEY 6446 * ins->offset == the size of the hole. 6447 * Any available blocks before search_start are skipped. 
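 *
 * The search below loops over the block groups of the matching
 * space_info, escalating through the btrfs_loop_type stages
 * (LOOP_CACHING_NOWAIT up to LOOP_NO_EMPTY_SIZE) until space is
 * reserved or we give up and return -ENOSPC.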
6448 * 6449 * If there is no suitable free space, we will record the max size of 6450 * the free space extent currently. 6451 */ 6452 static noinline int find_free_extent(struct btrfs_root *orig_root, 6453 u64 num_bytes, u64 empty_size, 6454 u64 hint_byte, struct btrfs_key *ins, 6455 u64 flags, int delalloc) 6456 { 6457 int ret = 0; 6458 struct btrfs_root *root = orig_root->fs_info->extent_root; 6459 struct btrfs_free_cluster *last_ptr = NULL; 6460 struct btrfs_block_group_cache *block_group = NULL; 6461 u64 search_start = 0; 6462 u64 max_extent_size = 0; 6463 int empty_cluster = 2 * 1024 * 1024; 6464 struct btrfs_space_info *space_info; 6465 int loop = 0; 6466 int index = __get_raid_index(flags); 6467 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 6468 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 6469 bool failed_cluster_refill = false; 6470 bool failed_alloc = false; 6471 bool use_cluster = true; 6472 bool have_caching_bg = false; 6473 6474 WARN_ON(num_bytes < root->sectorsize); 6475 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 6476 ins->objectid = 0; 6477 ins->offset = 0; 6478 6479 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 6480 6481 space_info = __find_space_info(root->fs_info, flags); 6482 if (!space_info) { 6483 btrfs_err(root->fs_info, "No space info for %llu", flags); 6484 return -ENOSPC; 6485 } 6486 6487 /* 6488 * If the space info is for both data and metadata it means we have a 6489 * small filesystem and we can't use the clustering stuff. 6490 */ 6491 if (btrfs_mixed_space_info(space_info)) 6492 use_cluster = false; 6493 6494 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 6495 last_ptr = &root->fs_info->meta_alloc_cluster; 6496 if (!btrfs_test_opt(root, SSD)) 6497 empty_cluster = 64 * 1024; 6498 } 6499 6500 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 6501 btrfs_test_opt(root, SSD)) { 6502 last_ptr = &root->fs_info->data_alloc_cluster; 6503 } 6504 6505 if (last_ptr) { 6506 spin_lock(&last_ptr->lock); 6507 if (last_ptr->block_group) 6508 hint_byte = last_ptr->window_start; 6509 spin_unlock(&last_ptr->lock); 6510 } 6511 6512 search_start = max(search_start, first_logical_byte(root, 0)); 6513 search_start = max(search_start, hint_byte); 6514 6515 if (!last_ptr) 6516 empty_cluster = 0; 6517 6518 if (search_start == hint_byte) { 6519 block_group = btrfs_lookup_block_group(root->fs_info, 6520 search_start); 6521 /* 6522 * we don't want to use the block group if it doesn't match our 6523 * allocation bits, or if its not cached. 6524 * 6525 * However if we are re-searching with an ideal block group 6526 * picked out then we don't care that the block group is cached. 
6527 */ 6528 if (block_group && block_group_bits(block_group, flags) && 6529 block_group->cached != BTRFS_CACHE_NO) { 6530 down_read(&space_info->groups_sem); 6531 if (list_empty(&block_group->list) || 6532 block_group->ro) { 6533 /* 6534 * someone is removing this block group, 6535 * we can't jump into the have_block_group 6536 * target because our list pointers are not 6537 * valid 6538 */ 6539 btrfs_put_block_group(block_group); 6540 up_read(&space_info->groups_sem); 6541 } else { 6542 index = get_block_group_index(block_group); 6543 btrfs_lock_block_group(block_group, delalloc); 6544 goto have_block_group; 6545 } 6546 } else if (block_group) { 6547 btrfs_put_block_group(block_group); 6548 } 6549 } 6550 search: 6551 have_caching_bg = false; 6552 down_read(&space_info->groups_sem); 6553 list_for_each_entry(block_group, &space_info->block_groups[index], 6554 list) { 6555 u64 offset; 6556 int cached; 6557 6558 btrfs_grab_block_group(block_group, delalloc); 6559 search_start = block_group->key.objectid; 6560 6561 /* 6562 * this can happen if we end up cycling through all the 6563 * raid types, but we want to make sure we only allocate 6564 * for the proper type. 6565 */ 6566 if (!block_group_bits(block_group, flags)) { 6567 u64 extra = BTRFS_BLOCK_GROUP_DUP | 6568 BTRFS_BLOCK_GROUP_RAID1 | 6569 BTRFS_BLOCK_GROUP_RAID5 | 6570 BTRFS_BLOCK_GROUP_RAID6 | 6571 BTRFS_BLOCK_GROUP_RAID10; 6572 6573 /* 6574 * if they asked for extra copies and this block group 6575 * doesn't provide them, bail. This does allow us to 6576 * fill raid0 from raid1. 6577 */ 6578 if ((flags & extra) && !(block_group->flags & extra)) 6579 goto loop; 6580 } 6581 6582 have_block_group: 6583 cached = block_group_cache_done(block_group); 6584 if (unlikely(!cached)) { 6585 ret = cache_block_group(block_group, 0); 6586 BUG_ON(ret < 0); 6587 ret = 0; 6588 } 6589 6590 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 6591 goto loop; 6592 if (unlikely(block_group->ro)) 6593 goto loop; 6594 6595 /* 6596 * Ok we want to try and use the cluster allocator, so 6597 * lets look there 6598 */ 6599 if (last_ptr) { 6600 struct btrfs_block_group_cache *used_block_group; 6601 unsigned long aligned_cluster; 6602 /* 6603 * the refill lock keeps out other 6604 * people trying to start a new cluster 6605 */ 6606 used_block_group = btrfs_lock_cluster(block_group, 6607 last_ptr, 6608 delalloc); 6609 if (!used_block_group) 6610 goto refill_cluster; 6611 6612 if (used_block_group != block_group && 6613 (used_block_group->ro || 6614 !block_group_bits(used_block_group, flags))) 6615 goto release_cluster; 6616 6617 offset = btrfs_alloc_from_cluster(used_block_group, 6618 last_ptr, 6619 num_bytes, 6620 used_block_group->key.objectid, 6621 &max_extent_size); 6622 if (offset) { 6623 /* we have a block, we're done */ 6624 spin_unlock(&last_ptr->refill_lock); 6625 trace_btrfs_reserve_extent_cluster(root, 6626 used_block_group, 6627 search_start, num_bytes); 6628 if (used_block_group != block_group) { 6629 btrfs_release_block_group(block_group, 6630 delalloc); 6631 block_group = used_block_group; 6632 } 6633 goto checks; 6634 } 6635 6636 WARN_ON(last_ptr->block_group != used_block_group); 6637 release_cluster: 6638 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6639 * set up a new clusters, so lets just skip it 6640 * and let the allocator find whatever block 6641 * it can find. 
If we reach this point, we 6642 * will have tried the cluster allocator 6643 * plenty of times and not have found 6644 * anything, so we are likely way too 6645 * fragmented for the clustering stuff to find 6646 * anything. 6647 * 6648 * However, if the cluster is taken from the 6649 * current block group, release the cluster 6650 * first, so that we stand a better chance of 6651 * succeeding in the unclustered 6652 * allocation. */ 6653 if (loop >= LOOP_NO_EMPTY_SIZE && 6654 used_block_group != block_group) { 6655 spin_unlock(&last_ptr->refill_lock); 6656 btrfs_release_block_group(used_block_group, 6657 delalloc); 6658 goto unclustered_alloc; 6659 } 6660 6661 /* 6662 * this cluster didn't work out, free it and 6663 * start over 6664 */ 6665 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6666 6667 if (used_block_group != block_group) 6668 btrfs_release_block_group(used_block_group, 6669 delalloc); 6670 refill_cluster: 6671 if (loop >= LOOP_NO_EMPTY_SIZE) { 6672 spin_unlock(&last_ptr->refill_lock); 6673 goto unclustered_alloc; 6674 } 6675 6676 aligned_cluster = max_t(unsigned long, 6677 empty_cluster + empty_size, 6678 block_group->full_stripe_len); 6679 6680 /* allocate a cluster in this block group */ 6681 ret = btrfs_find_space_cluster(root, block_group, 6682 last_ptr, search_start, 6683 num_bytes, 6684 aligned_cluster); 6685 if (ret == 0) { 6686 /* 6687 * now pull our allocation out of this 6688 * cluster 6689 */ 6690 offset = btrfs_alloc_from_cluster(block_group, 6691 last_ptr, 6692 num_bytes, 6693 search_start, 6694 &max_extent_size); 6695 if (offset) { 6696 /* we found one, proceed */ 6697 spin_unlock(&last_ptr->refill_lock); 6698 trace_btrfs_reserve_extent_cluster(root, 6699 block_group, search_start, 6700 num_bytes); 6701 goto checks; 6702 } 6703 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6704 && !failed_cluster_refill) { 6705 spin_unlock(&last_ptr->refill_lock); 6706 6707 failed_cluster_refill = true; 6708 wait_block_group_cache_progress(block_group, 6709 num_bytes + empty_cluster + empty_size); 6710 goto have_block_group; 6711 } 6712 6713 /* 6714 * at this point we either didn't find a cluster 6715 * or we weren't able to allocate a block from our 6716 * cluster. Free the cluster we've been trying 6717 * to use, and go to the next block group 6718 */ 6719 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6720 spin_unlock(&last_ptr->refill_lock); 6721 goto loop; 6722 } 6723 6724 unclustered_alloc: 6725 spin_lock(&block_group->free_space_ctl->tree_lock); 6726 if (cached && 6727 block_group->free_space_ctl->free_space < 6728 num_bytes + empty_cluster + empty_size) { 6729 if (block_group->free_space_ctl->free_space > 6730 max_extent_size) 6731 max_extent_size = 6732 block_group->free_space_ctl->free_space; 6733 spin_unlock(&block_group->free_space_ctl->tree_lock); 6734 goto loop; 6735 } 6736 spin_unlock(&block_group->free_space_ctl->tree_lock); 6737 6738 offset = btrfs_find_space_for_alloc(block_group, search_start, 6739 num_bytes, empty_size, 6740 &max_extent_size); 6741 /* 6742 * If we didn't find a chunk, and we haven't failed on this 6743 * block group before, and this block group is in the middle of 6744 * caching and we are ok with waiting, then go ahead and wait 6745 * for progress to be made, and set failed_alloc to true. 6746 * 6747 * If failed_alloc is true then we've already waited on this 6748 * block group once and should move on to the next block group. 
6749 */ 6750 if (!offset && !failed_alloc && !cached && 6751 loop > LOOP_CACHING_NOWAIT) { 6752 wait_block_group_cache_progress(block_group, 6753 num_bytes + empty_size); 6754 failed_alloc = true; 6755 goto have_block_group; 6756 } else if (!offset) { 6757 if (!cached) 6758 have_caching_bg = true; 6759 goto loop; 6760 } 6761 checks: 6762 search_start = stripe_align(root, block_group, 6763 offset, num_bytes); 6764 6765 /* move on to the next group */ 6766 if (search_start + num_bytes > 6767 block_group->key.objectid + block_group->key.offset) { 6768 btrfs_add_free_space(block_group, offset, num_bytes); 6769 goto loop; 6770 } 6771 6772 if (offset < search_start) 6773 btrfs_add_free_space(block_group, offset, 6774 search_start - offset); 6775 BUG_ON(offset > search_start); 6776 6777 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 6778 alloc_type, delalloc); 6779 if (ret == -EAGAIN) { 6780 btrfs_add_free_space(block_group, offset, num_bytes); 6781 goto loop; 6782 } 6783 6784 /* we are all good, lets return */ 6785 ins->objectid = search_start; 6786 ins->offset = num_bytes; 6787 6788 trace_btrfs_reserve_extent(orig_root, block_group, 6789 search_start, num_bytes); 6790 btrfs_release_block_group(block_group, delalloc); 6791 break; 6792 loop: 6793 failed_cluster_refill = false; 6794 failed_alloc = false; 6795 BUG_ON(index != get_block_group_index(block_group)); 6796 btrfs_release_block_group(block_group, delalloc); 6797 } 6798 up_read(&space_info->groups_sem); 6799 6800 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6801 goto search; 6802 6803 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6804 goto search; 6805 6806 /* 6807 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6808 * caching kthreads as we move along 6809 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6810 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6811 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6812 * again 6813 */ 6814 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6815 index = 0; 6816 loop++; 6817 if (loop == LOOP_ALLOC_CHUNK) { 6818 struct btrfs_trans_handle *trans; 6819 int exist = 0; 6820 6821 trans = current->journal_info; 6822 if (trans) 6823 exist = 1; 6824 else 6825 trans = btrfs_join_transaction(root); 6826 6827 if (IS_ERR(trans)) { 6828 ret = PTR_ERR(trans); 6829 goto out; 6830 } 6831 6832 ret = do_chunk_alloc(trans, root, flags, 6833 CHUNK_ALLOC_FORCE); 6834 /* 6835 * Do not bail out on ENOSPC since we 6836 * can do more things. 6837 */ 6838 if (ret < 0 && ret != -ENOSPC) 6839 btrfs_abort_transaction(trans, 6840 root, ret); 6841 else 6842 ret = 0; 6843 if (!exist) 6844 btrfs_end_transaction(trans, root); 6845 if (ret) 6846 goto out; 6847 } 6848 6849 if (loop == LOOP_NO_EMPTY_SIZE) { 6850 empty_size = 0; 6851 empty_cluster = 0; 6852 } 6853 6854 goto search; 6855 } else if (!ins->objectid) { 6856 ret = -ENOSPC; 6857 } else if (ins->objectid) { 6858 ret = 0; 6859 } 6860 out: 6861 if (ret == -ENOSPC) 6862 ins->offset = max_extent_size; 6863 return ret; 6864 } 6865 6866 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6867 int dump_block_groups) 6868 { 6869 struct btrfs_block_group_cache *cache; 6870 int index = 0; 6871 6872 spin_lock(&info->lock); 6873 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", 6874 info->flags, 6875 info->total_bytes - info->bytes_used - info->bytes_pinned - 6876 info->bytes_reserved - info->bytes_readonly, 6877 (info->full) ? 
"" : "not "); 6878 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " 6879 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6880 info->total_bytes, info->bytes_used, info->bytes_pinned, 6881 info->bytes_reserved, info->bytes_may_use, 6882 info->bytes_readonly); 6883 spin_unlock(&info->lock); 6884 6885 if (!dump_block_groups) 6886 return; 6887 6888 down_read(&info->groups_sem); 6889 again: 6890 list_for_each_entry(cache, &info->block_groups[index], list) { 6891 spin_lock(&cache->lock); 6892 printk(KERN_INFO "BTRFS: " 6893 "block group %llu has %llu bytes, " 6894 "%llu used %llu pinned %llu reserved %s\n", 6895 cache->key.objectid, cache->key.offset, 6896 btrfs_block_group_used(&cache->item), cache->pinned, 6897 cache->reserved, cache->ro ? "[readonly]" : ""); 6898 btrfs_dump_free_space(cache, bytes); 6899 spin_unlock(&cache->lock); 6900 } 6901 if (++index < BTRFS_NR_RAID_TYPES) 6902 goto again; 6903 up_read(&info->groups_sem); 6904 } 6905 6906 int btrfs_reserve_extent(struct btrfs_root *root, 6907 u64 num_bytes, u64 min_alloc_size, 6908 u64 empty_size, u64 hint_byte, 6909 struct btrfs_key *ins, int is_data, int delalloc) 6910 { 6911 bool final_tried = false; 6912 u64 flags; 6913 int ret; 6914 6915 flags = btrfs_get_alloc_profile(root, is_data); 6916 again: 6917 WARN_ON(num_bytes < root->sectorsize); 6918 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 6919 flags, delalloc); 6920 6921 if (ret == -ENOSPC) { 6922 if (!final_tried && ins->offset) { 6923 num_bytes = min(num_bytes >> 1, ins->offset); 6924 num_bytes = round_down(num_bytes, root->sectorsize); 6925 num_bytes = max(num_bytes, min_alloc_size); 6926 if (num_bytes == min_alloc_size) 6927 final_tried = true; 6928 goto again; 6929 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6930 struct btrfs_space_info *sinfo; 6931 6932 sinfo = __find_space_info(root->fs_info, flags); 6933 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6934 flags, num_bytes); 6935 if (sinfo) 6936 dump_space_info(sinfo, num_bytes, 1); 6937 } 6938 } 6939 6940 return ret; 6941 } 6942 6943 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6944 u64 start, u64 len, 6945 int pin, int delalloc) 6946 { 6947 struct btrfs_block_group_cache *cache; 6948 int ret = 0; 6949 6950 cache = btrfs_lookup_block_group(root->fs_info, start); 6951 if (!cache) { 6952 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6953 start); 6954 return -ENOSPC; 6955 } 6956 6957 if (btrfs_test_opt(root, DISCARD)) 6958 ret = btrfs_discard_extent(root, start, len, NULL); 6959 6960 if (pin) 6961 pin_down_extent(root, cache, start, len, 1); 6962 else { 6963 btrfs_add_free_space(cache, start, len); 6964 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 6965 } 6966 btrfs_put_block_group(cache); 6967 6968 trace_btrfs_reserved_extent_free(root, start, len); 6969 6970 return ret; 6971 } 6972 6973 int btrfs_free_reserved_extent(struct btrfs_root *root, 6974 u64 start, u64 len, int delalloc) 6975 { 6976 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); 6977 } 6978 6979 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6980 u64 start, u64 len) 6981 { 6982 return __btrfs_free_reserved_extent(root, start, len, 1, 0); 6983 } 6984 6985 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6986 struct btrfs_root *root, 6987 u64 parent, u64 root_objectid, 6988 u64 flags, u64 owner, u64 offset, 6989 struct btrfs_key *ins, int ref_mod) 6990 { 6991 int ret; 6992 struct 
btrfs_fs_info *fs_info = root->fs_info; 6993 struct btrfs_extent_item *extent_item; 6994 struct btrfs_extent_inline_ref *iref; 6995 struct btrfs_path *path; 6996 struct extent_buffer *leaf; 6997 int type; 6998 u32 size; 6999 7000 if (parent > 0) 7001 type = BTRFS_SHARED_DATA_REF_KEY; 7002 else 7003 type = BTRFS_EXTENT_DATA_REF_KEY; 7004 7005 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 7006 7007 path = btrfs_alloc_path(); 7008 if (!path) 7009 return -ENOMEM; 7010 7011 path->leave_spinning = 1; 7012 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7013 ins, size); 7014 if (ret) { 7015 btrfs_free_path(path); 7016 return ret; 7017 } 7018 7019 leaf = path->nodes[0]; 7020 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7021 struct btrfs_extent_item); 7022 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 7023 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7024 btrfs_set_extent_flags(leaf, extent_item, 7025 flags | BTRFS_EXTENT_FLAG_DATA); 7026 7027 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7028 btrfs_set_extent_inline_ref_type(leaf, iref, type); 7029 if (parent > 0) { 7030 struct btrfs_shared_data_ref *ref; 7031 ref = (struct btrfs_shared_data_ref *)(iref + 1); 7032 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7033 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 7034 } else { 7035 struct btrfs_extent_data_ref *ref; 7036 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 7037 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 7038 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 7039 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 7040 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 7041 } 7042 7043 btrfs_mark_buffer_dirty(path->nodes[0]); 7044 btrfs_free_path(path); 7045 7046 /* Always set parent to 0 here since its exclusive anyway. 
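	 * The ref below is therefore recorded as BTRFS_QGROUP_OPER_ADD_EXCL,
	 * i.e. the new data extent is charged to @root_objectid as
	 * exclusively owned space.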
*/ 7047 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7048 ins->objectid, ins->offset, 7049 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7050 if (ret) 7051 return ret; 7052 7053 ret = update_block_group(root, ins->objectid, ins->offset, 1); 7054 if (ret) { /* -ENOENT, logic error */ 7055 btrfs_err(fs_info, "update block group failed for %llu %llu", 7056 ins->objectid, ins->offset); 7057 BUG(); 7058 } 7059 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 7060 return ret; 7061 } 7062 7063 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 7064 struct btrfs_root *root, 7065 u64 parent, u64 root_objectid, 7066 u64 flags, struct btrfs_disk_key *key, 7067 int level, struct btrfs_key *ins, 7068 int no_quota) 7069 { 7070 int ret; 7071 struct btrfs_fs_info *fs_info = root->fs_info; 7072 struct btrfs_extent_item *extent_item; 7073 struct btrfs_tree_block_info *block_info; 7074 struct btrfs_extent_inline_ref *iref; 7075 struct btrfs_path *path; 7076 struct extent_buffer *leaf; 7077 u32 size = sizeof(*extent_item) + sizeof(*iref); 7078 u64 num_bytes = ins->offset; 7079 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7080 SKINNY_METADATA); 7081 7082 if (!skinny_metadata) 7083 size += sizeof(*block_info); 7084 7085 path = btrfs_alloc_path(); 7086 if (!path) { 7087 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7088 root->leafsize); 7089 return -ENOMEM; 7090 } 7091 7092 path->leave_spinning = 1; 7093 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7094 ins, size); 7095 if (ret) { 7096 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7097 root->leafsize); 7098 btrfs_free_path(path); 7099 return ret; 7100 } 7101 7102 leaf = path->nodes[0]; 7103 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7104 struct btrfs_extent_item); 7105 btrfs_set_extent_refs(leaf, extent_item, 1); 7106 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7107 btrfs_set_extent_flags(leaf, extent_item, 7108 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 7109 7110 if (skinny_metadata) { 7111 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7112 num_bytes = root->leafsize; 7113 } else { 7114 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7115 btrfs_set_tree_block_key(leaf, block_info, key); 7116 btrfs_set_tree_block_level(leaf, block_info, level); 7117 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 7118 } 7119 7120 if (parent > 0) { 7121 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 7122 btrfs_set_extent_inline_ref_type(leaf, iref, 7123 BTRFS_SHARED_BLOCK_REF_KEY); 7124 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7125 } else { 7126 btrfs_set_extent_inline_ref_type(leaf, iref, 7127 BTRFS_TREE_BLOCK_REF_KEY); 7128 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 7129 } 7130 7131 btrfs_mark_buffer_dirty(leaf); 7132 btrfs_free_path(path); 7133 7134 if (!no_quota) { 7135 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7136 ins->objectid, num_bytes, 7137 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7138 if (ret) 7139 return ret; 7140 } 7141 7142 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7143 if (ret) { /* -ENOENT, logic error */ 7144 btrfs_err(fs_info, "update block group failed for %llu %llu", 7145 ins->objectid, ins->offset); 7146 BUG(); 7147 } 7148 7149 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); 7150 return ret; 7151 } 7152 7153 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7154 struct btrfs_root *root, 
				     u64 root_objectid, u64 owner,
				     u64 offset, struct btrfs_key *ins)
{
	int ret;

	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);

	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
					 ins->offset, 0,
					 root_objectid, owner, offset,
					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
	return ret;
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	int ret;
	struct btrfs_block_group_cache *block_group;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
					  RESERVE_ALLOC_NO_ACCOUNT, 0);
	BUG_ON(ret); /* logic error */
	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
					 0, owner, offset, ins, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, u32 blocksize, int level)
{
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
	if (!buf)
		return ERR_PTR(-ENOMEM);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
	btrfs_tree_lock(buf);
	clean_tree_block(trans, root, buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);

	btrfs_set_lock_blocking(buf);
	btrfs_set_buffer_uptodate(buf);

	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bits to differentiate dirty pages.
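		 * (even log transids dirty the pages with EXTENT_DIRTY,
		 * odd ones use EXTENT_NEW; see the log_transid check
		 * below)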
7227 */ 7228 if (root->log_transid % 2 == 0) 7229 set_extent_dirty(&root->dirty_log_pages, buf->start, 7230 buf->start + buf->len - 1, GFP_NOFS); 7231 else 7232 set_extent_new(&root->dirty_log_pages, buf->start, 7233 buf->start + buf->len - 1, GFP_NOFS); 7234 } else { 7235 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7236 buf->start + buf->len - 1, GFP_NOFS); 7237 } 7238 trans->blocks_used++; 7239 /* this returns a buffer locked for blocking */ 7240 return buf; 7241 } 7242 7243 static struct btrfs_block_rsv * 7244 use_block_rsv(struct btrfs_trans_handle *trans, 7245 struct btrfs_root *root, u32 blocksize) 7246 { 7247 struct btrfs_block_rsv *block_rsv; 7248 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 7249 int ret; 7250 bool global_updated = false; 7251 7252 block_rsv = get_block_rsv(trans, root); 7253 7254 if (unlikely(block_rsv->size == 0)) 7255 goto try_reserve; 7256 again: 7257 ret = block_rsv_use_bytes(block_rsv, blocksize); 7258 if (!ret) 7259 return block_rsv; 7260 7261 if (block_rsv->failfast) 7262 return ERR_PTR(ret); 7263 7264 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 7265 global_updated = true; 7266 update_global_block_rsv(root->fs_info); 7267 goto again; 7268 } 7269 7270 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 7271 static DEFINE_RATELIMIT_STATE(_rs, 7272 DEFAULT_RATELIMIT_INTERVAL * 10, 7273 /*DEFAULT_RATELIMIT_BURST*/ 1); 7274 if (__ratelimit(&_rs)) 7275 WARN(1, KERN_DEBUG 7276 "BTRFS: block rsv returned %d\n", ret); 7277 } 7278 try_reserve: 7279 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 7280 BTRFS_RESERVE_NO_FLUSH); 7281 if (!ret) 7282 return block_rsv; 7283 /* 7284 * If we couldn't reserve metadata bytes try and use some from 7285 * the global reserve if its space type is the same as the global 7286 * reservation. 7287 */ 7288 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 7289 block_rsv->space_info == global_rsv->space_info) { 7290 ret = block_rsv_use_bytes(global_rsv, blocksize); 7291 if (!ret) 7292 return global_rsv; 7293 } 7294 return ERR_PTR(ret); 7295 } 7296 7297 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 7298 struct btrfs_block_rsv *block_rsv, u32 blocksize) 7299 { 7300 block_rsv_add_bytes(block_rsv, blocksize, 0); 7301 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 7302 } 7303 7304 /* 7305 * finds a free extent and does all the dirty work required for allocation 7306 * returns the key for the extent through ins, and a tree buffer for 7307 * the first block of the extent through buf. 7308 * 7309 * returns the tree buffer or NULL. 
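 *
 * NOTE: in this code the failure paths actually hand back an ERR_PTR
 * (from use_block_rsv() or btrfs_reserve_extent()) rather than a bare
 * NULL, so the result should be checked with IS_ERR().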
7310 */ 7311 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 7312 struct btrfs_root *root, u32 blocksize, 7313 u64 parent, u64 root_objectid, 7314 struct btrfs_disk_key *key, int level, 7315 u64 hint, u64 empty_size) 7316 { 7317 struct btrfs_key ins; 7318 struct btrfs_block_rsv *block_rsv; 7319 struct extent_buffer *buf; 7320 u64 flags = 0; 7321 int ret; 7322 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7323 SKINNY_METADATA); 7324 7325 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7326 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { 7327 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7328 blocksize, level); 7329 if (!IS_ERR(buf)) 7330 root->alloc_bytenr += blocksize; 7331 return buf; 7332 } 7333 #endif 7334 block_rsv = use_block_rsv(trans, root, blocksize); 7335 if (IS_ERR(block_rsv)) 7336 return ERR_CAST(block_rsv); 7337 7338 ret = btrfs_reserve_extent(root, blocksize, blocksize, 7339 empty_size, hint, &ins, 0, 0); 7340 if (ret) { 7341 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 7342 return ERR_PTR(ret); 7343 } 7344 7345 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 7346 blocksize, level); 7347 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7348 7349 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7350 if (parent == 0) 7351 parent = ins.objectid; 7352 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 7353 } else 7354 BUG_ON(parent > 0); 7355 7356 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 7357 struct btrfs_delayed_extent_op *extent_op; 7358 extent_op = btrfs_alloc_delayed_extent_op(); 7359 BUG_ON(!extent_op); /* -ENOMEM */ 7360 if (key) 7361 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 7362 else 7363 memset(&extent_op->key, 0, sizeof(extent_op->key)); 7364 extent_op->flags_to_set = flags; 7365 if (skinny_metadata) 7366 extent_op->update_key = 0; 7367 else 7368 extent_op->update_key = 1; 7369 extent_op->update_flags = 1; 7370 extent_op->is_data = 0; 7371 extent_op->level = level; 7372 7373 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 7374 ins.objectid, 7375 ins.offset, parent, root_objectid, 7376 level, BTRFS_ADD_DELAYED_EXTENT, 7377 extent_op, 0); 7378 BUG_ON(ret); /* -ENOMEM */ 7379 } 7380 return buf; 7381 } 7382 7383 struct walk_control { 7384 u64 refs[BTRFS_MAX_LEVEL]; 7385 u64 flags[BTRFS_MAX_LEVEL]; 7386 struct btrfs_key update_progress; 7387 int stage; 7388 int level; 7389 int shared_level; 7390 int update_ref; 7391 int keep_locks; 7392 int reada_slot; 7393 int reada_count; 7394 int for_reloc; 7395 }; 7396 7397 #define DROP_REFERENCE 1 7398 #define UPDATE_BACKREF 2 7399 7400 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 7401 struct btrfs_root *root, 7402 struct walk_control *wc, 7403 struct btrfs_path *path) 7404 { 7405 u64 bytenr; 7406 u64 generation; 7407 u64 refs; 7408 u64 flags; 7409 u32 nritems; 7410 u32 blocksize; 7411 struct btrfs_key key; 7412 struct extent_buffer *eb; 7413 int ret; 7414 int slot; 7415 int nread = 0; 7416 7417 if (path->slots[wc->level] < wc->reada_slot) { 7418 wc->reada_count = wc->reada_count * 2 / 3; 7419 wc->reada_count = max(wc->reada_count, 2); 7420 } else { 7421 wc->reada_count = wc->reada_count * 3 / 2; 7422 wc->reada_count = min_t(int, wc->reada_count, 7423 BTRFS_NODEPTRS_PER_BLOCK(root)); 7424 } 7425 7426 eb = path->nodes[wc->level]; 7427 nritems = btrfs_header_nritems(eb); 7428 blocksize = btrfs_level_size(root, wc->level - 1); 7429 7430 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7431 if (nread >= wc->reada_count) 
7432 break; 7433 7434 cond_resched(); 7435 bytenr = btrfs_node_blockptr(eb, slot); 7436 generation = btrfs_node_ptr_generation(eb, slot); 7437 7438 if (slot == path->slots[wc->level]) 7439 goto reada; 7440 7441 if (wc->stage == UPDATE_BACKREF && 7442 generation <= root->root_key.offset) 7443 continue; 7444 7445 /* We don't lock the tree block, it's OK to be racy here */ 7446 ret = btrfs_lookup_extent_info(trans, root, bytenr, 7447 wc->level - 1, 1, &refs, 7448 &flags); 7449 /* We don't care about errors in readahead. */ 7450 if (ret < 0) 7451 continue; 7452 BUG_ON(refs == 0); 7453 7454 if (wc->stage == DROP_REFERENCE) { 7455 if (refs == 1) 7456 goto reada; 7457 7458 if (wc->level == 1 && 7459 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7460 continue; 7461 if (!wc->update_ref || 7462 generation <= root->root_key.offset) 7463 continue; 7464 btrfs_node_key_to_cpu(eb, &key, slot); 7465 ret = btrfs_comp_cpu_keys(&key, 7466 &wc->update_progress); 7467 if (ret < 0) 7468 continue; 7469 } else { 7470 if (wc->level == 1 && 7471 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7472 continue; 7473 } 7474 reada: 7475 ret = readahead_tree_block(root, bytenr, blocksize, 7476 generation); 7477 if (ret) 7478 break; 7479 nread++; 7480 } 7481 wc->reada_slot = slot; 7482 } 7483 7484 /* 7485 * helper to process tree block while walking down the tree. 7486 * 7487 * when wc->stage == UPDATE_BACKREF, this function updates 7488 * back refs for pointers in the block. 7489 * 7490 * NOTE: return value 1 means we should stop walking down. 7491 */ 7492 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 7493 struct btrfs_root *root, 7494 struct btrfs_path *path, 7495 struct walk_control *wc, int lookup_info) 7496 { 7497 int level = wc->level; 7498 struct extent_buffer *eb = path->nodes[level]; 7499 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7500 int ret; 7501 7502 if (wc->stage == UPDATE_BACKREF && 7503 btrfs_header_owner(eb) != root->root_key.objectid) 7504 return 1; 7505 7506 /* 7507 * when reference count of tree block is 1, it won't increase 7508 * again. once full backref flag is set, we never clear it. 
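 *
 * because of that, the lookup below is only needed while the cached
 * value could still change: refs != 1 in the DROP_REFERENCE stage, or
 * the full backref flag not yet set in the UPDATE_BACKREF stage.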
 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, root,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
						  eb->len, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks the
 * reference count of the block pointed to. if the block is shared and
 * we need to update back refs for the subtree rooted at the block,
 * this function changes wc->stage to UPDATE_BACKREF. if the block is
 * shared and there is no need to update back refs, this function just
 * drops the reference to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
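 *
 * when we decide not to descend (the skip: label below), the reference
 * this block holds on the child is dropped (in the DROP_REFERENCE
 * stage) and 1 is returned so the caller advances to the next slot.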
7572 */ 7573 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 7574 struct btrfs_root *root, 7575 struct btrfs_path *path, 7576 struct walk_control *wc, int *lookup_info) 7577 { 7578 u64 bytenr; 7579 u64 generation; 7580 u64 parent; 7581 u32 blocksize; 7582 struct btrfs_key key; 7583 struct extent_buffer *next; 7584 int level = wc->level; 7585 int reada = 0; 7586 int ret = 0; 7587 7588 generation = btrfs_node_ptr_generation(path->nodes[level], 7589 path->slots[level]); 7590 /* 7591 * if the lower level block was created before the snapshot 7592 * was created, we know there is no need to update back refs 7593 * for the subtree 7594 */ 7595 if (wc->stage == UPDATE_BACKREF && 7596 generation <= root->root_key.offset) { 7597 *lookup_info = 1; 7598 return 1; 7599 } 7600 7601 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7602 blocksize = btrfs_level_size(root, level - 1); 7603 7604 next = btrfs_find_tree_block(root, bytenr, blocksize); 7605 if (!next) { 7606 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7607 if (!next) 7608 return -ENOMEM; 7609 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7610 level - 1); 7611 reada = 1; 7612 } 7613 btrfs_tree_lock(next); 7614 btrfs_set_lock_blocking(next); 7615 7616 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7617 &wc->refs[level - 1], 7618 &wc->flags[level - 1]); 7619 if (ret < 0) { 7620 btrfs_tree_unlock(next); 7621 return ret; 7622 } 7623 7624 if (unlikely(wc->refs[level - 1] == 0)) { 7625 btrfs_err(root->fs_info, "Missing references."); 7626 BUG(); 7627 } 7628 *lookup_info = 0; 7629 7630 if (wc->stage == DROP_REFERENCE) { 7631 if (wc->refs[level - 1] > 1) { 7632 if (level == 1 && 7633 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7634 goto skip; 7635 7636 if (!wc->update_ref || 7637 generation <= root->root_key.offset) 7638 goto skip; 7639 7640 btrfs_node_key_to_cpu(path->nodes[level], &key, 7641 path->slots[level]); 7642 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7643 if (ret < 0) 7644 goto skip; 7645 7646 wc->stage = UPDATE_BACKREF; 7647 wc->shared_level = level - 1; 7648 } 7649 } else { 7650 if (level == 1 && 7651 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7652 goto skip; 7653 } 7654 7655 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7656 btrfs_tree_unlock(next); 7657 free_extent_buffer(next); 7658 next = NULL; 7659 *lookup_info = 1; 7660 } 7661 7662 if (!next) { 7663 if (reada && level == 1) 7664 reada_walk_down(trans, root, wc, path); 7665 next = read_tree_block(root, bytenr, blocksize, generation); 7666 if (!next || !extent_buffer_uptodate(next)) { 7667 free_extent_buffer(next); 7668 return -EIO; 7669 } 7670 btrfs_tree_lock(next); 7671 btrfs_set_lock_blocking(next); 7672 } 7673 7674 level--; 7675 BUG_ON(level != btrfs_header_level(next)); 7676 path->nodes[level] = next; 7677 path->slots[level] = 0; 7678 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7679 wc->level = level; 7680 if (wc->level == 1) 7681 wc->reada_slot = 0; 7682 return 0; 7683 skip: 7684 wc->refs[level - 1] = 0; 7685 wc->flags[level - 1] = 0; 7686 if (wc->stage == DROP_REFERENCE) { 7687 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7688 parent = path->nodes[level]->start; 7689 } else { 7690 BUG_ON(root->root_key.objectid != 7691 btrfs_header_owner(path->nodes[level])); 7692 parent = 0; 7693 } 7694 7695 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7696 root->root_key.objectid, level - 1, 0, 0); 7697 BUG_ON(ret); /* -ENOMEM */ 7698 } 7699 
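	/*
	 * done with this child block: unlock it, drop the extent buffer
	 * reference we took above and tell the caller to move on to the
	 * next slot
	 */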
btrfs_tree_unlock(next); 7700 free_extent_buffer(next); 7701 *lookup_info = 1; 7702 return 1; 7703 } 7704 7705 /* 7706 * helper to process tree block while walking up the tree. 7707 * 7708 * when wc->stage == DROP_REFERENCE, this function drops 7709 * reference count on the block. 7710 * 7711 * when wc->stage == UPDATE_BACKREF, this function changes 7712 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7713 * to UPDATE_BACKREF previously while processing the block. 7714 * 7715 * NOTE: return value 1 means we should stop walking up. 7716 */ 7717 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7718 struct btrfs_root *root, 7719 struct btrfs_path *path, 7720 struct walk_control *wc) 7721 { 7722 int ret; 7723 int level = wc->level; 7724 struct extent_buffer *eb = path->nodes[level]; 7725 u64 parent = 0; 7726 7727 if (wc->stage == UPDATE_BACKREF) { 7728 BUG_ON(wc->shared_level < level); 7729 if (level < wc->shared_level) 7730 goto out; 7731 7732 ret = find_next_key(path, level + 1, &wc->update_progress); 7733 if (ret > 0) 7734 wc->update_ref = 0; 7735 7736 wc->stage = DROP_REFERENCE; 7737 wc->shared_level = -1; 7738 path->slots[level] = 0; 7739 7740 /* 7741 * check reference count again if the block isn't locked. 7742 * we should start walking down the tree again if reference 7743 * count is one. 7744 */ 7745 if (!path->locks[level]) { 7746 BUG_ON(level == 0); 7747 btrfs_tree_lock(eb); 7748 btrfs_set_lock_blocking(eb); 7749 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7750 7751 ret = btrfs_lookup_extent_info(trans, root, 7752 eb->start, level, 1, 7753 &wc->refs[level], 7754 &wc->flags[level]); 7755 if (ret < 0) { 7756 btrfs_tree_unlock_rw(eb, path->locks[level]); 7757 path->locks[level] = 0; 7758 return ret; 7759 } 7760 BUG_ON(wc->refs[level] == 0); 7761 if (wc->refs[level] == 1) { 7762 btrfs_tree_unlock_rw(eb, path->locks[level]); 7763 path->locks[level] = 0; 7764 return 1; 7765 } 7766 } 7767 } 7768 7769 /* wc->stage == DROP_REFERENCE */ 7770 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7771 7772 if (wc->refs[level] == 1) { 7773 if (level == 0) { 7774 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7775 ret = btrfs_dec_ref(trans, root, eb, 1, 7776 wc->for_reloc); 7777 else 7778 ret = btrfs_dec_ref(trans, root, eb, 0, 7779 wc->for_reloc); 7780 BUG_ON(ret); /* -ENOMEM */ 7781 } 7782 /* make block locked assertion in clean_tree_block happy */ 7783 if (!path->locks[level] && 7784 btrfs_header_generation(eb) == trans->transid) { 7785 btrfs_tree_lock(eb); 7786 btrfs_set_lock_blocking(eb); 7787 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7788 } 7789 clean_tree_block(trans, root, eb); 7790 } 7791 7792 if (eb == root->node) { 7793 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7794 parent = eb->start; 7795 else 7796 BUG_ON(root->root_key.objectid != 7797 btrfs_header_owner(eb)); 7798 } else { 7799 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7800 parent = path->nodes[level + 1]->start; 7801 else 7802 BUG_ON(root->root_key.objectid != 7803 btrfs_header_owner(path->nodes[level + 1])); 7804 } 7805 7806 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 7807 out: 7808 wc->refs[level] = 0; 7809 wc->flags[level] = 0; 7810 return 0; 7811 } 7812 7813 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 7814 struct btrfs_root *root, 7815 struct btrfs_path *path, 7816 struct walk_control *wc) 7817 { 7818 int level = wc->level; 7819 int lookup_info = 1; 7820 int ret; 7821 7822 while (level >= 0) { 7823 ret = 
walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root,
			 struct btrfs_block_rsv *block_rsv, int update_ref,
			 int for_reloc)
{
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	if (block_rsv)
		trans->block_rsv = block_rsv;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		btrfs_set_lock_blocking(path->nodes[level]);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because
only this 7954 * function is allowed to delete this snapshot 7955 */ 7956 btrfs_unlock_up_safe(path, 0); 7957 7958 level = btrfs_header_level(root->node); 7959 while (1) { 7960 btrfs_tree_lock(path->nodes[level]); 7961 btrfs_set_lock_blocking(path->nodes[level]); 7962 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7963 7964 ret = btrfs_lookup_extent_info(trans, root, 7965 path->nodes[level]->start, 7966 level, 1, &wc->refs[level], 7967 &wc->flags[level]); 7968 if (ret < 0) { 7969 err = ret; 7970 goto out_end_trans; 7971 } 7972 BUG_ON(wc->refs[level] == 0); 7973 7974 if (level == root_item->drop_level) 7975 break; 7976 7977 btrfs_tree_unlock(path->nodes[level]); 7978 path->locks[level] = 0; 7979 WARN_ON(wc->refs[level] != 1); 7980 level--; 7981 } 7982 } 7983 7984 wc->level = level; 7985 wc->shared_level = -1; 7986 wc->stage = DROP_REFERENCE; 7987 wc->update_ref = update_ref; 7988 wc->keep_locks = 0; 7989 wc->for_reloc = for_reloc; 7990 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7991 7992 while (1) { 7993 7994 ret = walk_down_tree(trans, root, path, wc); 7995 if (ret < 0) { 7996 err = ret; 7997 break; 7998 } 7999 8000 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 8001 if (ret < 0) { 8002 err = ret; 8003 break; 8004 } 8005 8006 if (ret > 0) { 8007 BUG_ON(wc->stage != DROP_REFERENCE); 8008 break; 8009 } 8010 8011 if (wc->stage == DROP_REFERENCE) { 8012 level = wc->level; 8013 btrfs_node_key(path->nodes[level], 8014 &root_item->drop_progress, 8015 path->slots[level]); 8016 root_item->drop_level = level; 8017 } 8018 8019 BUG_ON(wc->level == 0); 8020 if (btrfs_should_end_transaction(trans, tree_root) || 8021 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 8022 ret = btrfs_update_root(trans, tree_root, 8023 &root->root_key, 8024 root_item); 8025 if (ret) { 8026 btrfs_abort_transaction(trans, tree_root, ret); 8027 err = ret; 8028 goto out_end_trans; 8029 } 8030 8031 btrfs_end_transaction_throttle(trans, tree_root); 8032 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8033 pr_debug("BTRFS: drop snapshot early exit\n"); 8034 err = -EAGAIN; 8035 goto out_free; 8036 } 8037 8038 trans = btrfs_start_transaction(tree_root, 0); 8039 if (IS_ERR(trans)) { 8040 err = PTR_ERR(trans); 8041 goto out_free; 8042 } 8043 if (block_rsv) 8044 trans->block_rsv = block_rsv; 8045 } 8046 } 8047 btrfs_release_path(path); 8048 if (err) 8049 goto out_end_trans; 8050 8051 ret = btrfs_del_root(trans, tree_root, &root->root_key); 8052 if (ret) { 8053 btrfs_abort_transaction(trans, tree_root, ret); 8054 goto out_end_trans; 8055 } 8056 8057 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 8058 ret = btrfs_find_root(tree_root, &root->root_key, path, 8059 NULL, NULL); 8060 if (ret < 0) { 8061 btrfs_abort_transaction(trans, tree_root, ret); 8062 err = ret; 8063 goto out_end_trans; 8064 } else if (ret > 0) { 8065 /* if we fail to delete the orphan item this time 8066 * around, it'll get picked up the next time. 8067 * 8068 * The most common failure here is just -ENOENT. 
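 *
 * that is also why the return value of btrfs_del_orphan_item() below
 * is intentionally ignored.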
8069 */ 8070 btrfs_del_orphan_item(trans, tree_root, 8071 root->root_key.objectid); 8072 } 8073 } 8074 8075 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 8076 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 8077 } else { 8078 free_extent_buffer(root->node); 8079 free_extent_buffer(root->commit_root); 8080 btrfs_put_fs_root(root); 8081 } 8082 root_dropped = true; 8083 out_end_trans: 8084 btrfs_end_transaction_throttle(trans, tree_root); 8085 out_free: 8086 kfree(wc); 8087 btrfs_free_path(path); 8088 out: 8089 /* 8090 * So if we need to stop dropping the snapshot for whatever reason we 8091 * need to make sure to add it back to the dead root list so that we 8092 * keep trying to do the work later. This also cleans up roots if we 8093 * don't have it in the radix (like when we recover after a power fail 8094 * or unmount) so we don't leak memory. 8095 */ 8096 if (!for_reloc && root_dropped == false) 8097 btrfs_add_dead_root(root); 8098 if (err && err != -EAGAIN) 8099 btrfs_std_error(root->fs_info, err); 8100 return err; 8101 } 8102 8103 /* 8104 * drop subtree rooted at tree block 'node'. 8105 * 8106 * NOTE: this function will unlock and release tree block 'node' 8107 * only used by relocation code 8108 */ 8109 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 8110 struct btrfs_root *root, 8111 struct extent_buffer *node, 8112 struct extent_buffer *parent) 8113 { 8114 struct btrfs_path *path; 8115 struct walk_control *wc; 8116 int level; 8117 int parent_level; 8118 int ret = 0; 8119 int wret; 8120 8121 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 8122 8123 path = btrfs_alloc_path(); 8124 if (!path) 8125 return -ENOMEM; 8126 8127 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8128 if (!wc) { 8129 btrfs_free_path(path); 8130 return -ENOMEM; 8131 } 8132 8133 btrfs_assert_tree_locked(parent); 8134 parent_level = btrfs_header_level(parent); 8135 extent_buffer_get(parent); 8136 path->nodes[parent_level] = parent; 8137 path->slots[parent_level] = btrfs_header_nritems(parent); 8138 8139 btrfs_assert_tree_locked(node); 8140 level = btrfs_header_level(node); 8141 path->nodes[level] = node; 8142 path->slots[level] = 0; 8143 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8144 8145 wc->refs[parent_level] = 1; 8146 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8147 wc->level = level; 8148 wc->shared_level = -1; 8149 wc->stage = DROP_REFERENCE; 8150 wc->update_ref = 0; 8151 wc->keep_locks = 1; 8152 wc->for_reloc = 1; 8153 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8154 8155 while (1) { 8156 wret = walk_down_tree(trans, root, path, wc); 8157 if (wret < 0) { 8158 ret = wret; 8159 break; 8160 } 8161 8162 wret = walk_up_tree(trans, root, path, wc, parent_level); 8163 if (wret < 0) 8164 ret = wret; 8165 if (wret != 0) 8166 break; 8167 } 8168 8169 kfree(wc); 8170 btrfs_free_path(path); 8171 return ret; 8172 } 8173 8174 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 8175 { 8176 u64 num_devices; 8177 u64 stripped; 8178 8179 /* 8180 * if restripe for this chunk_type is on pick target profile and 8181 * return, otherwise do the usual balance 8182 */ 8183 stripped = get_restripe_target(root->fs_info, flags); 8184 if (stripped) 8185 return extended_to_chunk(stripped); 8186 8187 /* 8188 * we add in the count of missing devices because we want 8189 * to make sure that any RAID levels on a degraded FS 8190 * continue to be honored. 
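 *
 * with a (rw + missing) device count of one, the profiles are
 * downgraded below: raid0 becomes single and raid1/raid10 become
 * dup. with more devices, dup is switched to raid1 and existing
 * raid profiles are left untouched.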
8191 */ 8192 num_devices = root->fs_info->fs_devices->rw_devices + 8193 root->fs_info->fs_devices->missing_devices; 8194 8195 stripped = BTRFS_BLOCK_GROUP_RAID0 | 8196 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 8197 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 8198 8199 if (num_devices == 1) { 8200 stripped |= BTRFS_BLOCK_GROUP_DUP; 8201 stripped = flags & ~stripped; 8202 8203 /* turn raid0 into single device chunks */ 8204 if (flags & BTRFS_BLOCK_GROUP_RAID0) 8205 return stripped; 8206 8207 /* turn mirroring into duplication */ 8208 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 8209 BTRFS_BLOCK_GROUP_RAID10)) 8210 return stripped | BTRFS_BLOCK_GROUP_DUP; 8211 } else { 8212 /* they already had raid on here, just return */ 8213 if (flags & stripped) 8214 return flags; 8215 8216 stripped |= BTRFS_BLOCK_GROUP_DUP; 8217 stripped = flags & ~stripped; 8218 8219 /* switch duplicated blocks with raid1 */ 8220 if (flags & BTRFS_BLOCK_GROUP_DUP) 8221 return stripped | BTRFS_BLOCK_GROUP_RAID1; 8222 8223 /* this is drive concat, leave it alone */ 8224 } 8225 8226 return flags; 8227 } 8228 8229 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 8230 { 8231 struct btrfs_space_info *sinfo = cache->space_info; 8232 u64 num_bytes; 8233 u64 min_allocable_bytes; 8234 int ret = -ENOSPC; 8235 8236 8237 /* 8238 * We need some metadata space and system metadata space for 8239 * allocating chunks in some corner cases until we force to set 8240 * it to be readonly. 8241 */ 8242 if ((sinfo->flags & 8243 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 8244 !force) 8245 min_allocable_bytes = 1 * 1024 * 1024; 8246 else 8247 min_allocable_bytes = 0; 8248 8249 spin_lock(&sinfo->lock); 8250 spin_lock(&cache->lock); 8251 8252 if (cache->ro) { 8253 ret = 0; 8254 goto out; 8255 } 8256 8257 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8258 cache->bytes_super - btrfs_block_group_used(&cache->item); 8259 8260 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 8261 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 8262 min_allocable_bytes <= sinfo->total_bytes) { 8263 sinfo->bytes_readonly += num_bytes; 8264 cache->ro = 1; 8265 ret = 0; 8266 } 8267 out: 8268 spin_unlock(&cache->lock); 8269 spin_unlock(&sinfo->lock); 8270 return ret; 8271 } 8272 8273 int btrfs_set_block_group_ro(struct btrfs_root *root, 8274 struct btrfs_block_group_cache *cache) 8275 8276 { 8277 struct btrfs_trans_handle *trans; 8278 u64 alloc_flags; 8279 int ret; 8280 8281 BUG_ON(cache->ro); 8282 8283 trans = btrfs_join_transaction(root); 8284 if (IS_ERR(trans)) 8285 return PTR_ERR(trans); 8286 8287 alloc_flags = update_block_group_flags(root, cache->flags); 8288 if (alloc_flags != cache->flags) { 8289 ret = do_chunk_alloc(trans, root, alloc_flags, 8290 CHUNK_ALLOC_FORCE); 8291 if (ret < 0) 8292 goto out; 8293 } 8294 8295 ret = set_block_group_ro(cache, 0); 8296 if (!ret) 8297 goto out; 8298 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8299 ret = do_chunk_alloc(trans, root, alloc_flags, 8300 CHUNK_ALLOC_FORCE); 8301 if (ret < 0) 8302 goto out; 8303 ret = set_block_group_ro(cache, 0); 8304 out: 8305 btrfs_end_transaction(trans, root); 8306 return ret; 8307 } 8308 8309 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 8310 struct btrfs_root *root, u64 type) 8311 { 8312 u64 alloc_flags = get_alloc_profile(root, type); 8313 return do_chunk_alloc(trans, root, alloc_flags, 8314 CHUNK_ALLOC_FORCE); 8315 } 8316 8317 /* 8318 * helper to 
account the unused space of all the readonly block group in the 8319 * list. takes mirrors into account. 8320 */ 8321 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 8322 { 8323 struct btrfs_block_group_cache *block_group; 8324 u64 free_bytes = 0; 8325 int factor; 8326 8327 list_for_each_entry(block_group, groups_list, list) { 8328 spin_lock(&block_group->lock); 8329 8330 if (!block_group->ro) { 8331 spin_unlock(&block_group->lock); 8332 continue; 8333 } 8334 8335 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 8336 BTRFS_BLOCK_GROUP_RAID10 | 8337 BTRFS_BLOCK_GROUP_DUP)) 8338 factor = 2; 8339 else 8340 factor = 1; 8341 8342 free_bytes += (block_group->key.offset - 8343 btrfs_block_group_used(&block_group->item)) * 8344 factor; 8345 8346 spin_unlock(&block_group->lock); 8347 } 8348 8349 return free_bytes; 8350 } 8351 8352 /* 8353 * helper to account the unused space of all the readonly block group in the 8354 * space_info. takes mirrors into account. 8355 */ 8356 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 8357 { 8358 int i; 8359 u64 free_bytes = 0; 8360 8361 spin_lock(&sinfo->lock); 8362 8363 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 8364 if (!list_empty(&sinfo->block_groups[i])) 8365 free_bytes += __btrfs_get_ro_block_group_free_space( 8366 &sinfo->block_groups[i]); 8367 8368 spin_unlock(&sinfo->lock); 8369 8370 return free_bytes; 8371 } 8372 8373 void btrfs_set_block_group_rw(struct btrfs_root *root, 8374 struct btrfs_block_group_cache *cache) 8375 { 8376 struct btrfs_space_info *sinfo = cache->space_info; 8377 u64 num_bytes; 8378 8379 BUG_ON(!cache->ro); 8380 8381 spin_lock(&sinfo->lock); 8382 spin_lock(&cache->lock); 8383 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8384 cache->bytes_super - btrfs_block_group_used(&cache->item); 8385 sinfo->bytes_readonly -= num_bytes; 8386 cache->ro = 0; 8387 spin_unlock(&cache->lock); 8388 spin_unlock(&sinfo->lock); 8389 } 8390 8391 /* 8392 * checks to see if its even possible to relocate this block group. 8393 * 8394 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 8395 * ok to go ahead and try. 8396 */ 8397 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 8398 { 8399 struct btrfs_block_group_cache *block_group; 8400 struct btrfs_space_info *space_info; 8401 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 8402 struct btrfs_device *device; 8403 struct btrfs_trans_handle *trans; 8404 u64 min_free; 8405 u64 dev_min = 1; 8406 u64 dev_nr = 0; 8407 u64 target; 8408 int index; 8409 int full = 0; 8410 int ret = 0; 8411 8412 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 8413 8414 /* odd, couldn't find the block group, leave it alone */ 8415 if (!block_group) 8416 return -1; 8417 8418 min_free = btrfs_block_group_used(&block_group->item); 8419 8420 /* no bytes used, we're good */ 8421 if (!min_free) 8422 goto out; 8423 8424 space_info = block_group->space_info; 8425 spin_lock(&space_info->lock); 8426 8427 full = space_info->full; 8428 8429 /* 8430 * if this is the last block group we have in this space, we can't 8431 * relocate it unless we're able to allocate a new chunk below. 8432 * 8433 * Otherwise, we need to make sure we have room in the space to handle 8434 * all of the extents from this block group. 
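 * (i.e. used + reserved + pinned + readonly plus this block group's
 * used bytes must still fit within total_bytes).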
If we can, we're good 8435 */ 8436 if ((space_info->total_bytes != block_group->key.offset) && 8437 (space_info->bytes_used + space_info->bytes_reserved + 8438 space_info->bytes_pinned + space_info->bytes_readonly + 8439 min_free < space_info->total_bytes)) { 8440 spin_unlock(&space_info->lock); 8441 goto out; 8442 } 8443 spin_unlock(&space_info->lock); 8444 8445 /* 8446 * ok we don't have enough space, but maybe we have free space on our 8447 * devices to allocate new chunks for relocation, so loop through our 8448 * alloc devices and guess if we have enough space. if this block 8449 * group is going to be restriped, run checks against the target 8450 * profile instead of the current one. 8451 */ 8452 ret = -1; 8453 8454 /* 8455 * index: 8456 * 0: raid10 8457 * 1: raid1 8458 * 2: dup 8459 * 3: raid0 8460 * 4: single 8461 */ 8462 target = get_restripe_target(root->fs_info, block_group->flags); 8463 if (target) { 8464 index = __get_raid_index(extended_to_chunk(target)); 8465 } else { 8466 /* 8467 * this is just a balance, so if we were marked as full 8468 * we know there is no space for a new chunk 8469 */ 8470 if (full) 8471 goto out; 8472 8473 index = get_block_group_index(block_group); 8474 } 8475 8476 if (index == BTRFS_RAID_RAID10) { 8477 dev_min = 4; 8478 /* Divide by 2 */ 8479 min_free >>= 1; 8480 } else if (index == BTRFS_RAID_RAID1) { 8481 dev_min = 2; 8482 } else if (index == BTRFS_RAID_DUP) { 8483 /* Multiply by 2 */ 8484 min_free <<= 1; 8485 } else if (index == BTRFS_RAID_RAID0) { 8486 dev_min = fs_devices->rw_devices; 8487 do_div(min_free, dev_min); 8488 } 8489 8490 /* We need to do this so that we can look at pending chunks */ 8491 trans = btrfs_join_transaction(root); 8492 if (IS_ERR(trans)) { 8493 ret = PTR_ERR(trans); 8494 goto out; 8495 } 8496 8497 mutex_lock(&root->fs_info->chunk_mutex); 8498 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8499 u64 dev_offset; 8500 8501 /* 8502 * check to make sure we can actually find a chunk with enough 8503 * space to fit our block group in. 
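 *
 * min_free was already scaled for the raid profile above, and we
 * need dev_min devices that can each provide a free extent of at
 * least that size.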
8504 */ 8505 if (device->total_bytes > device->bytes_used + min_free && 8506 !device->is_tgtdev_for_dev_replace) { 8507 ret = find_free_dev_extent(trans, device, min_free, 8508 &dev_offset, NULL); 8509 if (!ret) 8510 dev_nr++; 8511 8512 if (dev_nr >= dev_min) 8513 break; 8514 8515 ret = -1; 8516 } 8517 } 8518 mutex_unlock(&root->fs_info->chunk_mutex); 8519 btrfs_end_transaction(trans, root); 8520 out: 8521 btrfs_put_block_group(block_group); 8522 return ret; 8523 } 8524 8525 static int find_first_block_group(struct btrfs_root *root, 8526 struct btrfs_path *path, struct btrfs_key *key) 8527 { 8528 int ret = 0; 8529 struct btrfs_key found_key; 8530 struct extent_buffer *leaf; 8531 int slot; 8532 8533 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 8534 if (ret < 0) 8535 goto out; 8536 8537 while (1) { 8538 slot = path->slots[0]; 8539 leaf = path->nodes[0]; 8540 if (slot >= btrfs_header_nritems(leaf)) { 8541 ret = btrfs_next_leaf(root, path); 8542 if (ret == 0) 8543 continue; 8544 if (ret < 0) 8545 goto out; 8546 break; 8547 } 8548 btrfs_item_key_to_cpu(leaf, &found_key, slot); 8549 8550 if (found_key.objectid >= key->objectid && 8551 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 8552 ret = 0; 8553 goto out; 8554 } 8555 path->slots[0]++; 8556 } 8557 out: 8558 return ret; 8559 } 8560 8561 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 8562 { 8563 struct btrfs_block_group_cache *block_group; 8564 u64 last = 0; 8565 8566 while (1) { 8567 struct inode *inode; 8568 8569 block_group = btrfs_lookup_first_block_group(info, last); 8570 while (block_group) { 8571 spin_lock(&block_group->lock); 8572 if (block_group->iref) 8573 break; 8574 spin_unlock(&block_group->lock); 8575 block_group = next_block_group(info->tree_root, 8576 block_group); 8577 } 8578 if (!block_group) { 8579 if (last == 0) 8580 break; 8581 last = 0; 8582 continue; 8583 } 8584 8585 inode = block_group->inode; 8586 block_group->iref = 0; 8587 block_group->inode = NULL; 8588 spin_unlock(&block_group->lock); 8589 iput(inode); 8590 last = block_group->key.objectid + block_group->key.offset; 8591 btrfs_put_block_group(block_group); 8592 } 8593 } 8594 8595 int btrfs_free_block_groups(struct btrfs_fs_info *info) 8596 { 8597 struct btrfs_block_group_cache *block_group; 8598 struct btrfs_space_info *space_info; 8599 struct btrfs_caching_control *caching_ctl; 8600 struct rb_node *n; 8601 8602 down_write(&info->commit_root_sem); 8603 while (!list_empty(&info->caching_block_groups)) { 8604 caching_ctl = list_entry(info->caching_block_groups.next, 8605 struct btrfs_caching_control, list); 8606 list_del(&caching_ctl->list); 8607 put_caching_control(caching_ctl); 8608 } 8609 up_write(&info->commit_root_sem); 8610 8611 spin_lock(&info->block_group_cache_lock); 8612 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8613 block_group = rb_entry(n, struct btrfs_block_group_cache, 8614 cache_node); 8615 rb_erase(&block_group->cache_node, 8616 &info->block_group_cache_tree); 8617 spin_unlock(&info->block_group_cache_lock); 8618 8619 down_write(&block_group->space_info->groups_sem); 8620 list_del(&block_group->list); 8621 up_write(&block_group->space_info->groups_sem); 8622 8623 if (block_group->cached == BTRFS_CACHE_STARTED) 8624 wait_block_group_cache_done(block_group); 8625 8626 /* 8627 * We haven't cached this block group, which means we could 8628 * possibly have excluded extents on this block group. 
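 * drop those excluded ranges here so they are not leaked at unmount.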
8629 */ 8630 if (block_group->cached == BTRFS_CACHE_NO || 8631 block_group->cached == BTRFS_CACHE_ERROR) 8632 free_excluded_extents(info->extent_root, block_group); 8633 8634 btrfs_remove_free_space_cache(block_group); 8635 btrfs_put_block_group(block_group); 8636 8637 spin_lock(&info->block_group_cache_lock); 8638 } 8639 spin_unlock(&info->block_group_cache_lock); 8640 8641 /* now that all the block groups are freed, go through and 8642 * free all the space_info structs. This is only called during 8643 * the final stages of unmount, and so we know nobody is 8644 * using them. We call synchronize_rcu() once before we start, 8645 * just to be on the safe side. 8646 */ 8647 synchronize_rcu(); 8648 8649 release_global_block_rsv(info); 8650 8651 while (!list_empty(&info->space_info)) { 8652 int i; 8653 8654 space_info = list_entry(info->space_info.next, 8655 struct btrfs_space_info, 8656 list); 8657 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { 8658 if (WARN_ON(space_info->bytes_pinned > 0 || 8659 space_info->bytes_reserved > 0 || 8660 space_info->bytes_may_use > 0)) { 8661 dump_space_info(space_info, 0, 0); 8662 } 8663 } 8664 list_del(&space_info->list); 8665 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 8666 struct kobject *kobj; 8667 kobj = space_info->block_group_kobjs[i]; 8668 space_info->block_group_kobjs[i] = NULL; 8669 if (kobj) { 8670 kobject_del(kobj); 8671 kobject_put(kobj); 8672 } 8673 } 8674 kobject_del(&space_info->kobj); 8675 kobject_put(&space_info->kobj); 8676 } 8677 return 0; 8678 } 8679 8680 static void __link_block_group(struct btrfs_space_info *space_info, 8681 struct btrfs_block_group_cache *cache) 8682 { 8683 int index = get_block_group_index(cache); 8684 bool first = false; 8685 8686 down_write(&space_info->groups_sem); 8687 if (list_empty(&space_info->block_groups[index])) 8688 first = true; 8689 list_add_tail(&cache->list, &space_info->block_groups[index]); 8690 up_write(&space_info->groups_sem); 8691 8692 if (first) { 8693 struct raid_kobject *rkobj; 8694 int ret; 8695 8696 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 8697 if (!rkobj) 8698 goto out_err; 8699 rkobj->raid_type = index; 8700 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 8701 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 8702 "%s", get_raid_name(index)); 8703 if (ret) { 8704 kobject_put(&rkobj->kobj); 8705 goto out_err; 8706 } 8707 space_info->block_group_kobjs[index] = &rkobj->kobj; 8708 } 8709 8710 return; 8711 out_err: 8712 pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n"); 8713 } 8714 8715 static struct btrfs_block_group_cache * 8716 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) 8717 { 8718 struct btrfs_block_group_cache *cache; 8719 8720 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8721 if (!cache) 8722 return NULL; 8723 8724 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 8725 GFP_NOFS); 8726 if (!cache->free_space_ctl) { 8727 kfree(cache); 8728 return NULL; 8729 } 8730 8731 cache->key.objectid = start; 8732 cache->key.offset = size; 8733 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8734 8735 cache->sectorsize = root->sectorsize; 8736 cache->fs_info = root->fs_info; 8737 cache->full_stripe_len = btrfs_full_stripe_len(root, 8738 &root->fs_info->mapping_tree, 8739 start); 8740 atomic_set(&cache->count, 1); 8741 spin_lock_init(&cache->lock); 8742 init_rwsem(&cache->data_rwsem); 8743 INIT_LIST_HEAD(&cache->list); 8744 INIT_LIST_HEAD(&cache->cluster_list); 8745 INIT_LIST_HEAD(&cache->new_bg_list); 8746 btrfs_init_free_space_ctl(cache); 8747 8748 return cache; 8749 } 8750 8751 int btrfs_read_block_groups(struct btrfs_root *root) 8752 { 8753 struct btrfs_path *path; 8754 int ret; 8755 struct btrfs_block_group_cache *cache; 8756 struct btrfs_fs_info *info = root->fs_info; 8757 struct btrfs_space_info *space_info; 8758 struct btrfs_key key; 8759 struct btrfs_key found_key; 8760 struct extent_buffer *leaf; 8761 int need_clear = 0; 8762 u64 cache_gen; 8763 8764 root = info->extent_root; 8765 key.objectid = 0; 8766 key.offset = 0; 8767 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 8768 path = btrfs_alloc_path(); 8769 if (!path) 8770 return -ENOMEM; 8771 path->reada = 1; 8772 8773 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 8774 if (btrfs_test_opt(root, SPACE_CACHE) && 8775 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 8776 need_clear = 1; 8777 if (btrfs_test_opt(root, CLEAR_CACHE)) 8778 need_clear = 1; 8779 8780 while (1) { 8781 ret = find_first_block_group(root, path, &key); 8782 if (ret > 0) 8783 break; 8784 if (ret != 0) 8785 goto error; 8786 8787 leaf = path->nodes[0]; 8788 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8789 8790 cache = btrfs_create_block_group_cache(root, found_key.objectid, 8791 found_key.offset); 8792 if (!cache) { 8793 ret = -ENOMEM; 8794 goto error; 8795 } 8796 8797 if (need_clear) { 8798 /* 8799 * When we mount with old space cache, we need to 8800 * set BTRFS_DC_CLEAR and set dirty flag. 8801 * 8802 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 8803 * truncate the old free space cache inode and 8804 * setup a new one. 8805 * b) Setting 'dirty flag' makes sure that we flush 8806 * the new space cache info onto disk. 8807 */ 8808 cache->disk_cache_state = BTRFS_DC_CLEAR; 8809 if (btrfs_test_opt(root, SPACE_CACHE)) 8810 cache->dirty = 1; 8811 } 8812 8813 read_extent_buffer(leaf, &cache->item, 8814 btrfs_item_ptr_offset(leaf, path->slots[0]), 8815 sizeof(cache->item)); 8816 cache->flags = btrfs_block_group_flags(&cache->item); 8817 8818 key.objectid = found_key.objectid + found_key.offset; 8819 btrfs_release_path(path); 8820 8821 /* 8822 * We need to exclude the super stripes now so that the space 8823 * info has super bytes accounted for, otherwise we'll think 8824 * we have more space than we actually do. 8825 */ 8826 ret = exclude_super_stripes(root, cache); 8827 if (ret) { 8828 /* 8829 * We may have excluded something, so call this just in 8830 * case. 
8831 */ 8832 free_excluded_extents(root, cache); 8833 btrfs_put_block_group(cache); 8834 goto error; 8835 } 8836 8837 /* 8838 * check for two cases, either we are full, and therefore 8839 * don't need to bother with the caching work since we won't 8840 * find any space, or we are empty, and we can just add all 8841 * the space in and be done with it. This saves us _alot_ of 8842 * time, particularly in the full case. 8843 */ 8844 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8845 cache->last_byte_to_unpin = (u64)-1; 8846 cache->cached = BTRFS_CACHE_FINISHED; 8847 free_excluded_extents(root, cache); 8848 } else if (btrfs_block_group_used(&cache->item) == 0) { 8849 cache->last_byte_to_unpin = (u64)-1; 8850 cache->cached = BTRFS_CACHE_FINISHED; 8851 add_new_free_space(cache, root->fs_info, 8852 found_key.objectid, 8853 found_key.objectid + 8854 found_key.offset); 8855 free_excluded_extents(root, cache); 8856 } 8857 8858 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8859 if (ret) { 8860 btrfs_remove_free_space_cache(cache); 8861 btrfs_put_block_group(cache); 8862 goto error; 8863 } 8864 8865 ret = update_space_info(info, cache->flags, found_key.offset, 8866 btrfs_block_group_used(&cache->item), 8867 &space_info); 8868 if (ret) { 8869 btrfs_remove_free_space_cache(cache); 8870 spin_lock(&info->block_group_cache_lock); 8871 rb_erase(&cache->cache_node, 8872 &info->block_group_cache_tree); 8873 spin_unlock(&info->block_group_cache_lock); 8874 btrfs_put_block_group(cache); 8875 goto error; 8876 } 8877 8878 cache->space_info = space_info; 8879 spin_lock(&cache->space_info->lock); 8880 cache->space_info->bytes_readonly += cache->bytes_super; 8881 spin_unlock(&cache->space_info->lock); 8882 8883 __link_block_group(space_info, cache); 8884 8885 set_avail_alloc_bits(root->fs_info, cache->flags); 8886 if (btrfs_chunk_readonly(root, cache->key.objectid)) 8887 set_block_group_ro(cache, 1); 8888 } 8889 8890 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 8891 if (!(get_alloc_profile(root, space_info->flags) & 8892 (BTRFS_BLOCK_GROUP_RAID10 | 8893 BTRFS_BLOCK_GROUP_RAID1 | 8894 BTRFS_BLOCK_GROUP_RAID5 | 8895 BTRFS_BLOCK_GROUP_RAID6 | 8896 BTRFS_BLOCK_GROUP_DUP))) 8897 continue; 8898 /* 8899 * avoid allocating from un-mirrored block group if there are 8900 * mirrored block groups. 
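 * this is done by marking the raid0 and single block groups
 * read-only below.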
8901 */ 8902 list_for_each_entry(cache, 8903 &space_info->block_groups[BTRFS_RAID_RAID0], 8904 list) 8905 set_block_group_ro(cache, 1); 8906 list_for_each_entry(cache, 8907 &space_info->block_groups[BTRFS_RAID_SINGLE], 8908 list) 8909 set_block_group_ro(cache, 1); 8910 } 8911 8912 init_global_block_rsv(info); 8913 ret = 0; 8914 error: 8915 btrfs_free_path(path); 8916 return ret; 8917 } 8918 8919 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 8920 struct btrfs_root *root) 8921 { 8922 struct btrfs_block_group_cache *block_group, *tmp; 8923 struct btrfs_root *extent_root = root->fs_info->extent_root; 8924 struct btrfs_block_group_item item; 8925 struct btrfs_key key; 8926 int ret = 0; 8927 8928 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, 8929 new_bg_list) { 8930 list_del_init(&block_group->new_bg_list); 8931 8932 if (ret) 8933 continue; 8934 8935 spin_lock(&block_group->lock); 8936 memcpy(&item, &block_group->item, sizeof(item)); 8937 memcpy(&key, &block_group->key, sizeof(key)); 8938 spin_unlock(&block_group->lock); 8939 8940 ret = btrfs_insert_item(trans, extent_root, &key, &item, 8941 sizeof(item)); 8942 if (ret) 8943 btrfs_abort_transaction(trans, extent_root, ret); 8944 ret = btrfs_finish_chunk_alloc(trans, extent_root, 8945 key.objectid, key.offset); 8946 if (ret) 8947 btrfs_abort_transaction(trans, extent_root, ret); 8948 } 8949 } 8950 8951 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 8952 struct btrfs_root *root, u64 bytes_used, 8953 u64 type, u64 chunk_objectid, u64 chunk_offset, 8954 u64 size) 8955 { 8956 int ret; 8957 struct btrfs_root *extent_root; 8958 struct btrfs_block_group_cache *cache; 8959 8960 extent_root = root->fs_info->extent_root; 8961 8962 btrfs_set_log_full_commit(root->fs_info, trans); 8963 8964 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 8965 if (!cache) 8966 return -ENOMEM; 8967 8968 btrfs_set_block_group_used(&cache->item, bytes_used); 8969 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 8970 btrfs_set_block_group_flags(&cache->item, type); 8971 8972 cache->flags = type; 8973 cache->last_byte_to_unpin = (u64)-1; 8974 cache->cached = BTRFS_CACHE_FINISHED; 8975 ret = exclude_super_stripes(root, cache); 8976 if (ret) { 8977 /* 8978 * We may have excluded something, so call this just in 8979 * case. 
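 * exclude_super_stripes() may have excluded part of the range before
 * failing, so the cleanup still has to run.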
8980 */ 8981 free_excluded_extents(root, cache); 8982 btrfs_put_block_group(cache); 8983 return ret; 8984 } 8985 8986 add_new_free_space(cache, root->fs_info, chunk_offset, 8987 chunk_offset + size); 8988 8989 free_excluded_extents(root, cache); 8990 8991 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8992 if (ret) { 8993 btrfs_remove_free_space_cache(cache); 8994 btrfs_put_block_group(cache); 8995 return ret; 8996 } 8997 8998 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 8999 &cache->space_info); 9000 if (ret) { 9001 btrfs_remove_free_space_cache(cache); 9002 spin_lock(&root->fs_info->block_group_cache_lock); 9003 rb_erase(&cache->cache_node, 9004 &root->fs_info->block_group_cache_tree); 9005 spin_unlock(&root->fs_info->block_group_cache_lock); 9006 btrfs_put_block_group(cache); 9007 return ret; 9008 } 9009 update_global_block_rsv(root->fs_info); 9010 9011 spin_lock(&cache->space_info->lock); 9012 cache->space_info->bytes_readonly += cache->bytes_super; 9013 spin_unlock(&cache->space_info->lock); 9014 9015 __link_block_group(cache->space_info, cache); 9016 9017 list_add_tail(&cache->new_bg_list, &trans->new_bgs); 9018 9019 set_avail_alloc_bits(extent_root->fs_info, type); 9020 9021 return 0; 9022 } 9023 9024 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 9025 { 9026 u64 extra_flags = chunk_to_extended(flags) & 9027 BTRFS_EXTENDED_PROFILE_MASK; 9028 9029 write_seqlock(&fs_info->profiles_lock); 9030 if (flags & BTRFS_BLOCK_GROUP_DATA) 9031 fs_info->avail_data_alloc_bits &= ~extra_flags; 9032 if (flags & BTRFS_BLOCK_GROUP_METADATA) 9033 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 9034 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 9035 fs_info->avail_system_alloc_bits &= ~extra_flags; 9036 write_sequnlock(&fs_info->profiles_lock); 9037 } 9038 9039 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 9040 struct btrfs_root *root, u64 group_start) 9041 { 9042 struct btrfs_path *path; 9043 struct btrfs_block_group_cache *block_group; 9044 struct btrfs_free_cluster *cluster; 9045 struct btrfs_root *tree_root = root->fs_info->tree_root; 9046 struct btrfs_key key; 9047 struct inode *inode; 9048 struct kobject *kobj = NULL; 9049 int ret; 9050 int index; 9051 int factor; 9052 9053 root = root->fs_info->extent_root; 9054 9055 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 9056 BUG_ON(!block_group); 9057 BUG_ON(!block_group->ro); 9058 9059 /* 9060 * Free the reserved super bytes from this block group before 9061 * remove it. 
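 * (the super bytes were accounted by exclude_super_stripes() when
 * the block group was read in or created).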
9062 */ 9063 free_excluded_extents(root, block_group); 9064 9065 memcpy(&key, &block_group->key, sizeof(key)); 9066 index = get_block_group_index(block_group); 9067 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 9068 BTRFS_BLOCK_GROUP_RAID1 | 9069 BTRFS_BLOCK_GROUP_RAID10)) 9070 factor = 2; 9071 else 9072 factor = 1; 9073 9074 /* make sure this block group isn't part of an allocation cluster */ 9075 cluster = &root->fs_info->data_alloc_cluster; 9076 spin_lock(&cluster->refill_lock); 9077 btrfs_return_cluster_to_free_space(block_group, cluster); 9078 spin_unlock(&cluster->refill_lock); 9079 9080 /* 9081 * make sure this block group isn't part of a metadata 9082 * allocation cluster 9083 */ 9084 cluster = &root->fs_info->meta_alloc_cluster; 9085 spin_lock(&cluster->refill_lock); 9086 btrfs_return_cluster_to_free_space(block_group, cluster); 9087 spin_unlock(&cluster->refill_lock); 9088 9089 path = btrfs_alloc_path(); 9090 if (!path) { 9091 ret = -ENOMEM; 9092 goto out; 9093 } 9094 9095 inode = lookup_free_space_inode(tree_root, block_group, path); 9096 if (!IS_ERR(inode)) { 9097 ret = btrfs_orphan_add(trans, inode); 9098 if (ret) { 9099 btrfs_add_delayed_iput(inode); 9100 goto out; 9101 } 9102 clear_nlink(inode); 9103 /* One for the block groups ref */ 9104 spin_lock(&block_group->lock); 9105 if (block_group->iref) { 9106 block_group->iref = 0; 9107 block_group->inode = NULL; 9108 spin_unlock(&block_group->lock); 9109 iput(inode); 9110 } else { 9111 spin_unlock(&block_group->lock); 9112 } 9113 /* One for our lookup ref */ 9114 btrfs_add_delayed_iput(inode); 9115 } 9116 9117 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 9118 key.offset = block_group->key.objectid; 9119 key.type = 0; 9120 9121 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 9122 if (ret < 0) 9123 goto out; 9124 if (ret > 0) 9125 btrfs_release_path(path); 9126 if (ret == 0) { 9127 ret = btrfs_del_item(trans, tree_root, path); 9128 if (ret) 9129 goto out; 9130 btrfs_release_path(path); 9131 } 9132 9133 spin_lock(&root->fs_info->block_group_cache_lock); 9134 rb_erase(&block_group->cache_node, 9135 &root->fs_info->block_group_cache_tree); 9136 9137 if (root->fs_info->first_logical_byte == block_group->key.objectid) 9138 root->fs_info->first_logical_byte = (u64)-1; 9139 spin_unlock(&root->fs_info->block_group_cache_lock); 9140 9141 down_write(&block_group->space_info->groups_sem); 9142 /* 9143 * we must use list_del_init so people can check to see if they 9144 * are still on the list after taking the semaphore 9145 */ 9146 list_del_init(&block_group->list); 9147 if (list_empty(&block_group->space_info->block_groups[index])) { 9148 kobj = block_group->space_info->block_group_kobjs[index]; 9149 block_group->space_info->block_group_kobjs[index] = NULL; 9150 clear_avail_alloc_bits(root->fs_info, block_group->flags); 9151 } 9152 up_write(&block_group->space_info->groups_sem); 9153 if (kobj) { 9154 kobject_del(kobj); 9155 kobject_put(kobj); 9156 } 9157 9158 if (block_group->cached == BTRFS_CACHE_STARTED) 9159 wait_block_group_cache_done(block_group); 9160 9161 btrfs_remove_free_space_cache(block_group); 9162 9163 spin_lock(&block_group->space_info->lock); 9164 block_group->space_info->total_bytes -= block_group->key.offset; 9165 block_group->space_info->bytes_readonly -= block_group->key.offset; 9166 block_group->space_info->disk_total -= block_group->key.offset * factor; 9167 spin_unlock(&block_group->space_info->lock); 9168 9169 memcpy(&key, &block_group->key, sizeof(key)); 9170 9171 
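	/*
	 * 'key' was copied above because it is still needed after both
	 * references on block_group are dropped below (the lookup
	 * reference and, presumably, the reference held on behalf of the
	 * block group cache tree), at which point the structure may
	 * already be freed.
	 */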
btrfs_clear_space_info_full(root->fs_info); 9172 9173 btrfs_put_block_group(block_group); 9174 btrfs_put_block_group(block_group); 9175 9176 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 9177 if (ret > 0) 9178 ret = -EIO; 9179 if (ret < 0) 9180 goto out; 9181 9182 ret = btrfs_del_item(trans, root, path); 9183 out: 9184 btrfs_free_path(path); 9185 return ret; 9186 } 9187 9188 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 9189 { 9190 struct btrfs_space_info *space_info; 9191 struct btrfs_super_block *disk_super; 9192 u64 features; 9193 u64 flags; 9194 int mixed = 0; 9195 int ret; 9196 9197 disk_super = fs_info->super_copy; 9198 if (!btrfs_super_root(disk_super)) 9199 return 1; 9200 9201 features = btrfs_super_incompat_flags(disk_super); 9202 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 9203 mixed = 1; 9204 9205 flags = BTRFS_BLOCK_GROUP_SYSTEM; 9206 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9207 if (ret) 9208 goto out; 9209 9210 if (mixed) { 9211 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 9212 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9213 } else { 9214 flags = BTRFS_BLOCK_GROUP_METADATA; 9215 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9216 if (ret) 9217 goto out; 9218 9219 flags = BTRFS_BLOCK_GROUP_DATA; 9220 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9221 } 9222 out: 9223 return ret; 9224 } 9225 9226 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 9227 { 9228 return unpin_extent_range(root, start, end); 9229 } 9230 9231 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 9232 u64 num_bytes, u64 *actual_bytes) 9233 { 9234 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); 9235 } 9236 9237 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 9238 { 9239 struct btrfs_fs_info *fs_info = root->fs_info; 9240 struct btrfs_block_group_cache *cache = NULL; 9241 u64 group_trimmed; 9242 u64 start; 9243 u64 end; 9244 u64 trimmed = 0; 9245 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 9246 int ret = 0; 9247 9248 /* 9249 * try to trim all FS space, our block group may start from non-zero. 
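 * a trim of the whole filesystem (range->len == total_bytes)
 * therefore starts from the first block group rather than from the
 * raw range start.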
 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
				ret = wait_block_group_cache_done(cache);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	range->len = trimmed;
	return ret;
}

/*
 * btrfs_{start,end}_nocow_write() are similar to mnt_{want,drop}_write();
 * they are used to prevent some tasks from writing data into the page
 * cache via nocow before the subvolume is snapshotted, and to make sure
 * that data is flushed to disk after the snapshot is created.
 */
void btrfs_end_nocow_write(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we wake up waiters.
	 */
	smp_mb();
	if (waitqueue_active(&root->subv_writers->wait))
		wake_up(&root->subv_writers->wait);
}

int btrfs_start_nocow_write(struct btrfs_root *root)
{
	if (unlikely(atomic_read(&root->will_be_snapshoted)))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
		btrfs_end_nocow_write(root);
		return 0;
	}
	return 1;
}