/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
66 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for 67 * ENOSPC accounting 68 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update 69 * bytes_may_use as the ENOSPC accounting is done elsewhere 70 */ 71 enum { 72 RESERVE_FREE = 0, 73 RESERVE_ALLOC = 1, 74 RESERVE_ALLOC_NO_ACCOUNT = 2, 75 }; 76 77 static int update_block_group(struct btrfs_root *root, 78 u64 bytenr, u64 num_bytes, int alloc); 79 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 80 struct btrfs_root *root, 81 u64 bytenr, u64 num_bytes, u64 parent, 82 u64 root_objectid, u64 owner_objectid, 83 u64 owner_offset, int refs_to_drop, 84 struct btrfs_delayed_extent_op *extra_op, 85 int no_quota); 86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 87 struct extent_buffer *leaf, 88 struct btrfs_extent_item *ei); 89 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 90 struct btrfs_root *root, 91 u64 parent, u64 root_objectid, 92 u64 flags, u64 owner, u64 offset, 93 struct btrfs_key *ins, int ref_mod); 94 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 95 struct btrfs_root *root, 96 u64 parent, u64 root_objectid, 97 u64 flags, struct btrfs_disk_key *key, 98 int level, struct btrfs_key *ins, 99 int no_quota); 100 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 101 struct btrfs_root *extent_root, u64 flags, 102 int force); 103 static int find_next_key(struct btrfs_path *path, int level, 104 struct btrfs_key *key); 105 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 106 int dump_block_groups); 107 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 108 u64 num_bytes, int reserve, 109 int delalloc); 110 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 111 u64 num_bytes); 112 int btrfs_pin_extent(struct btrfs_root *root, 113 u64 bytenr, u64 num_bytes, int reserved); 114 115 static noinline int 116 block_group_cache_done(struct btrfs_block_group_cache *cache) 117 { 118 smp_mb(); 119 return cache->cached == BTRFS_CACHE_FINISHED || 120 cache->cached == BTRFS_CACHE_ERROR; 121 } 122 123 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 124 { 125 return (cache->flags & bits) == bits; 126 } 127 128 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 129 { 130 atomic_inc(&cache->count); 131 } 132 133 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 134 { 135 if (atomic_dec_and_test(&cache->count)) { 136 WARN_ON(cache->pinned > 0); 137 WARN_ON(cache->reserved > 0); 138 kfree(cache->free_space_ctl); 139 kfree(cache); 140 } 141 } 142 143 /* 144 * this adds the block group to the fs_info rb tree for the block group 145 * cache 146 */ 147 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 148 struct btrfs_block_group_cache *block_group) 149 { 150 struct rb_node **p; 151 struct rb_node *parent = NULL; 152 struct btrfs_block_group_cache *cache; 153 154 spin_lock(&info->block_group_cache_lock); 155 p = &info->block_group_cache_tree.rb_node; 156 157 while (*p) { 158 parent = *p; 159 cache = rb_entry(parent, struct btrfs_block_group_cache, 160 cache_node); 161 if (block_group->key.objectid < cache->key.objectid) { 162 p = &(*p)->rb_left; 163 } else if (block_group->key.objectid > cache->key.objectid) { 164 p = &(*p)->rb_right; 165 } else { 166 spin_unlock(&info->block_group_cache_lock); 167 return -EEXIST; 168 } 169 } 170 171 rb_link_node(&block_group->cache_node, parent, 
p); 172 rb_insert_color(&block_group->cache_node, 173 &info->block_group_cache_tree); 174 175 if (info->first_logical_byte > block_group->key.objectid) 176 info->first_logical_byte = block_group->key.objectid; 177 178 spin_unlock(&info->block_group_cache_lock); 179 180 return 0; 181 } 182 183 /* 184 * This will return the block group at or after bytenr if contains is 0, else 185 * it will return the block group that contains the bytenr 186 */ 187 static struct btrfs_block_group_cache * 188 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 189 int contains) 190 { 191 struct btrfs_block_group_cache *cache, *ret = NULL; 192 struct rb_node *n; 193 u64 end, start; 194 195 spin_lock(&info->block_group_cache_lock); 196 n = info->block_group_cache_tree.rb_node; 197 198 while (n) { 199 cache = rb_entry(n, struct btrfs_block_group_cache, 200 cache_node); 201 end = cache->key.objectid + cache->key.offset - 1; 202 start = cache->key.objectid; 203 204 if (bytenr < start) { 205 if (!contains && (!ret || start < ret->key.objectid)) 206 ret = cache; 207 n = n->rb_left; 208 } else if (bytenr > start) { 209 if (contains && bytenr <= end) { 210 ret = cache; 211 break; 212 } 213 n = n->rb_right; 214 } else { 215 ret = cache; 216 break; 217 } 218 } 219 if (ret) { 220 btrfs_get_block_group(ret); 221 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 222 info->first_logical_byte = ret->key.objectid; 223 } 224 spin_unlock(&info->block_group_cache_lock); 225 226 return ret; 227 } 228 229 static int add_excluded_extent(struct btrfs_root *root, 230 u64 start, u64 num_bytes) 231 { 232 u64 end = start + num_bytes - 1; 233 set_extent_bits(&root->fs_info->freed_extents[0], 234 start, end, EXTENT_UPTODATE, GFP_NOFS); 235 set_extent_bits(&root->fs_info->freed_extents[1], 236 start, end, EXTENT_UPTODATE, GFP_NOFS); 237 return 0; 238 } 239 240 static void free_excluded_extents(struct btrfs_root *root, 241 struct btrfs_block_group_cache *cache) 242 { 243 u64 start, end; 244 245 start = cache->key.objectid; 246 end = start + cache->key.offset - 1; 247 248 clear_extent_bits(&root->fs_info->freed_extents[0], 249 start, end, EXTENT_UPTODATE, GFP_NOFS); 250 clear_extent_bits(&root->fs_info->freed_extents[1], 251 start, end, EXTENT_UPTODATE, GFP_NOFS); 252 } 253 254 static int exclude_super_stripes(struct btrfs_root *root, 255 struct btrfs_block_group_cache *cache) 256 { 257 u64 bytenr; 258 u64 *logical; 259 int stripe_len; 260 int i, nr, ret; 261 262 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 263 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 264 cache->bytes_super += stripe_len; 265 ret = add_excluded_extent(root, cache->key.objectid, 266 stripe_len); 267 if (ret) 268 return ret; 269 } 270 271 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 272 bytenr = btrfs_sb_offset(i); 273 ret = btrfs_rmap_block(&root->fs_info->mapping_tree, 274 cache->key.objectid, bytenr, 275 0, &logical, &nr, &stripe_len); 276 if (ret) 277 return ret; 278 279 while (nr--) { 280 u64 start, len; 281 282 if (logical[nr] > cache->key.objectid + 283 cache->key.offset) 284 continue; 285 286 if (logical[nr] + stripe_len <= cache->key.objectid) 287 continue; 288 289 start = logical[nr]; 290 if (start < cache->key.objectid) { 291 start = cache->key.objectid; 292 len = (logical[nr] + stripe_len) - start; 293 } else { 294 len = min_t(u64, stripe_len, 295 cache->key.objectid + 296 cache->key.offset - start); 297 } 298 299 cache->bytes_super += len; 300 ret = add_excluded_extent(root, start, len); 301 if (ret) { 
302 kfree(logical); 303 return ret; 304 } 305 } 306 307 kfree(logical); 308 } 309 return 0; 310 } 311 312 static struct btrfs_caching_control * 313 get_caching_control(struct btrfs_block_group_cache *cache) 314 { 315 struct btrfs_caching_control *ctl; 316 317 spin_lock(&cache->lock); 318 if (cache->cached != BTRFS_CACHE_STARTED) { 319 spin_unlock(&cache->lock); 320 return NULL; 321 } 322 323 /* We're loading it the fast way, so we don't have a caching_ctl. */ 324 if (!cache->caching_ctl) { 325 spin_unlock(&cache->lock); 326 return NULL; 327 } 328 329 ctl = cache->caching_ctl; 330 atomic_inc(&ctl->count); 331 spin_unlock(&cache->lock); 332 return ctl; 333 } 334 335 static void put_caching_control(struct btrfs_caching_control *ctl) 336 { 337 if (atomic_dec_and_test(&ctl->count)) 338 kfree(ctl); 339 } 340 341 /* 342 * this is only called by cache_block_group, since we could have freed extents 343 * we need to check the pinned_extents for any extents that can't be used yet 344 * since their free space will be released as soon as the transaction commits. 345 */ 346 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 347 struct btrfs_fs_info *info, u64 start, u64 end) 348 { 349 u64 extent_start, extent_end, size, total_added = 0; 350 int ret; 351 352 while (start < end) { 353 ret = find_first_extent_bit(info->pinned_extents, start, 354 &extent_start, &extent_end, 355 EXTENT_DIRTY | EXTENT_UPTODATE, 356 NULL); 357 if (ret) 358 break; 359 360 if (extent_start <= start) { 361 start = extent_end + 1; 362 } else if (extent_start > start && extent_start < end) { 363 size = extent_start - start; 364 total_added += size; 365 ret = btrfs_add_free_space(block_group, start, 366 size); 367 BUG_ON(ret); /* -ENOMEM or logic error */ 368 start = extent_end + 1; 369 } else { 370 break; 371 } 372 } 373 374 if (start < end) { 375 size = end - start; 376 total_added += size; 377 ret = btrfs_add_free_space(block_group, start, size); 378 BUG_ON(ret); /* -ENOMEM or logic error */ 379 } 380 381 return total_added; 382 } 383 384 static noinline void caching_thread(struct btrfs_work *work) 385 { 386 struct btrfs_block_group_cache *block_group; 387 struct btrfs_fs_info *fs_info; 388 struct btrfs_caching_control *caching_ctl; 389 struct btrfs_root *extent_root; 390 struct btrfs_path *path; 391 struct extent_buffer *leaf; 392 struct btrfs_key key; 393 u64 total_found = 0; 394 u64 last = 0; 395 u32 nritems; 396 int ret = -ENOMEM; 397 398 caching_ctl = container_of(work, struct btrfs_caching_control, work); 399 block_group = caching_ctl->block_group; 400 fs_info = block_group->fs_info; 401 extent_root = fs_info->extent_root; 402 403 path = btrfs_alloc_path(); 404 if (!path) 405 goto out; 406 407 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 408 409 /* 410 * We don't want to deadlock with somebody trying to allocate a new 411 * extent for the extent root while also trying to search the extent 412 * root to add free space. 
So we skip locking and search the commit 413 * root, since its read-only 414 */ 415 path->skip_locking = 1; 416 path->search_commit_root = 1; 417 path->reada = 1; 418 419 key.objectid = last; 420 key.offset = 0; 421 key.type = BTRFS_EXTENT_ITEM_KEY; 422 again: 423 mutex_lock(&caching_ctl->mutex); 424 /* need to make sure the commit_root doesn't disappear */ 425 down_read(&fs_info->commit_root_sem); 426 427 next: 428 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 429 if (ret < 0) 430 goto err; 431 432 leaf = path->nodes[0]; 433 nritems = btrfs_header_nritems(leaf); 434 435 while (1) { 436 if (btrfs_fs_closing(fs_info) > 1) { 437 last = (u64)-1; 438 break; 439 } 440 441 if (path->slots[0] < nritems) { 442 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 443 } else { 444 ret = find_next_key(path, 0, &key); 445 if (ret) 446 break; 447 448 if (need_resched() || 449 rwsem_is_contended(&fs_info->commit_root_sem)) { 450 caching_ctl->progress = last; 451 btrfs_release_path(path); 452 up_read(&fs_info->commit_root_sem); 453 mutex_unlock(&caching_ctl->mutex); 454 cond_resched(); 455 goto again; 456 } 457 458 ret = btrfs_next_leaf(extent_root, path); 459 if (ret < 0) 460 goto err; 461 if (ret) 462 break; 463 leaf = path->nodes[0]; 464 nritems = btrfs_header_nritems(leaf); 465 continue; 466 } 467 468 if (key.objectid < last) { 469 key.objectid = last; 470 key.offset = 0; 471 key.type = BTRFS_EXTENT_ITEM_KEY; 472 473 caching_ctl->progress = last; 474 btrfs_release_path(path); 475 goto next; 476 } 477 478 if (key.objectid < block_group->key.objectid) { 479 path->slots[0]++; 480 continue; 481 } 482 483 if (key.objectid >= block_group->key.objectid + 484 block_group->key.offset) 485 break; 486 487 if (key.type == BTRFS_EXTENT_ITEM_KEY || 488 key.type == BTRFS_METADATA_ITEM_KEY) { 489 total_found += add_new_free_space(block_group, 490 fs_info, last, 491 key.objectid); 492 if (key.type == BTRFS_METADATA_ITEM_KEY) 493 last = key.objectid + 494 fs_info->tree_root->leafsize; 495 else 496 last = key.objectid + key.offset; 497 498 if (total_found > (1024 * 1024 * 2)) { 499 total_found = 0; 500 wake_up(&caching_ctl->wait); 501 } 502 } 503 path->slots[0]++; 504 } 505 ret = 0; 506 507 total_found += add_new_free_space(block_group, fs_info, last, 508 block_group->key.objectid + 509 block_group->key.offset); 510 caching_ctl->progress = (u64)-1; 511 512 spin_lock(&block_group->lock); 513 block_group->caching_ctl = NULL; 514 block_group->cached = BTRFS_CACHE_FINISHED; 515 spin_unlock(&block_group->lock); 516 517 err: 518 btrfs_free_path(path); 519 up_read(&fs_info->commit_root_sem); 520 521 free_excluded_extents(extent_root, block_group); 522 523 mutex_unlock(&caching_ctl->mutex); 524 out: 525 if (ret) { 526 spin_lock(&block_group->lock); 527 block_group->caching_ctl = NULL; 528 block_group->cached = BTRFS_CACHE_ERROR; 529 spin_unlock(&block_group->lock); 530 } 531 wake_up(&caching_ctl->wait); 532 533 put_caching_control(caching_ctl); 534 btrfs_put_block_group(block_group); 535 } 536 537 static int cache_block_group(struct btrfs_block_group_cache *cache, 538 int load_cache_only) 539 { 540 DEFINE_WAIT(wait); 541 struct btrfs_fs_info *fs_info = cache->fs_info; 542 struct btrfs_caching_control *caching_ctl; 543 int ret = 0; 544 545 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); 546 if (!caching_ctl) 547 return -ENOMEM; 548 549 INIT_LIST_HEAD(&caching_ctl->list); 550 mutex_init(&caching_ctl->mutex); 551 init_waitqueue_head(&caching_ctl->wait); 552 caching_ctl->block_group = cache; 553 
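	/*
	 * A note on lifetime (based on the code in this file): the refcount
	 * set to 1 below belongs to this caller; every waiter in the loop
	 * further down, get_caching_control(), and the
	 * fs_info->caching_block_groups list each take an extra reference,
	 * and put_caching_control() frees the structure once the count
	 * drops to zero.
	 */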
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info and then some other thread
	 * starts a transaction commit which tries to do an allocation while
	 * the first thread is still loading the space cache info.  The
	 * previous loop should have kept us from choosing this block group,
	 * but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load
	 * here, so we can wait for it to finish; otherwise we could end up
	 * allocating from a block group whose cache gets evicted for one
	 * reason or another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
			}
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We are not going to do the fast caching, set cached to the
		 * appropriate value and wake up any waiters.
622 */ 623 spin_lock(&cache->lock); 624 if (load_cache_only) { 625 cache->caching_ctl = NULL; 626 cache->cached = BTRFS_CACHE_NO; 627 } else { 628 cache->cached = BTRFS_CACHE_STARTED; 629 } 630 spin_unlock(&cache->lock); 631 wake_up(&caching_ctl->wait); 632 } 633 634 if (load_cache_only) { 635 put_caching_control(caching_ctl); 636 return 0; 637 } 638 639 down_write(&fs_info->commit_root_sem); 640 atomic_inc(&caching_ctl->count); 641 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 642 up_write(&fs_info->commit_root_sem); 643 644 btrfs_get_block_group(cache); 645 646 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 647 648 return ret; 649 } 650 651 /* 652 * return the block group that starts at or after bytenr 653 */ 654 static struct btrfs_block_group_cache * 655 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 656 { 657 struct btrfs_block_group_cache *cache; 658 659 cache = block_group_cache_tree_search(info, bytenr, 0); 660 661 return cache; 662 } 663 664 /* 665 * return the block group that contains the given bytenr 666 */ 667 struct btrfs_block_group_cache *btrfs_lookup_block_group( 668 struct btrfs_fs_info *info, 669 u64 bytenr) 670 { 671 struct btrfs_block_group_cache *cache; 672 673 cache = block_group_cache_tree_search(info, bytenr, 1); 674 675 return cache; 676 } 677 678 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 679 u64 flags) 680 { 681 struct list_head *head = &info->space_info; 682 struct btrfs_space_info *found; 683 684 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 685 686 rcu_read_lock(); 687 list_for_each_entry_rcu(found, head, list) { 688 if (found->flags & flags) { 689 rcu_read_unlock(); 690 return found; 691 } 692 } 693 rcu_read_unlock(); 694 return NULL; 695 } 696 697 /* 698 * after adding space to the filesystem, we need to clear the full flags 699 * on all the space infos. 700 */ 701 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 702 { 703 struct list_head *head = &info->space_info; 704 struct btrfs_space_info *found; 705 706 rcu_read_lock(); 707 list_for_each_entry_rcu(found, head, list) 708 found->full = 0; 709 rcu_read_unlock(); 710 } 711 712 /* simple helper to search for an existing extent at a given offset */ 713 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) 714 { 715 int ret; 716 struct btrfs_key key; 717 struct btrfs_path *path; 718 719 path = btrfs_alloc_path(); 720 if (!path) 721 return -ENOMEM; 722 723 key.objectid = start; 724 key.offset = len; 725 key.type = BTRFS_EXTENT_ITEM_KEY; 726 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 727 0, 0); 728 if (ret > 0) { 729 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 730 if (key.objectid == start && 731 key.type == BTRFS_METADATA_ITEM_KEY) 732 ret = 0; 733 } 734 btrfs_free_path(path); 735 return ret; 736 } 737 738 /* 739 * helper function to lookup reference count and flags of a tree block. 740 * 741 * the head node for delayed ref is used to store the sum of all the 742 * reference count modifications queued up in the rbtree. the head 743 * node may also store the extent flags to set. This way you can check 744 * to see what the reference count and extent flags would be if all of 745 * the delayed refs are not processed. 
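 *
 * As an illustration (numbers made up): if the committed extent item
 * records 3 references and the delayed ref head carries a pending
 * ref_mod of +2, the value returned in *refs is 5; likewise any
 * flags_to_set in the head's extent_op are OR'd into *flags.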
746 */ 747 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 748 struct btrfs_root *root, u64 bytenr, 749 u64 offset, int metadata, u64 *refs, u64 *flags) 750 { 751 struct btrfs_delayed_ref_head *head; 752 struct btrfs_delayed_ref_root *delayed_refs; 753 struct btrfs_path *path; 754 struct btrfs_extent_item *ei; 755 struct extent_buffer *leaf; 756 struct btrfs_key key; 757 u32 item_size; 758 u64 num_refs; 759 u64 extent_flags; 760 int ret; 761 762 /* 763 * If we don't have skinny metadata, don't bother doing anything 764 * different 765 */ 766 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { 767 offset = root->leafsize; 768 metadata = 0; 769 } 770 771 path = btrfs_alloc_path(); 772 if (!path) 773 return -ENOMEM; 774 775 if (!trans) { 776 path->skip_locking = 1; 777 path->search_commit_root = 1; 778 } 779 780 search_again: 781 key.objectid = bytenr; 782 key.offset = offset; 783 if (metadata) 784 key.type = BTRFS_METADATA_ITEM_KEY; 785 else 786 key.type = BTRFS_EXTENT_ITEM_KEY; 787 788 again: 789 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 790 &key, path, 0, 0); 791 if (ret < 0) 792 goto out_free; 793 794 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 795 if (path->slots[0]) { 796 path->slots[0]--; 797 btrfs_item_key_to_cpu(path->nodes[0], &key, 798 path->slots[0]); 799 if (key.objectid == bytenr && 800 key.type == BTRFS_EXTENT_ITEM_KEY && 801 key.offset == root->leafsize) 802 ret = 0; 803 } 804 if (ret) { 805 key.objectid = bytenr; 806 key.type = BTRFS_EXTENT_ITEM_KEY; 807 key.offset = root->leafsize; 808 btrfs_release_path(path); 809 goto again; 810 } 811 } 812 813 if (ret == 0) { 814 leaf = path->nodes[0]; 815 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 816 if (item_size >= sizeof(*ei)) { 817 ei = btrfs_item_ptr(leaf, path->slots[0], 818 struct btrfs_extent_item); 819 num_refs = btrfs_extent_refs(leaf, ei); 820 extent_flags = btrfs_extent_flags(leaf, ei); 821 } else { 822 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 823 struct btrfs_extent_item_v0 *ei0; 824 BUG_ON(item_size != sizeof(*ei0)); 825 ei0 = btrfs_item_ptr(leaf, path->slots[0], 826 struct btrfs_extent_item_v0); 827 num_refs = btrfs_extent_refs_v0(leaf, ei0); 828 /* FIXME: this isn't correct for data */ 829 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 830 #else 831 BUG(); 832 #endif 833 } 834 BUG_ON(num_refs == 0); 835 } else { 836 num_refs = 0; 837 extent_flags = 0; 838 ret = 0; 839 } 840 841 if (!trans) 842 goto out; 843 844 delayed_refs = &trans->transaction->delayed_refs; 845 spin_lock(&delayed_refs->lock); 846 head = btrfs_find_delayed_ref_head(trans, bytenr); 847 if (head) { 848 if (!mutex_trylock(&head->mutex)) { 849 atomic_inc(&head->node.refs); 850 spin_unlock(&delayed_refs->lock); 851 852 btrfs_release_path(path); 853 854 /* 855 * Mutex was contended, block until it's released and try 856 * again 857 */ 858 mutex_lock(&head->mutex); 859 mutex_unlock(&head->mutex); 860 btrfs_put_delayed_ref(&head->node); 861 goto search_again; 862 } 863 spin_lock(&head->lock); 864 if (head->extent_op && head->extent_op->update_flags) 865 extent_flags |= head->extent_op->flags_to_set; 866 else 867 BUG_ON(num_refs == 0); 868 869 num_refs += head->node.ref_mod; 870 spin_unlock(&head->lock); 871 mutex_unlock(&head->mutex); 872 } 873 spin_unlock(&delayed_refs->lock); 874 out: 875 WARN_ON(num_refs == 0); 876 if (refs) 877 *refs = num_refs; 878 if (flags) 879 *flags = extent_flags; 880 out_free: 881 btrfs_free_path(path); 882 return ret; 883 } 884 885 /* 886 * Back reference 
rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Full back refs are actually generic and can
 * be used in all cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead: every time a tree block
 * gets COWed, we have to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it.  This means most tree-related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs and add implicit back refs
 * for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block and increase the lower level extents' reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
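 *
 * Concretely, as set up by lookup_tree_block_ref() and
 * lookup_extent_data_ref() below, the back ref items are keyed as:
 *
 *     implicit data ref: (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *                         hash(root objectid, inode objectid, file offset))
 *     shared data ref:   (extent bytenr, BTRFS_SHARED_DATA_REF_KEY,
 *                         parent bytenr)
 *     implicit tree ref: (extent bytenr, BTRFS_TREE_BLOCK_REF_KEY,
 *                         root objectid)
 *     shared tree ref:   (extent bytenr, BTRFS_SHARED_BLOCK_REF_KEY,
 *                         parent bytenr)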
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in as:
 *
 * (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 * (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(root, path, new_size);

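	/*
	 * The item has now been grown in place; the code below rewrites it
	 * in the current format: a btrfs_extent_item, followed (for tree
	 * blocks, i.e. owner < BTRFS_FIRST_FREE_OBJECTID) by a
	 * btrfs_tree_block_info that is zeroed apart from the level.
	 */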
leaf = path->nodes[0]; 1053 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1054 btrfs_set_extent_refs(leaf, item, refs); 1055 /* FIXME: get real generation */ 1056 btrfs_set_extent_generation(leaf, item, 0); 1057 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1058 btrfs_set_extent_flags(leaf, item, 1059 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1060 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1061 bi = (struct btrfs_tree_block_info *)(item + 1); 1062 /* FIXME: get first key of the block */ 1063 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); 1064 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1065 } else { 1066 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1067 } 1068 btrfs_mark_buffer_dirty(leaf); 1069 return 0; 1070 } 1071 #endif 1072 1073 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1074 { 1075 u32 high_crc = ~(u32)0; 1076 u32 low_crc = ~(u32)0; 1077 __le64 lenum; 1078 1079 lenum = cpu_to_le64(root_objectid); 1080 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1081 lenum = cpu_to_le64(owner); 1082 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1083 lenum = cpu_to_le64(offset); 1084 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1085 1086 return ((u64)high_crc << 31) ^ (u64)low_crc; 1087 } 1088 1089 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1090 struct btrfs_extent_data_ref *ref) 1091 { 1092 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1093 btrfs_extent_data_ref_objectid(leaf, ref), 1094 btrfs_extent_data_ref_offset(leaf, ref)); 1095 } 1096 1097 static int match_extent_data_ref(struct extent_buffer *leaf, 1098 struct btrfs_extent_data_ref *ref, 1099 u64 root_objectid, u64 owner, u64 offset) 1100 { 1101 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1102 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1103 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1104 return 0; 1105 return 1; 1106 } 1107 1108 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1109 struct btrfs_root *root, 1110 struct btrfs_path *path, 1111 u64 bytenr, u64 parent, 1112 u64 root_objectid, 1113 u64 owner, u64 offset) 1114 { 1115 struct btrfs_key key; 1116 struct btrfs_extent_data_ref *ref; 1117 struct extent_buffer *leaf; 1118 u32 nritems; 1119 int ret; 1120 int recow; 1121 int err = -ENOENT; 1122 1123 key.objectid = bytenr; 1124 if (parent) { 1125 key.type = BTRFS_SHARED_DATA_REF_KEY; 1126 key.offset = parent; 1127 } else { 1128 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1129 key.offset = hash_extent_data_ref(root_objectid, 1130 owner, offset); 1131 } 1132 again: 1133 recow = 0; 1134 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1135 if (ret < 0) { 1136 err = ret; 1137 goto fail; 1138 } 1139 1140 if (parent) { 1141 if (!ret) 1142 return 0; 1143 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1144 key.type = BTRFS_EXTENT_REF_V0_KEY; 1145 btrfs_release_path(path); 1146 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1147 if (ret < 0) { 1148 err = ret; 1149 goto fail; 1150 } 1151 if (!ret) 1152 return 0; 1153 #endif 1154 goto fail; 1155 } 1156 1157 leaf = path->nodes[0]; 1158 nritems = btrfs_header_nritems(leaf); 1159 while (1) { 1160 if (path->slots[0] >= nritems) { 1161 ret = btrfs_next_leaf(root, path); 1162 if (ret < 0) 1163 err = ret; 1164 if (ret) 1165 goto fail; 1166 1167 leaf = path->nodes[0]; 1168 nritems = btrfs_header_nritems(leaf); 1169 recow = 1; 1170 } 1171 1172 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1173 if 
(key.objectid != bytenr || 1174 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1175 goto fail; 1176 1177 ref = btrfs_item_ptr(leaf, path->slots[0], 1178 struct btrfs_extent_data_ref); 1179 1180 if (match_extent_data_ref(leaf, ref, root_objectid, 1181 owner, offset)) { 1182 if (recow) { 1183 btrfs_release_path(path); 1184 goto again; 1185 } 1186 err = 0; 1187 break; 1188 } 1189 path->slots[0]++; 1190 } 1191 fail: 1192 return err; 1193 } 1194 1195 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1196 struct btrfs_root *root, 1197 struct btrfs_path *path, 1198 u64 bytenr, u64 parent, 1199 u64 root_objectid, u64 owner, 1200 u64 offset, int refs_to_add) 1201 { 1202 struct btrfs_key key; 1203 struct extent_buffer *leaf; 1204 u32 size; 1205 u32 num_refs; 1206 int ret; 1207 1208 key.objectid = bytenr; 1209 if (parent) { 1210 key.type = BTRFS_SHARED_DATA_REF_KEY; 1211 key.offset = parent; 1212 size = sizeof(struct btrfs_shared_data_ref); 1213 } else { 1214 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1215 key.offset = hash_extent_data_ref(root_objectid, 1216 owner, offset); 1217 size = sizeof(struct btrfs_extent_data_ref); 1218 } 1219 1220 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1221 if (ret && ret != -EEXIST) 1222 goto fail; 1223 1224 leaf = path->nodes[0]; 1225 if (parent) { 1226 struct btrfs_shared_data_ref *ref; 1227 ref = btrfs_item_ptr(leaf, path->slots[0], 1228 struct btrfs_shared_data_ref); 1229 if (ret == 0) { 1230 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1231 } else { 1232 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1233 num_refs += refs_to_add; 1234 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1235 } 1236 } else { 1237 struct btrfs_extent_data_ref *ref; 1238 while (ret == -EEXIST) { 1239 ref = btrfs_item_ptr(leaf, path->slots[0], 1240 struct btrfs_extent_data_ref); 1241 if (match_extent_data_ref(leaf, ref, root_objectid, 1242 owner, offset)) 1243 break; 1244 btrfs_release_path(path); 1245 key.offset++; 1246 ret = btrfs_insert_empty_item(trans, root, path, &key, 1247 size); 1248 if (ret && ret != -EEXIST) 1249 goto fail; 1250 1251 leaf = path->nodes[0]; 1252 } 1253 ref = btrfs_item_ptr(leaf, path->slots[0], 1254 struct btrfs_extent_data_ref); 1255 if (ret == 0) { 1256 btrfs_set_extent_data_ref_root(leaf, ref, 1257 root_objectid); 1258 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1259 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1260 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1261 } else { 1262 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1263 num_refs += refs_to_add; 1264 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1265 } 1266 } 1267 btrfs_mark_buffer_dirty(leaf); 1268 ret = 0; 1269 fail: 1270 btrfs_release_path(path); 1271 return ret; 1272 } 1273 1274 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1275 struct btrfs_root *root, 1276 struct btrfs_path *path, 1277 int refs_to_drop, int *last_ref) 1278 { 1279 struct btrfs_key key; 1280 struct btrfs_extent_data_ref *ref1 = NULL; 1281 struct btrfs_shared_data_ref *ref2 = NULL; 1282 struct extent_buffer *leaf; 1283 u32 num_refs = 0; 1284 int ret = 0; 1285 1286 leaf = path->nodes[0]; 1287 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1288 1289 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1290 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1291 struct btrfs_extent_data_ref); 1292 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1293 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1294 ref2 = 
btrfs_item_ptr(leaf, path->slots[0], 1295 struct btrfs_shared_data_ref); 1296 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1297 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1298 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1299 struct btrfs_extent_ref_v0 *ref0; 1300 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1301 struct btrfs_extent_ref_v0); 1302 num_refs = btrfs_ref_count_v0(leaf, ref0); 1303 #endif 1304 } else { 1305 BUG(); 1306 } 1307 1308 BUG_ON(num_refs < refs_to_drop); 1309 num_refs -= refs_to_drop; 1310 1311 if (num_refs == 0) { 1312 ret = btrfs_del_item(trans, root, path); 1313 *last_ref = 1; 1314 } else { 1315 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1316 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1317 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1318 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1319 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1320 else { 1321 struct btrfs_extent_ref_v0 *ref0; 1322 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1323 struct btrfs_extent_ref_v0); 1324 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1325 } 1326 #endif 1327 btrfs_mark_buffer_dirty(leaf); 1328 } 1329 return ret; 1330 } 1331 1332 static noinline u32 extent_data_ref_count(struct btrfs_root *root, 1333 struct btrfs_path *path, 1334 struct btrfs_extent_inline_ref *iref) 1335 { 1336 struct btrfs_key key; 1337 struct extent_buffer *leaf; 1338 struct btrfs_extent_data_ref *ref1; 1339 struct btrfs_shared_data_ref *ref2; 1340 u32 num_refs = 0; 1341 1342 leaf = path->nodes[0]; 1343 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1344 if (iref) { 1345 if (btrfs_extent_inline_ref_type(leaf, iref) == 1346 BTRFS_EXTENT_DATA_REF_KEY) { 1347 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1348 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1349 } else { 1350 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1351 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1352 } 1353 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1354 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1355 struct btrfs_extent_data_ref); 1356 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1357 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1358 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1359 struct btrfs_shared_data_ref); 1360 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1361 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1362 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1363 struct btrfs_extent_ref_v0 *ref0; 1364 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1365 struct btrfs_extent_ref_v0); 1366 num_refs = btrfs_ref_count_v0(leaf, ref0); 1367 #endif 1368 } else { 1369 WARN_ON(1); 1370 } 1371 return num_refs; 1372 } 1373 1374 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1375 struct btrfs_root *root, 1376 struct btrfs_path *path, 1377 u64 bytenr, u64 parent, 1378 u64 root_objectid) 1379 { 1380 struct btrfs_key key; 1381 int ret; 1382 1383 key.objectid = bytenr; 1384 if (parent) { 1385 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1386 key.offset = parent; 1387 } else { 1388 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1389 key.offset = root_objectid; 1390 } 1391 1392 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1393 if (ret > 0) 1394 ret = -ENOENT; 1395 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1396 if (ret == -ENOENT && parent) { 1397 btrfs_release_path(path); 1398 key.type = BTRFS_EXTENT_REF_V0_KEY; 1399 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1400 if (ret > 0) 1401 ret = -ENOENT; 1402 } 1403 #endif 1404 return ret; 1405 } 1406 1407 static noinline 
int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1408 struct btrfs_root *root, 1409 struct btrfs_path *path, 1410 u64 bytenr, u64 parent, 1411 u64 root_objectid) 1412 { 1413 struct btrfs_key key; 1414 int ret; 1415 1416 key.objectid = bytenr; 1417 if (parent) { 1418 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1419 key.offset = parent; 1420 } else { 1421 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1422 key.offset = root_objectid; 1423 } 1424 1425 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1426 btrfs_release_path(path); 1427 return ret; 1428 } 1429 1430 static inline int extent_ref_type(u64 parent, u64 owner) 1431 { 1432 int type; 1433 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1434 if (parent > 0) 1435 type = BTRFS_SHARED_BLOCK_REF_KEY; 1436 else 1437 type = BTRFS_TREE_BLOCK_REF_KEY; 1438 } else { 1439 if (parent > 0) 1440 type = BTRFS_SHARED_DATA_REF_KEY; 1441 else 1442 type = BTRFS_EXTENT_DATA_REF_KEY; 1443 } 1444 return type; 1445 } 1446 1447 static int find_next_key(struct btrfs_path *path, int level, 1448 struct btrfs_key *key) 1449 1450 { 1451 for (; level < BTRFS_MAX_LEVEL; level++) { 1452 if (!path->nodes[level]) 1453 break; 1454 if (path->slots[level] + 1 >= 1455 btrfs_header_nritems(path->nodes[level])) 1456 continue; 1457 if (level == 0) 1458 btrfs_item_key_to_cpu(path->nodes[level], key, 1459 path->slots[level] + 1); 1460 else 1461 btrfs_node_key_to_cpu(path->nodes[level], key, 1462 path->slots[level] + 1); 1463 return 0; 1464 } 1465 return 1; 1466 } 1467 1468 /* 1469 * look for inline back ref. if back ref is found, *ref_ret is set 1470 * to the address of inline back ref, and 0 is returned. 1471 * 1472 * if back ref isn't found, *ref_ret is set to the address where it 1473 * should be inserted, and -ENOENT is returned. 1474 * 1475 * if insert is true and there are too many inline back refs, the path 1476 * points to the extent item, and -EAGAIN is returned. 1477 * 1478 * NOTE: inline back refs are ordered in the same way that back ref 1479 * items in the tree are ordered. 1480 */ 1481 static noinline_for_stack 1482 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1483 struct btrfs_root *root, 1484 struct btrfs_path *path, 1485 struct btrfs_extent_inline_ref **ref_ret, 1486 u64 bytenr, u64 num_bytes, 1487 u64 parent, u64 root_objectid, 1488 u64 owner, u64 offset, int insert) 1489 { 1490 struct btrfs_key key; 1491 struct extent_buffer *leaf; 1492 struct btrfs_extent_item *ei; 1493 struct btrfs_extent_inline_ref *iref; 1494 u64 flags; 1495 u64 item_size; 1496 unsigned long ptr; 1497 unsigned long end; 1498 int extra_size; 1499 int type; 1500 int want; 1501 int ret; 1502 int err = 0; 1503 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 1504 SKINNY_METADATA); 1505 1506 key.objectid = bytenr; 1507 key.type = BTRFS_EXTENT_ITEM_KEY; 1508 key.offset = num_bytes; 1509 1510 want = extent_ref_type(parent, owner); 1511 if (insert) { 1512 extra_size = btrfs_extent_inline_ref_size(want); 1513 path->keep_locks = 1; 1514 } else 1515 extra_size = -1; 1516 1517 /* 1518 * Owner is our parent level, so we can just add one to get the level 1519 * for the block we are interested in. 
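	 *
	 * In other words, with the SKINNY_METADATA feature the tree block is
	 * keyed as (bytenr, BTRFS_METADATA_ITEM_KEY, level) rather than the
	 * "fat" (bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes) form that the
	 * fallback further down still handles for older filesystems.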
1520 */ 1521 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1522 key.type = BTRFS_METADATA_ITEM_KEY; 1523 key.offset = owner; 1524 } 1525 1526 again: 1527 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1528 if (ret < 0) { 1529 err = ret; 1530 goto out; 1531 } 1532 1533 /* 1534 * We may be a newly converted file system which still has the old fat 1535 * extent entries for metadata, so try and see if we have one of those. 1536 */ 1537 if (ret > 0 && skinny_metadata) { 1538 skinny_metadata = false; 1539 if (path->slots[0]) { 1540 path->slots[0]--; 1541 btrfs_item_key_to_cpu(path->nodes[0], &key, 1542 path->slots[0]); 1543 if (key.objectid == bytenr && 1544 key.type == BTRFS_EXTENT_ITEM_KEY && 1545 key.offset == num_bytes) 1546 ret = 0; 1547 } 1548 if (ret) { 1549 key.objectid = bytenr; 1550 key.type = BTRFS_EXTENT_ITEM_KEY; 1551 key.offset = num_bytes; 1552 btrfs_release_path(path); 1553 goto again; 1554 } 1555 } 1556 1557 if (ret && !insert) { 1558 err = -ENOENT; 1559 goto out; 1560 } else if (WARN_ON(ret)) { 1561 err = -EIO; 1562 goto out; 1563 } 1564 1565 leaf = path->nodes[0]; 1566 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1567 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1568 if (item_size < sizeof(*ei)) { 1569 if (!insert) { 1570 err = -ENOENT; 1571 goto out; 1572 } 1573 ret = convert_extent_item_v0(trans, root, path, owner, 1574 extra_size); 1575 if (ret < 0) { 1576 err = ret; 1577 goto out; 1578 } 1579 leaf = path->nodes[0]; 1580 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1581 } 1582 #endif 1583 BUG_ON(item_size < sizeof(*ei)); 1584 1585 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1586 flags = btrfs_extent_flags(leaf, ei); 1587 1588 ptr = (unsigned long)(ei + 1); 1589 end = (unsigned long)ei + item_size; 1590 1591 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1592 ptr += sizeof(struct btrfs_tree_block_info); 1593 BUG_ON(ptr > end); 1594 } 1595 1596 err = -ENOENT; 1597 while (1) { 1598 if (ptr >= end) { 1599 WARN_ON(ptr > end); 1600 break; 1601 } 1602 iref = (struct btrfs_extent_inline_ref *)ptr; 1603 type = btrfs_extent_inline_ref_type(leaf, iref); 1604 if (want < type) 1605 break; 1606 if (want > type) { 1607 ptr += btrfs_extent_inline_ref_size(type); 1608 continue; 1609 } 1610 1611 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1612 struct btrfs_extent_data_ref *dref; 1613 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1614 if (match_extent_data_ref(leaf, dref, root_objectid, 1615 owner, offset)) { 1616 err = 0; 1617 break; 1618 } 1619 if (hash_extent_data_ref_item(leaf, dref) < 1620 hash_extent_data_ref(root_objectid, owner, offset)) 1621 break; 1622 } else { 1623 u64 ref_offset; 1624 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1625 if (parent > 0) { 1626 if (parent == ref_offset) { 1627 err = 0; 1628 break; 1629 } 1630 if (ref_offset < parent) 1631 break; 1632 } else { 1633 if (root_objectid == ref_offset) { 1634 err = 0; 1635 break; 1636 } 1637 if (ref_offset < root_objectid) 1638 break; 1639 } 1640 } 1641 ptr += btrfs_extent_inline_ref_size(type); 1642 } 1643 if (err == -ENOENT && insert) { 1644 if (item_size + extra_size >= 1645 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1646 err = -EAGAIN; 1647 goto out; 1648 } 1649 /* 1650 * To add new inline back ref, we have to make sure 1651 * there is no corresponding back ref item. 
1652 * For simplicity, we just do not add new inline back 1653 * ref if there is any kind of item for this block 1654 */ 1655 if (find_next_key(path, 0, &key) == 0 && 1656 key.objectid == bytenr && 1657 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1658 err = -EAGAIN; 1659 goto out; 1660 } 1661 } 1662 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1663 out: 1664 if (insert) { 1665 path->keep_locks = 0; 1666 btrfs_unlock_up_safe(path, 1); 1667 } 1668 return err; 1669 } 1670 1671 /* 1672 * helper to add new inline back ref 1673 */ 1674 static noinline_for_stack 1675 void setup_inline_extent_backref(struct btrfs_root *root, 1676 struct btrfs_path *path, 1677 struct btrfs_extent_inline_ref *iref, 1678 u64 parent, u64 root_objectid, 1679 u64 owner, u64 offset, int refs_to_add, 1680 struct btrfs_delayed_extent_op *extent_op) 1681 { 1682 struct extent_buffer *leaf; 1683 struct btrfs_extent_item *ei; 1684 unsigned long ptr; 1685 unsigned long end; 1686 unsigned long item_offset; 1687 u64 refs; 1688 int size; 1689 int type; 1690 1691 leaf = path->nodes[0]; 1692 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1693 item_offset = (unsigned long)iref - (unsigned long)ei; 1694 1695 type = extent_ref_type(parent, owner); 1696 size = btrfs_extent_inline_ref_size(type); 1697 1698 btrfs_extend_item(root, path, size); 1699 1700 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1701 refs = btrfs_extent_refs(leaf, ei); 1702 refs += refs_to_add; 1703 btrfs_set_extent_refs(leaf, ei, refs); 1704 if (extent_op) 1705 __run_delayed_extent_op(extent_op, leaf, ei); 1706 1707 ptr = (unsigned long)ei + item_offset; 1708 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1709 if (ptr < end - size) 1710 memmove_extent_buffer(leaf, ptr + size, ptr, 1711 end - size - ptr); 1712 1713 iref = (struct btrfs_extent_inline_ref *)ptr; 1714 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1715 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1716 struct btrfs_extent_data_ref *dref; 1717 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1718 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1719 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1720 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1721 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1722 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1723 struct btrfs_shared_data_ref *sref; 1724 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1725 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1726 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1727 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1728 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1729 } else { 1730 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1731 } 1732 btrfs_mark_buffer_dirty(leaf); 1733 } 1734 1735 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1736 struct btrfs_root *root, 1737 struct btrfs_path *path, 1738 struct btrfs_extent_inline_ref **ref_ret, 1739 u64 bytenr, u64 num_bytes, u64 parent, 1740 u64 root_objectid, u64 owner, u64 offset) 1741 { 1742 int ret; 1743 1744 ret = lookup_inline_extent_backref(trans, root, path, ref_ret, 1745 bytenr, num_bytes, parent, 1746 root_objectid, owner, offset, 0); 1747 if (ret != -ENOENT) 1748 return ret; 1749 1750 btrfs_release_path(path); 1751 *ref_ret = NULL; 1752 1753 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1754 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, 1755 root_objectid); 1756 } else { 1757 ret 
= lookup_extent_data_ref(trans, root, path, bytenr, parent, 1758 root_objectid, owner, offset); 1759 } 1760 return ret; 1761 } 1762 1763 /* 1764 * helper to update/remove inline back ref 1765 */ 1766 static noinline_for_stack 1767 void update_inline_extent_backref(struct btrfs_root *root, 1768 struct btrfs_path *path, 1769 struct btrfs_extent_inline_ref *iref, 1770 int refs_to_mod, 1771 struct btrfs_delayed_extent_op *extent_op, 1772 int *last_ref) 1773 { 1774 struct extent_buffer *leaf; 1775 struct btrfs_extent_item *ei; 1776 struct btrfs_extent_data_ref *dref = NULL; 1777 struct btrfs_shared_data_ref *sref = NULL; 1778 unsigned long ptr; 1779 unsigned long end; 1780 u32 item_size; 1781 int size; 1782 int type; 1783 u64 refs; 1784 1785 leaf = path->nodes[0]; 1786 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1787 refs = btrfs_extent_refs(leaf, ei); 1788 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1789 refs += refs_to_mod; 1790 btrfs_set_extent_refs(leaf, ei, refs); 1791 if (extent_op) 1792 __run_delayed_extent_op(extent_op, leaf, ei); 1793 1794 type = btrfs_extent_inline_ref_type(leaf, iref); 1795 1796 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1797 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1798 refs = btrfs_extent_data_ref_count(leaf, dref); 1799 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1800 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1801 refs = btrfs_shared_data_ref_count(leaf, sref); 1802 } else { 1803 refs = 1; 1804 BUG_ON(refs_to_mod != -1); 1805 } 1806 1807 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1808 refs += refs_to_mod; 1809 1810 if (refs > 0) { 1811 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1812 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1813 else 1814 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1815 } else { 1816 *last_ref = 1; 1817 size = btrfs_extent_inline_ref_size(type); 1818 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1819 ptr = (unsigned long)iref; 1820 end = (unsigned long)ei + item_size; 1821 if (ptr + size < end) 1822 memmove_extent_buffer(leaf, ptr, ptr + size, 1823 end - ptr - size); 1824 item_size -= size; 1825 btrfs_truncate_item(root, path, item_size, 1); 1826 } 1827 btrfs_mark_buffer_dirty(leaf); 1828 } 1829 1830 static noinline_for_stack 1831 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1832 struct btrfs_root *root, 1833 struct btrfs_path *path, 1834 u64 bytenr, u64 num_bytes, u64 parent, 1835 u64 root_objectid, u64 owner, 1836 u64 offset, int refs_to_add, 1837 struct btrfs_delayed_extent_op *extent_op) 1838 { 1839 struct btrfs_extent_inline_ref *iref; 1840 int ret; 1841 1842 ret = lookup_inline_extent_backref(trans, root, path, &iref, 1843 bytenr, num_bytes, parent, 1844 root_objectid, owner, offset, 1); 1845 if (ret == 0) { 1846 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1847 update_inline_extent_backref(root, path, iref, 1848 refs_to_add, extent_op, NULL); 1849 } else if (ret == -ENOENT) { 1850 setup_inline_extent_backref(root, path, iref, parent, 1851 root_objectid, owner, offset, 1852 refs_to_add, extent_op); 1853 ret = 0; 1854 } 1855 return ret; 1856 } 1857 1858 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1859 struct btrfs_root *root, 1860 struct btrfs_path *path, 1861 u64 bytenr, u64 parent, u64 root_objectid, 1862 u64 owner, u64 offset, int refs_to_add) 1863 { 1864 int ret; 1865 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1866 BUG_ON(refs_to_add != 1); 1867 ret = insert_tree_block_ref(trans, root, path, bytenr, 1868 parent, 
root_objectid); 1869 } else { 1870 ret = insert_extent_data_ref(trans, root, path, bytenr, 1871 parent, root_objectid, 1872 owner, offset, refs_to_add); 1873 } 1874 return ret; 1875 } 1876 1877 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1878 struct btrfs_root *root, 1879 struct btrfs_path *path, 1880 struct btrfs_extent_inline_ref *iref, 1881 int refs_to_drop, int is_data, int *last_ref) 1882 { 1883 int ret = 0; 1884 1885 BUG_ON(!is_data && refs_to_drop != 1); 1886 if (iref) { 1887 update_inline_extent_backref(root, path, iref, 1888 -refs_to_drop, NULL, last_ref); 1889 } else if (is_data) { 1890 ret = remove_extent_data_ref(trans, root, path, refs_to_drop, 1891 last_ref); 1892 } else { 1893 *last_ref = 1; 1894 ret = btrfs_del_item(trans, root, path); 1895 } 1896 return ret; 1897 } 1898 1899 static int btrfs_issue_discard(struct block_device *bdev, 1900 u64 start, u64 len) 1901 { 1902 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); 1903 } 1904 1905 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1906 u64 num_bytes, u64 *actual_bytes) 1907 { 1908 int ret; 1909 u64 discarded_bytes = 0; 1910 struct btrfs_bio *bbio = NULL; 1911 1912 1913 /* Tell the block device(s) that the sectors can be discarded */ 1914 ret = btrfs_map_block(root->fs_info, REQ_DISCARD, 1915 bytenr, &num_bytes, &bbio, 0); 1916 /* Error condition is -ENOMEM */ 1917 if (!ret) { 1918 struct btrfs_bio_stripe *stripe = bbio->stripes; 1919 int i; 1920 1921 1922 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 1923 if (!stripe->dev->can_discard) 1924 continue; 1925 1926 ret = btrfs_issue_discard(stripe->dev->bdev, 1927 stripe->physical, 1928 stripe->length); 1929 if (!ret) 1930 discarded_bytes += stripe->length; 1931 else if (ret != -EOPNOTSUPP) 1932 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 1933 1934 /* 1935 * Just in case we get back EOPNOTSUPP for some reason, 1936 * just ignore the return value so we don't screw up 1937 * people calling discard_extent. 
1938 */ 1939 ret = 0; 1940 } 1941 kfree(bbio); 1942 } 1943 1944 if (actual_bytes) 1945 *actual_bytes = discarded_bytes; 1946 1947 1948 if (ret == -EOPNOTSUPP) 1949 ret = 0; 1950 return ret; 1951 } 1952 1953 /* Can return -ENOMEM */ 1954 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1955 struct btrfs_root *root, 1956 u64 bytenr, u64 num_bytes, u64 parent, 1957 u64 root_objectid, u64 owner, u64 offset, 1958 int no_quota) 1959 { 1960 int ret; 1961 struct btrfs_fs_info *fs_info = root->fs_info; 1962 1963 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1964 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1965 1966 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1967 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 1968 num_bytes, 1969 parent, root_objectid, (int)owner, 1970 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1971 } else { 1972 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 1973 num_bytes, 1974 parent, root_objectid, owner, offset, 1975 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 1976 } 1977 return ret; 1978 } 1979 1980 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1981 struct btrfs_root *root, 1982 u64 bytenr, u64 num_bytes, 1983 u64 parent, u64 root_objectid, 1984 u64 owner, u64 offset, int refs_to_add, 1985 int no_quota, 1986 struct btrfs_delayed_extent_op *extent_op) 1987 { 1988 struct btrfs_fs_info *fs_info = root->fs_info; 1989 struct btrfs_path *path; 1990 struct extent_buffer *leaf; 1991 struct btrfs_extent_item *item; 1992 struct btrfs_key key; 1993 u64 refs; 1994 int ret; 1995 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; 1996 1997 path = btrfs_alloc_path(); 1998 if (!path) 1999 return -ENOMEM; 2000 2001 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) 2002 no_quota = 1; 2003 2004 path->reada = 1; 2005 path->leave_spinning = 1; 2006 /* this will setup the path even if it fails to insert the back ref */ 2007 ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, 2008 bytenr, num_bytes, parent, 2009 root_objectid, owner, offset, 2010 refs_to_add, extent_op); 2011 if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) 2012 goto out; 2013 /* 2014 * Ok we were able to insert an inline extent and it appears to be a new 2015 * reference, deal with the qgroup accounting. 2016 */ 2017 if (!ret && !no_quota) { 2018 ASSERT(root->fs_info->quota_enabled); 2019 leaf = path->nodes[0]; 2020 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2021 item = btrfs_item_ptr(leaf, path->slots[0], 2022 struct btrfs_extent_item); 2023 if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) 2024 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2025 btrfs_release_path(path); 2026 2027 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2028 bytenr, num_bytes, type, 0); 2029 goto out; 2030 } 2031 2032 /* 2033 * Ok we had -EAGAIN which means we didn't have space to insert and 2034 * inline extent ref, so just update the reference count and add a 2035 * normal backref. 
2036 */ 2037 leaf = path->nodes[0]; 2038 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2039 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2040 refs = btrfs_extent_refs(leaf, item); 2041 if (refs) 2042 type = BTRFS_QGROUP_OPER_ADD_SHARED; 2043 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2044 if (extent_op) 2045 __run_delayed_extent_op(extent_op, leaf, item); 2046 2047 btrfs_mark_buffer_dirty(leaf); 2048 btrfs_release_path(path); 2049 2050 if (!no_quota) { 2051 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 2052 bytenr, num_bytes, type, 0); 2053 if (ret) 2054 goto out; 2055 } 2056 2057 path->reada = 1; 2058 path->leave_spinning = 1; 2059 /* now insert the actual backref */ 2060 ret = insert_extent_backref(trans, root->fs_info->extent_root, 2061 path, bytenr, parent, root_objectid, 2062 owner, offset, refs_to_add); 2063 if (ret) 2064 btrfs_abort_transaction(trans, root, ret); 2065 out: 2066 btrfs_free_path(path); 2067 return ret; 2068 } 2069 2070 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2071 struct btrfs_root *root, 2072 struct btrfs_delayed_ref_node *node, 2073 struct btrfs_delayed_extent_op *extent_op, 2074 int insert_reserved) 2075 { 2076 int ret = 0; 2077 struct btrfs_delayed_data_ref *ref; 2078 struct btrfs_key ins; 2079 u64 parent = 0; 2080 u64 ref_root = 0; 2081 u64 flags = 0; 2082 2083 ins.objectid = node->bytenr; 2084 ins.offset = node->num_bytes; 2085 ins.type = BTRFS_EXTENT_ITEM_KEY; 2086 2087 ref = btrfs_delayed_node_to_data_ref(node); 2088 trace_run_delayed_data_ref(node, ref, node->action); 2089 2090 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2091 parent = ref->parent; 2092 ref_root = ref->root; 2093 2094 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2095 if (extent_op) 2096 flags |= extent_op->flags_to_set; 2097 ret = alloc_reserved_file_extent(trans, root, 2098 parent, ref_root, flags, 2099 ref->objectid, ref->offset, 2100 &ins, node->ref_mod); 2101 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2102 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2103 node->num_bytes, parent, 2104 ref_root, ref->objectid, 2105 ref->offset, node->ref_mod, 2106 node->no_quota, extent_op); 2107 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2108 ret = __btrfs_free_extent(trans, root, node->bytenr, 2109 node->num_bytes, parent, 2110 ref_root, ref->objectid, 2111 ref->offset, node->ref_mod, 2112 extent_op, node->no_quota); 2113 } else { 2114 BUG(); 2115 } 2116 return ret; 2117 } 2118 2119 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2120 struct extent_buffer *leaf, 2121 struct btrfs_extent_item *ei) 2122 { 2123 u64 flags = btrfs_extent_flags(leaf, ei); 2124 if (extent_op->update_flags) { 2125 flags |= extent_op->flags_to_set; 2126 btrfs_set_extent_flags(leaf, ei, flags); 2127 } 2128 2129 if (extent_op->update_key) { 2130 struct btrfs_tree_block_info *bi; 2131 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2132 bi = (struct btrfs_tree_block_info *)(ei + 1); 2133 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2134 } 2135 } 2136 2137 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2138 struct btrfs_root *root, 2139 struct btrfs_delayed_ref_node *node, 2140 struct btrfs_delayed_extent_op *extent_op) 2141 { 2142 struct btrfs_key key; 2143 struct btrfs_path *path; 2144 struct btrfs_extent_item *ei; 2145 struct extent_buffer *leaf; 2146 u32 item_size; 2147 int ret; 2148 int err = 0; 2149 int metadata = !extent_op->is_data; 2150 
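	/*
	 * Find the extent item this extent_op targets: with the
	 * skinny-metadata feature it is keyed (bytenr, METADATA_ITEM_KEY,
	 * level); otherwise, or if that lookup misses, we retry with the
	 * classic (bytenr, EXTENT_ITEM_KEY, num_bytes) key and then apply
	 * the flag/key update in place.
	 */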
2151 if (trans->aborted) 2152 return 0; 2153 2154 if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2155 metadata = 0; 2156 2157 path = btrfs_alloc_path(); 2158 if (!path) 2159 return -ENOMEM; 2160 2161 key.objectid = node->bytenr; 2162 2163 if (metadata) { 2164 key.type = BTRFS_METADATA_ITEM_KEY; 2165 key.offset = extent_op->level; 2166 } else { 2167 key.type = BTRFS_EXTENT_ITEM_KEY; 2168 key.offset = node->num_bytes; 2169 } 2170 2171 again: 2172 path->reada = 1; 2173 path->leave_spinning = 1; 2174 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, 2175 path, 0, 1); 2176 if (ret < 0) { 2177 err = ret; 2178 goto out; 2179 } 2180 if (ret > 0) { 2181 if (metadata) { 2182 if (path->slots[0] > 0) { 2183 path->slots[0]--; 2184 btrfs_item_key_to_cpu(path->nodes[0], &key, 2185 path->slots[0]); 2186 if (key.objectid == node->bytenr && 2187 key.type == BTRFS_EXTENT_ITEM_KEY && 2188 key.offset == node->num_bytes) 2189 ret = 0; 2190 } 2191 if (ret > 0) { 2192 btrfs_release_path(path); 2193 metadata = 0; 2194 2195 key.objectid = node->bytenr; 2196 key.offset = node->num_bytes; 2197 key.type = BTRFS_EXTENT_ITEM_KEY; 2198 goto again; 2199 } 2200 } else { 2201 err = -EIO; 2202 goto out; 2203 } 2204 } 2205 2206 leaf = path->nodes[0]; 2207 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2208 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2209 if (item_size < sizeof(*ei)) { 2210 ret = convert_extent_item_v0(trans, root->fs_info->extent_root, 2211 path, (u64)-1, 0); 2212 if (ret < 0) { 2213 err = ret; 2214 goto out; 2215 } 2216 leaf = path->nodes[0]; 2217 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2218 } 2219 #endif 2220 BUG_ON(item_size < sizeof(*ei)); 2221 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2222 __run_delayed_extent_op(extent_op, leaf, ei); 2223 2224 btrfs_mark_buffer_dirty(leaf); 2225 out: 2226 btrfs_free_path(path); 2227 return err; 2228 } 2229 2230 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2231 struct btrfs_root *root, 2232 struct btrfs_delayed_ref_node *node, 2233 struct btrfs_delayed_extent_op *extent_op, 2234 int insert_reserved) 2235 { 2236 int ret = 0; 2237 struct btrfs_delayed_tree_ref *ref; 2238 struct btrfs_key ins; 2239 u64 parent = 0; 2240 u64 ref_root = 0; 2241 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 2242 SKINNY_METADATA); 2243 2244 ref = btrfs_delayed_node_to_tree_ref(node); 2245 trace_run_delayed_tree_ref(node, ref, node->action); 2246 2247 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2248 parent = ref->parent; 2249 ref_root = ref->root; 2250 2251 ins.objectid = node->bytenr; 2252 if (skinny_metadata) { 2253 ins.offset = ref->level; 2254 ins.type = BTRFS_METADATA_ITEM_KEY; 2255 } else { 2256 ins.offset = node->num_bytes; 2257 ins.type = BTRFS_EXTENT_ITEM_KEY; 2258 } 2259 2260 BUG_ON(node->ref_mod != 1); 2261 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2262 BUG_ON(!extent_op || !extent_op->update_flags); 2263 ret = alloc_reserved_tree_block(trans, root, 2264 parent, ref_root, 2265 extent_op->flags_to_set, 2266 &extent_op->key, 2267 ref->level, &ins, 2268 node->no_quota); 2269 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2270 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, 2271 node->num_bytes, parent, ref_root, 2272 ref->level, 0, 1, node->no_quota, 2273 extent_op); 2274 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2275 ret = __btrfs_free_extent(trans, root, node->bytenr, 2276 node->num_bytes, parent, ref_root, 2277 ref->level, 0, 1, 
extent_op, 2278 node->no_quota); 2279 } else { 2280 BUG(); 2281 } 2282 return ret; 2283 } 2284 2285 /* helper function to actually process a single delayed ref entry */ 2286 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2287 struct btrfs_root *root, 2288 struct btrfs_delayed_ref_node *node, 2289 struct btrfs_delayed_extent_op *extent_op, 2290 int insert_reserved) 2291 { 2292 int ret = 0; 2293 2294 if (trans->aborted) { 2295 if (insert_reserved) 2296 btrfs_pin_extent(root, node->bytenr, 2297 node->num_bytes, 1); 2298 return 0; 2299 } 2300 2301 if (btrfs_delayed_ref_is_head(node)) { 2302 struct btrfs_delayed_ref_head *head; 2303 /* 2304 * we've hit the end of the chain and we were supposed 2305 * to insert this extent into the tree. But, it got 2306 * deleted before we ever needed to insert it, so all 2307 * we have to do is clean up the accounting 2308 */ 2309 BUG_ON(extent_op); 2310 head = btrfs_delayed_node_to_head(node); 2311 trace_run_delayed_ref_head(node, head, node->action); 2312 2313 if (insert_reserved) { 2314 btrfs_pin_extent(root, node->bytenr, 2315 node->num_bytes, 1); 2316 if (head->is_data) { 2317 ret = btrfs_del_csums(trans, root, 2318 node->bytenr, 2319 node->num_bytes); 2320 } 2321 } 2322 return ret; 2323 } 2324 2325 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2326 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2327 ret = run_delayed_tree_ref(trans, root, node, extent_op, 2328 insert_reserved); 2329 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2330 node->type == BTRFS_SHARED_DATA_REF_KEY) 2331 ret = run_delayed_data_ref(trans, root, node, extent_op, 2332 insert_reserved); 2333 else 2334 BUG(); 2335 return ret; 2336 } 2337 2338 static noinline struct btrfs_delayed_ref_node * 2339 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2340 { 2341 struct rb_node *node; 2342 struct btrfs_delayed_ref_node *ref, *last = NULL; 2343 2344 /* 2345 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2346 * this prevents ref count from going down to zero when 2347 * there still are pending delayed refs. 2348 */ 2349 node = rb_first(&head->ref_root); 2350 while (node) { 2351 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2352 rb_node); 2353 if (ref->action == BTRFS_ADD_DELAYED_REF) 2354 return ref; 2355 else if (last == NULL) 2356 last = ref; 2357 node = rb_next(node); 2358 } 2359 return last; 2360 } 2361 2362 /* 2363 * Returns 0 on success or if called with an already aborted transaction. 2364 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
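 * 'nr' roughly caps how many delayed ref nodes (head nodes included) get
 * processed in one batch; the loop below stops once that many have been
 * handled or no locked head can be found.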
2365 */ 2366 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2367 struct btrfs_root *root, 2368 unsigned long nr) 2369 { 2370 struct btrfs_delayed_ref_root *delayed_refs; 2371 struct btrfs_delayed_ref_node *ref; 2372 struct btrfs_delayed_ref_head *locked_ref = NULL; 2373 struct btrfs_delayed_extent_op *extent_op; 2374 struct btrfs_fs_info *fs_info = root->fs_info; 2375 ktime_t start = ktime_get(); 2376 int ret; 2377 unsigned long count = 0; 2378 unsigned long actual_count = 0; 2379 int must_insert_reserved = 0; 2380 2381 delayed_refs = &trans->transaction->delayed_refs; 2382 while (1) { 2383 if (!locked_ref) { 2384 if (count >= nr) 2385 break; 2386 2387 spin_lock(&delayed_refs->lock); 2388 locked_ref = btrfs_select_ref_head(trans); 2389 if (!locked_ref) { 2390 spin_unlock(&delayed_refs->lock); 2391 break; 2392 } 2393 2394 /* grab the lock that says we are going to process 2395 * all the refs for this head */ 2396 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2397 spin_unlock(&delayed_refs->lock); 2398 /* 2399 * we may have dropped the spin lock to get the head 2400 * mutex lock, and that might have given someone else 2401 * time to free the head. If that's true, it has been 2402 * removed from our list and we can move on. 2403 */ 2404 if (ret == -EAGAIN) { 2405 locked_ref = NULL; 2406 count++; 2407 continue; 2408 } 2409 } 2410 2411 /* 2412 * We need to try and merge add/drops of the same ref since we 2413 * can run into issues with relocate dropping the implicit ref 2414 * and then it being added back again before the drop can 2415 * finish. If we merged anything we need to re-loop so we can 2416 * get a good ref. 2417 */ 2418 spin_lock(&locked_ref->lock); 2419 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2420 locked_ref); 2421 2422 /* 2423 * locked_ref is the head node, so we have to go one 2424 * node back for any delayed ref updates 2425 */ 2426 ref = select_delayed_ref(locked_ref); 2427 2428 if (ref && ref->seq && 2429 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2430 spin_unlock(&locked_ref->lock); 2431 btrfs_delayed_ref_unlock(locked_ref); 2432 spin_lock(&delayed_refs->lock); 2433 locked_ref->processing = 0; 2434 delayed_refs->num_heads_ready++; 2435 spin_unlock(&delayed_refs->lock); 2436 locked_ref = NULL; 2437 cond_resched(); 2438 count++; 2439 continue; 2440 } 2441 2442 /* 2443 * record the must insert reserved flag before we 2444 * drop the spin lock. 2445 */ 2446 must_insert_reserved = locked_ref->must_insert_reserved; 2447 locked_ref->must_insert_reserved = 0; 2448 2449 extent_op = locked_ref->extent_op; 2450 locked_ref->extent_op = NULL; 2451 2452 if (!ref) { 2453 2454 2455 /* All delayed refs have been processed, Go ahead 2456 * and send the head node to run_one_delayed_ref, 2457 * so that any accounting fixes can happen 2458 */ 2459 ref = &locked_ref->node; 2460 2461 if (extent_op && must_insert_reserved) { 2462 btrfs_free_delayed_extent_op(extent_op); 2463 extent_op = NULL; 2464 } 2465 2466 if (extent_op) { 2467 spin_unlock(&locked_ref->lock); 2468 ret = run_delayed_extent_op(trans, root, 2469 ref, extent_op); 2470 btrfs_free_delayed_extent_op(extent_op); 2471 2472 if (ret) { 2473 /* 2474 * Need to reset must_insert_reserved if 2475 * there was an error so the abort stuff 2476 * can cleanup the reserved space 2477 * properly. 
2478 */ 2479 if (must_insert_reserved) 2480 locked_ref->must_insert_reserved = 1; 2481 locked_ref->processing = 0; 2482 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2483 btrfs_delayed_ref_unlock(locked_ref); 2484 return ret; 2485 } 2486 continue; 2487 } 2488 2489 /* 2490 * Need to drop our head ref lock and re-aqcuire the 2491 * delayed ref lock and then re-check to make sure 2492 * nobody got added. 2493 */ 2494 spin_unlock(&locked_ref->lock); 2495 spin_lock(&delayed_refs->lock); 2496 spin_lock(&locked_ref->lock); 2497 if (rb_first(&locked_ref->ref_root) || 2498 locked_ref->extent_op) { 2499 spin_unlock(&locked_ref->lock); 2500 spin_unlock(&delayed_refs->lock); 2501 continue; 2502 } 2503 ref->in_tree = 0; 2504 delayed_refs->num_heads--; 2505 rb_erase(&locked_ref->href_node, 2506 &delayed_refs->href_root); 2507 spin_unlock(&delayed_refs->lock); 2508 } else { 2509 actual_count++; 2510 ref->in_tree = 0; 2511 rb_erase(&ref->rb_node, &locked_ref->ref_root); 2512 } 2513 atomic_dec(&delayed_refs->num_entries); 2514 2515 if (!btrfs_delayed_ref_is_head(ref)) { 2516 /* 2517 * when we play the delayed ref, also correct the 2518 * ref_mod on head 2519 */ 2520 switch (ref->action) { 2521 case BTRFS_ADD_DELAYED_REF: 2522 case BTRFS_ADD_DELAYED_EXTENT: 2523 locked_ref->node.ref_mod -= ref->ref_mod; 2524 break; 2525 case BTRFS_DROP_DELAYED_REF: 2526 locked_ref->node.ref_mod += ref->ref_mod; 2527 break; 2528 default: 2529 WARN_ON(1); 2530 } 2531 } 2532 spin_unlock(&locked_ref->lock); 2533 2534 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2535 must_insert_reserved); 2536 2537 btrfs_free_delayed_extent_op(extent_op); 2538 if (ret) { 2539 locked_ref->processing = 0; 2540 btrfs_delayed_ref_unlock(locked_ref); 2541 btrfs_put_delayed_ref(ref); 2542 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2543 return ret; 2544 } 2545 2546 /* 2547 * If this node is a head, that means all the refs in this head 2548 * have been dealt with, and we will pick the next head to deal 2549 * with, so we must unlock the head and drop it from the cluster 2550 * list before we release it. 2551 */ 2552 if (btrfs_delayed_ref_is_head(ref)) { 2553 btrfs_delayed_ref_unlock(locked_ref); 2554 locked_ref = NULL; 2555 } 2556 btrfs_put_delayed_ref(ref); 2557 count++; 2558 cond_resched(); 2559 } 2560 2561 /* 2562 * We don't want to include ref heads since we can have empty ref heads 2563 * and those will drastically skew our runtime down since we just do 2564 * accounting, no actual extent tree updates. 2565 */ 2566 if (actual_count > 0) { 2567 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2568 u64 avg; 2569 2570 /* 2571 * We weigh the current average higher than our current runtime 2572 * to avoid large swings in the average. 2573 */ 2574 spin_lock(&delayed_refs->lock); 2575 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2576 avg = div64_u64(avg, 4); 2577 fs_info->avg_delayed_ref_runtime = avg; 2578 spin_unlock(&delayed_refs->lock); 2579 } 2580 return 0; 2581 } 2582 2583 #ifdef SCRAMBLE_DELAYED_REFS 2584 /* 2585 * Normally delayed refs get processed in ascending bytenr order. This 2586 * correlates in most cases to the order added. 
To expose dependencies on this 2587 * order, we start to process the tree in the middle instead of the beginning 2588 */ 2589 static u64 find_middle(struct rb_root *root) 2590 { 2591 struct rb_node *n = root->rb_node; 2592 struct btrfs_delayed_ref_node *entry; 2593 int alt = 1; 2594 u64 middle; 2595 u64 first = 0, last = 0; 2596 2597 n = rb_first(root); 2598 if (n) { 2599 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2600 first = entry->bytenr; 2601 } 2602 n = rb_last(root); 2603 if (n) { 2604 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2605 last = entry->bytenr; 2606 } 2607 n = root->rb_node; 2608 2609 while (n) { 2610 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2611 WARN_ON(!entry->in_tree); 2612 2613 middle = entry->bytenr; 2614 2615 if (alt) 2616 n = n->rb_left; 2617 else 2618 n = n->rb_right; 2619 2620 alt = 1 - alt; 2621 } 2622 return middle; 2623 } 2624 #endif 2625 2626 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2627 { 2628 u64 num_bytes; 2629 2630 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2631 sizeof(struct btrfs_extent_inline_ref)); 2632 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) 2633 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2634 2635 /* 2636 * We don't ever fill up leaves all the way so multiply by 2 just to be 2637 * closer to what we're really going to want to use. 2638 */ 2639 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2640 } 2641 2642 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2643 struct btrfs_root *root) 2644 { 2645 struct btrfs_block_rsv *global_rsv; 2646 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2647 u64 num_bytes; 2648 int ret = 0; 2649 2650 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2651 num_heads = heads_to_leaves(root, num_heads); 2652 if (num_heads > 1) 2653 num_bytes += (num_heads - 1) * root->leafsize; 2654 num_bytes <<= 1; 2655 global_rsv = &root->fs_info->global_block_rsv; 2656 2657 /* 2658 * If we can't allocate any more chunks let's make sure we have _lots_ of 2659 * wiggle room since running delayed refs can create more delayed refs.
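 * The estimate works out to 2 * (one trans metadata reservation +
 * (heads_to_leaves(num_heads) - 1) * leafsize), doubled once more below
 * when the metadata space is full, and is then compared against what the
 * global rsv currently has reserved.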
2660 */ 2661 if (global_rsv->space_info->full) 2662 num_bytes <<= 1; 2663 2664 spin_lock(&global_rsv->lock); 2665 if (global_rsv->reserved <= num_bytes) 2666 ret = 1; 2667 spin_unlock(&global_rsv->lock); 2668 return ret; 2669 } 2670 2671 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2672 struct btrfs_root *root) 2673 { 2674 struct btrfs_fs_info *fs_info = root->fs_info; 2675 u64 num_entries = 2676 atomic_read(&trans->transaction->delayed_refs.num_entries); 2677 u64 avg_runtime; 2678 u64 val; 2679 2680 smp_mb(); 2681 avg_runtime = fs_info->avg_delayed_ref_runtime; 2682 val = num_entries * avg_runtime; 2683 if (num_entries * avg_runtime >= NSEC_PER_SEC) 2684 return 1; 2685 if (val >= NSEC_PER_SEC / 2) 2686 return 2; 2687 2688 return btrfs_check_space_for_delayed_refs(trans, root); 2689 } 2690 2691 struct async_delayed_refs { 2692 struct btrfs_root *root; 2693 int count; 2694 int error; 2695 int sync; 2696 struct completion wait; 2697 struct btrfs_work work; 2698 }; 2699 2700 static void delayed_ref_async_start(struct btrfs_work *work) 2701 { 2702 struct async_delayed_refs *async; 2703 struct btrfs_trans_handle *trans; 2704 int ret; 2705 2706 async = container_of(work, struct async_delayed_refs, work); 2707 2708 trans = btrfs_join_transaction(async->root); 2709 if (IS_ERR(trans)) { 2710 async->error = PTR_ERR(trans); 2711 goto done; 2712 } 2713 2714 /* 2715 * trans->sync means that when we call end_transaciton, we won't 2716 * wait on delayed refs 2717 */ 2718 trans->sync = true; 2719 ret = btrfs_run_delayed_refs(trans, async->root, async->count); 2720 if (ret) 2721 async->error = ret; 2722 2723 ret = btrfs_end_transaction(trans, async->root); 2724 if (ret && !async->error) 2725 async->error = ret; 2726 done: 2727 if (async->sync) 2728 complete(&async->wait); 2729 else 2730 kfree(async); 2731 } 2732 2733 int btrfs_async_run_delayed_refs(struct btrfs_root *root, 2734 unsigned long count, int wait) 2735 { 2736 struct async_delayed_refs *async; 2737 int ret; 2738 2739 async = kmalloc(sizeof(*async), GFP_NOFS); 2740 if (!async) 2741 return -ENOMEM; 2742 2743 async->root = root->fs_info->tree_root; 2744 async->count = count; 2745 async->error = 0; 2746 if (wait) 2747 async->sync = 1; 2748 else 2749 async->sync = 0; 2750 init_completion(&async->wait); 2751 2752 btrfs_init_work(&async->work, delayed_ref_async_start, 2753 NULL, NULL); 2754 2755 btrfs_queue_work(root->fs_info->extent_workers, &async->work); 2756 2757 if (wait) { 2758 wait_for_completion(&async->wait); 2759 ret = async->error; 2760 kfree(async); 2761 return ret; 2762 } 2763 return 0; 2764 } 2765 2766 /* 2767 * this starts processing the delayed reference count updates and 2768 * extent insertions we have queued up so far. count can be 2769 * 0, which means to process everything in the tree at the start 2770 * of the run (but not newly added entries), or it can be some target 2771 * number you'd like to process. 
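 * Passing (unsigned long)-1 runs everything, including refs queued while
 * we run: the loop below keeps restarting until the href tree is empty.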
2772 * 2773 * Returns 0 on success or if called with an aborted transaction 2774 * Returns <0 on error and aborts the transaction 2775 */ 2776 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2777 struct btrfs_root *root, unsigned long count) 2778 { 2779 struct rb_node *node; 2780 struct btrfs_delayed_ref_root *delayed_refs; 2781 struct btrfs_delayed_ref_head *head; 2782 int ret; 2783 int run_all = count == (unsigned long)-1; 2784 int run_most = 0; 2785 2786 /* We'll clean this up in btrfs_cleanup_transaction */ 2787 if (trans->aborted) 2788 return 0; 2789 2790 if (root == root->fs_info->extent_root) 2791 root = root->fs_info->tree_root; 2792 2793 delayed_refs = &trans->transaction->delayed_refs; 2794 if (count == 0) { 2795 count = atomic_read(&delayed_refs->num_entries) * 2; 2796 run_most = 1; 2797 } 2798 2799 again: 2800 #ifdef SCRAMBLE_DELAYED_REFS 2801 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2802 #endif 2803 ret = __btrfs_run_delayed_refs(trans, root, count); 2804 if (ret < 0) { 2805 btrfs_abort_transaction(trans, root, ret); 2806 return ret; 2807 } 2808 2809 if (run_all) { 2810 if (!list_empty(&trans->new_bgs)) 2811 btrfs_create_pending_block_groups(trans, root); 2812 2813 spin_lock(&delayed_refs->lock); 2814 node = rb_first(&delayed_refs->href_root); 2815 if (!node) { 2816 spin_unlock(&delayed_refs->lock); 2817 goto out; 2818 } 2819 count = (unsigned long)-1; 2820 2821 while (node) { 2822 head = rb_entry(node, struct btrfs_delayed_ref_head, 2823 href_node); 2824 if (btrfs_delayed_ref_is_head(&head->node)) { 2825 struct btrfs_delayed_ref_node *ref; 2826 2827 ref = &head->node; 2828 atomic_inc(&ref->refs); 2829 2830 spin_unlock(&delayed_refs->lock); 2831 /* 2832 * Mutex was contended, block until it's 2833 * released and try again 2834 */ 2835 mutex_lock(&head->mutex); 2836 mutex_unlock(&head->mutex); 2837 2838 btrfs_put_delayed_ref(ref); 2839 cond_resched(); 2840 goto again; 2841 } else { 2842 WARN_ON(1); 2843 } 2844 node = rb_next(node); 2845 } 2846 spin_unlock(&delayed_refs->lock); 2847 cond_resched(); 2848 goto again; 2849 } 2850 out: 2851 ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); 2852 if (ret) 2853 return ret; 2854 assert_qgroups_uptodate(trans); 2855 return 0; 2856 } 2857 2858 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2859 struct btrfs_root *root, 2860 u64 bytenr, u64 num_bytes, u64 flags, 2861 int level, int is_data) 2862 { 2863 struct btrfs_delayed_extent_op *extent_op; 2864 int ret; 2865 2866 extent_op = btrfs_alloc_delayed_extent_op(); 2867 if (!extent_op) 2868 return -ENOMEM; 2869 2870 extent_op->flags_to_set = flags; 2871 extent_op->update_flags = 1; 2872 extent_op->update_key = 0; 2873 extent_op->is_data = is_data ? 
1 : 0; 2874 extent_op->level = level; 2875 2876 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2877 num_bytes, extent_op); 2878 if (ret) 2879 btrfs_free_delayed_extent_op(extent_op); 2880 return ret; 2881 } 2882 2883 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, 2884 struct btrfs_root *root, 2885 struct btrfs_path *path, 2886 u64 objectid, u64 offset, u64 bytenr) 2887 { 2888 struct btrfs_delayed_ref_head *head; 2889 struct btrfs_delayed_ref_node *ref; 2890 struct btrfs_delayed_data_ref *data_ref; 2891 struct btrfs_delayed_ref_root *delayed_refs; 2892 struct rb_node *node; 2893 int ret = 0; 2894 2895 delayed_refs = &trans->transaction->delayed_refs; 2896 spin_lock(&delayed_refs->lock); 2897 head = btrfs_find_delayed_ref_head(trans, bytenr); 2898 if (!head) { 2899 spin_unlock(&delayed_refs->lock); 2900 return 0; 2901 } 2902 2903 if (!mutex_trylock(&head->mutex)) { 2904 atomic_inc(&head->node.refs); 2905 spin_unlock(&delayed_refs->lock); 2906 2907 btrfs_release_path(path); 2908 2909 /* 2910 * Mutex was contended, block until it's released and let 2911 * caller try again 2912 */ 2913 mutex_lock(&head->mutex); 2914 mutex_unlock(&head->mutex); 2915 btrfs_put_delayed_ref(&head->node); 2916 return -EAGAIN; 2917 } 2918 spin_unlock(&delayed_refs->lock); 2919 2920 spin_lock(&head->lock); 2921 node = rb_first(&head->ref_root); 2922 while (node) { 2923 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2924 node = rb_next(node); 2925 2926 /* If it's a shared ref we know a cross reference exists */ 2927 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2928 ret = 1; 2929 break; 2930 } 2931 2932 data_ref = btrfs_delayed_node_to_data_ref(ref); 2933 2934 /* 2935 * If our ref doesn't match the one we're currently looking at 2936 * then we have a cross reference. 
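 * (i.e. some other root, inode or offset already holds a delayed ref on
 * this extent, so it cannot be considered exclusively ours)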
2937 */ 2938 if (data_ref->root != root->root_key.objectid || 2939 data_ref->objectid != objectid || 2940 data_ref->offset != offset) { 2941 ret = 1; 2942 break; 2943 } 2944 } 2945 spin_unlock(&head->lock); 2946 mutex_unlock(&head->mutex); 2947 return ret; 2948 } 2949 2950 static noinline int check_committed_ref(struct btrfs_trans_handle *trans, 2951 struct btrfs_root *root, 2952 struct btrfs_path *path, 2953 u64 objectid, u64 offset, u64 bytenr) 2954 { 2955 struct btrfs_root *extent_root = root->fs_info->extent_root; 2956 struct extent_buffer *leaf; 2957 struct btrfs_extent_data_ref *ref; 2958 struct btrfs_extent_inline_ref *iref; 2959 struct btrfs_extent_item *ei; 2960 struct btrfs_key key; 2961 u32 item_size; 2962 int ret; 2963 2964 key.objectid = bytenr; 2965 key.offset = (u64)-1; 2966 key.type = BTRFS_EXTENT_ITEM_KEY; 2967 2968 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2969 if (ret < 0) 2970 goto out; 2971 BUG_ON(ret == 0); /* Corruption */ 2972 2973 ret = -ENOENT; 2974 if (path->slots[0] == 0) 2975 goto out; 2976 2977 path->slots[0]--; 2978 leaf = path->nodes[0]; 2979 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2980 2981 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2982 goto out; 2983 2984 ret = 1; 2985 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2986 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2987 if (item_size < sizeof(*ei)) { 2988 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 2989 goto out; 2990 } 2991 #endif 2992 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2993 2994 if (item_size != sizeof(*ei) + 2995 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2996 goto out; 2997 2998 if (btrfs_extent_generation(leaf, ei) <= 2999 btrfs_root_last_snapshot(&root->root_item)) 3000 goto out; 3001 3002 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3003 if (btrfs_extent_inline_ref_type(leaf, iref) != 3004 BTRFS_EXTENT_DATA_REF_KEY) 3005 goto out; 3006 3007 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3008 if (btrfs_extent_refs(leaf, ei) != 3009 btrfs_extent_data_ref_count(leaf, ref) || 3010 btrfs_extent_data_ref_root(leaf, ref) != 3011 root->root_key.objectid || 3012 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3013 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3014 goto out; 3015 3016 ret = 0; 3017 out: 3018 return ret; 3019 } 3020 3021 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 3022 struct btrfs_root *root, 3023 u64 objectid, u64 offset, u64 bytenr) 3024 { 3025 struct btrfs_path *path; 3026 int ret; 3027 int ret2; 3028 3029 path = btrfs_alloc_path(); 3030 if (!path) 3031 return -ENOENT; 3032 3033 do { 3034 ret = check_committed_ref(trans, root, path, objectid, 3035 offset, bytenr); 3036 if (ret && ret != -ENOENT) 3037 goto out; 3038 3039 ret2 = check_delayed_ref(trans, root, path, objectid, 3040 offset, bytenr); 3041 } while (ret2 == -EAGAIN); 3042 3043 if (ret2 && ret2 != -ENOENT) { 3044 ret = ret2; 3045 goto out; 3046 } 3047 3048 if (ret != -ENOENT || ret2 != -ENOENT) 3049 ret = 0; 3050 out: 3051 btrfs_free_path(path); 3052 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3053 WARN_ON(ret > 0); 3054 return ret; 3055 } 3056 3057 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3058 struct btrfs_root *root, 3059 struct extent_buffer *buf, 3060 int full_backref, int inc) 3061 { 3062 u64 bytenr; 3063 u64 num_bytes; 3064 u64 parent; 3065 u64 ref_root; 3066 u32 nritems; 3067 struct btrfs_key key; 3068 struct btrfs_file_extent_item 
*fi; 3069 int i; 3070 int level; 3071 int ret = 0; 3072 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3073 u64, u64, u64, u64, u64, u64, int); 3074 3075 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3076 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) 3077 return 0; 3078 #endif 3079 ref_root = btrfs_header_owner(buf); 3080 nritems = btrfs_header_nritems(buf); 3081 level = btrfs_header_level(buf); 3082 3083 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3084 return 0; 3085 3086 if (inc) 3087 process_func = btrfs_inc_extent_ref; 3088 else 3089 process_func = btrfs_free_extent; 3090 3091 if (full_backref) 3092 parent = buf->start; 3093 else 3094 parent = 0; 3095 3096 for (i = 0; i < nritems; i++) { 3097 if (level == 0) { 3098 btrfs_item_key_to_cpu(buf, &key, i); 3099 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) 3100 continue; 3101 fi = btrfs_item_ptr(buf, i, 3102 struct btrfs_file_extent_item); 3103 if (btrfs_file_extent_type(buf, fi) == 3104 BTRFS_FILE_EXTENT_INLINE) 3105 continue; 3106 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3107 if (bytenr == 0) 3108 continue; 3109 3110 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3111 key.offset -= btrfs_file_extent_offset(buf, fi); 3112 ret = process_func(trans, root, bytenr, num_bytes, 3113 parent, ref_root, key.objectid, 3114 key.offset, 1); 3115 if (ret) 3116 goto fail; 3117 } else { 3118 bytenr = btrfs_node_blockptr(buf, i); 3119 num_bytes = btrfs_level_size(root, level - 1); 3120 ret = process_func(trans, root, bytenr, num_bytes, 3121 parent, ref_root, level - 1, 0, 3122 1); 3123 if (ret) 3124 goto fail; 3125 } 3126 } 3127 return 0; 3128 fail: 3129 return ret; 3130 } 3131 3132 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3133 struct extent_buffer *buf, int full_backref) 3134 { 3135 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3136 } 3137 3138 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3139 struct extent_buffer *buf, int full_backref) 3140 { 3141 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3142 } 3143 3144 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3145 struct btrfs_root *root, 3146 struct btrfs_path *path, 3147 struct btrfs_block_group_cache *cache) 3148 { 3149 int ret; 3150 struct btrfs_root *extent_root = root->fs_info->extent_root; 3151 unsigned long bi; 3152 struct extent_buffer *leaf; 3153 3154 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3155 if (ret < 0) 3156 goto fail; 3157 BUG_ON(ret); /* Corruption */ 3158 3159 leaf = path->nodes[0]; 3160 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3161 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3162 btrfs_mark_buffer_dirty(leaf); 3163 btrfs_release_path(path); 3164 fail: 3165 if (ret) { 3166 btrfs_abort_transaction(trans, root, ret); 3167 return ret; 3168 } 3169 return 0; 3170 3171 } 3172 3173 static struct btrfs_block_group_cache * 3174 next_block_group(struct btrfs_root *root, 3175 struct btrfs_block_group_cache *cache) 3176 { 3177 struct rb_node *node; 3178 spin_lock(&root->fs_info->block_group_cache_lock); 3179 node = rb_next(&cache->cache_node); 3180 btrfs_put_block_group(cache); 3181 if (node) { 3182 cache = rb_entry(node, struct btrfs_block_group_cache, 3183 cache_node); 3184 btrfs_get_block_group(cache); 3185 } else 3186 cache = NULL; 3187 spin_unlock(&root->fs_info->block_group_cache_lock); 3188 return cache; 3189 } 3190 3191 static int 
cache_save_setup(struct btrfs_block_group_cache *block_group, 3192 struct btrfs_trans_handle *trans, 3193 struct btrfs_path *path) 3194 { 3195 struct btrfs_root *root = block_group->fs_info->tree_root; 3196 struct inode *inode = NULL; 3197 u64 alloc_hint = 0; 3198 int dcs = BTRFS_DC_ERROR; 3199 int num_pages = 0; 3200 int retries = 0; 3201 int ret = 0; 3202 3203 /* 3204 * If this block group is smaller than 100 megs don't bother caching the 3205 * block group. 3206 */ 3207 if (block_group->key.offset < (100 * 1024 * 1024)) { 3208 spin_lock(&block_group->lock); 3209 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3210 spin_unlock(&block_group->lock); 3211 return 0; 3212 } 3213 3214 again: 3215 inode = lookup_free_space_inode(root, block_group, path); 3216 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3217 ret = PTR_ERR(inode); 3218 btrfs_release_path(path); 3219 goto out; 3220 } 3221 3222 if (IS_ERR(inode)) { 3223 BUG_ON(retries); 3224 retries++; 3225 3226 if (block_group->ro) 3227 goto out_free; 3228 3229 ret = create_free_space_inode(root, trans, block_group, path); 3230 if (ret) 3231 goto out_free; 3232 goto again; 3233 } 3234 3235 /* We've already setup this transaction, go ahead and exit */ 3236 if (block_group->cache_generation == trans->transid && 3237 i_size_read(inode)) { 3238 dcs = BTRFS_DC_SETUP; 3239 goto out_put; 3240 } 3241 3242 /* 3243 * We want to set the generation to 0, that way if anything goes wrong 3244 * from here on out we know not to trust this cache when we load up next 3245 * time. 3246 */ 3247 BTRFS_I(inode)->generation = 0; 3248 ret = btrfs_update_inode(trans, root, inode); 3249 WARN_ON(ret); 3250 3251 if (i_size_read(inode) > 0) { 3252 ret = btrfs_check_trunc_cache_free_space(root, 3253 &root->fs_info->global_block_rsv); 3254 if (ret) 3255 goto out_put; 3256 3257 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3258 if (ret) 3259 goto out_put; 3260 } 3261 3262 spin_lock(&block_group->lock); 3263 if (block_group->cached != BTRFS_CACHE_FINISHED || 3264 !btrfs_test_opt(root, SPACE_CACHE) || 3265 block_group->delalloc_bytes) { 3266 /* 3267 * don't bother trying to write stuff out _if_ 3268 * a) we're not cached, 3269 * b) we're with nospace_cache mount option. 3270 */ 3271 dcs = BTRFS_DC_WRITTEN; 3272 spin_unlock(&block_group->lock); 3273 goto out_put; 3274 } 3275 spin_unlock(&block_group->lock); 3276 3277 /* 3278 * Try to preallocate enough space based on how big the block group is. 3279 * Keep in mind this has to include any pinned space which could end up 3280 * taking up quite a bit since it's not folded into the other space 3281 * cache. 
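 * For example, a 1GB block group gives num_pages = 4 below, which then
 * becomes 4 * 16 pages of cache space to preallocate (256KB with 4KB
 * pages).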
3282 */ 3283 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3284 if (!num_pages) 3285 num_pages = 1; 3286 3287 num_pages *= 16; 3288 num_pages *= PAGE_CACHE_SIZE; 3289 3290 ret = btrfs_check_data_free_space(inode, num_pages); 3291 if (ret) 3292 goto out_put; 3293 3294 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3295 num_pages, num_pages, 3296 &alloc_hint); 3297 if (!ret) 3298 dcs = BTRFS_DC_SETUP; 3299 btrfs_free_reserved_data_space(inode, num_pages); 3300 3301 out_put: 3302 iput(inode); 3303 out_free: 3304 btrfs_release_path(path); 3305 out: 3306 spin_lock(&block_group->lock); 3307 if (!ret && dcs == BTRFS_DC_SETUP) 3308 block_group->cache_generation = trans->transid; 3309 block_group->disk_cache_state = dcs; 3310 spin_unlock(&block_group->lock); 3311 3312 return ret; 3313 } 3314 3315 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3316 struct btrfs_root *root) 3317 { 3318 struct btrfs_block_group_cache *cache; 3319 int err = 0; 3320 struct btrfs_path *path; 3321 u64 last = 0; 3322 3323 path = btrfs_alloc_path(); 3324 if (!path) 3325 return -ENOMEM; 3326 3327 again: 3328 while (1) { 3329 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3330 while (cache) { 3331 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3332 break; 3333 cache = next_block_group(root, cache); 3334 } 3335 if (!cache) { 3336 if (last == 0) 3337 break; 3338 last = 0; 3339 continue; 3340 } 3341 err = cache_save_setup(cache, trans, path); 3342 last = cache->key.objectid + cache->key.offset; 3343 btrfs_put_block_group(cache); 3344 } 3345 3346 while (1) { 3347 if (last == 0) { 3348 err = btrfs_run_delayed_refs(trans, root, 3349 (unsigned long)-1); 3350 if (err) /* File system offline */ 3351 goto out; 3352 } 3353 3354 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3355 while (cache) { 3356 if (cache->disk_cache_state == BTRFS_DC_CLEAR) { 3357 btrfs_put_block_group(cache); 3358 goto again; 3359 } 3360 3361 if (cache->dirty) 3362 break; 3363 cache = next_block_group(root, cache); 3364 } 3365 if (!cache) { 3366 if (last == 0) 3367 break; 3368 last = 0; 3369 continue; 3370 } 3371 3372 if (cache->disk_cache_state == BTRFS_DC_SETUP) 3373 cache->disk_cache_state = BTRFS_DC_NEED_WRITE; 3374 cache->dirty = 0; 3375 last = cache->key.objectid + cache->key.offset; 3376 3377 err = write_one_cache_group(trans, root, path, cache); 3378 btrfs_put_block_group(cache); 3379 if (err) /* File system offline */ 3380 goto out; 3381 } 3382 3383 while (1) { 3384 /* 3385 * I don't think this is needed since we're just marking our 3386 * preallocated extent as written, but just in case it can't 3387 * hurt. 3388 */ 3389 if (last == 0) { 3390 err = btrfs_run_delayed_refs(trans, root, 3391 (unsigned long)-1); 3392 if (err) /* File system offline */ 3393 goto out; 3394 } 3395 3396 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3397 while (cache) { 3398 /* 3399 * Really this shouldn't happen, but it could if we 3400 * couldn't write the entire preallocated extent and 3401 * splitting the extent resulted in a new block. 
3402 */ 3403 if (cache->dirty) { 3404 btrfs_put_block_group(cache); 3405 goto again; 3406 } 3407 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3408 break; 3409 cache = next_block_group(root, cache); 3410 } 3411 if (!cache) { 3412 if (last == 0) 3413 break; 3414 last = 0; 3415 continue; 3416 } 3417 3418 err = btrfs_write_out_cache(root, trans, cache, path); 3419 3420 /* 3421 * If we didn't have an error then the cache state is still 3422 * NEED_WRITE, so we can set it to WRITTEN. 3423 */ 3424 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) 3425 cache->disk_cache_state = BTRFS_DC_WRITTEN; 3426 last = cache->key.objectid + cache->key.offset; 3427 btrfs_put_block_group(cache); 3428 } 3429 out: 3430 3431 btrfs_free_path(path); 3432 return err; 3433 } 3434 3435 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3436 { 3437 struct btrfs_block_group_cache *block_group; 3438 int readonly = 0; 3439 3440 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 3441 if (!block_group || block_group->ro) 3442 readonly = 1; 3443 if (block_group) 3444 btrfs_put_block_group(block_group); 3445 return readonly; 3446 } 3447 3448 static const char *alloc_name(u64 flags) 3449 { 3450 switch (flags) { 3451 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3452 return "mixed"; 3453 case BTRFS_BLOCK_GROUP_METADATA: 3454 return "metadata"; 3455 case BTRFS_BLOCK_GROUP_DATA: 3456 return "data"; 3457 case BTRFS_BLOCK_GROUP_SYSTEM: 3458 return "system"; 3459 default: 3460 WARN_ON(1); 3461 return "invalid-combination"; 3462 }; 3463 } 3464 3465 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3466 u64 total_bytes, u64 bytes_used, 3467 struct btrfs_space_info **space_info) 3468 { 3469 struct btrfs_space_info *found; 3470 int i; 3471 int factor; 3472 int ret; 3473 3474 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3475 BTRFS_BLOCK_GROUP_RAID10)) 3476 factor = 2; 3477 else 3478 factor = 1; 3479 3480 found = __find_space_info(info, flags); 3481 if (found) { 3482 spin_lock(&found->lock); 3483 found->total_bytes += total_bytes; 3484 found->disk_total += total_bytes * factor; 3485 found->bytes_used += bytes_used; 3486 found->disk_used += bytes_used * factor; 3487 found->full = 0; 3488 spin_unlock(&found->lock); 3489 *space_info = found; 3490 return 0; 3491 } 3492 found = kzalloc(sizeof(*found), GFP_NOFS); 3493 if (!found) 3494 return -ENOMEM; 3495 3496 ret = percpu_counter_init(&found->total_bytes_pinned, 0); 3497 if (ret) { 3498 kfree(found); 3499 return ret; 3500 } 3501 3502 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3503 INIT_LIST_HEAD(&found->block_groups[i]); 3504 init_rwsem(&found->groups_sem); 3505 spin_lock_init(&found->lock); 3506 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3507 found->total_bytes = total_bytes; 3508 found->disk_total = total_bytes * factor; 3509 found->bytes_used = bytes_used; 3510 found->disk_used = bytes_used * factor; 3511 found->bytes_pinned = 0; 3512 found->bytes_reserved = 0; 3513 found->bytes_readonly = 0; 3514 found->bytes_may_use = 0; 3515 found->full = 0; 3516 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3517 found->chunk_alloc = 0; 3518 found->flush = 0; 3519 init_waitqueue_head(&found->wait); 3520 3521 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3522 info->space_info_kobj, "%s", 3523 alloc_name(found->flags)); 3524 if (ret) { 3525 kfree(found); 3526 return ret; 3527 } 3528 3529 *space_info = found; 3530 list_add_rcu(&found->list, &info->space_info); 3531 if (flags & BTRFS_BLOCK_GROUP_DATA) 
3532 info->data_sinfo = found; 3533 3534 return ret; 3535 } 3536 3537 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3538 { 3539 u64 extra_flags = chunk_to_extended(flags) & 3540 BTRFS_EXTENDED_PROFILE_MASK; 3541 3542 write_seqlock(&fs_info->profiles_lock); 3543 if (flags & BTRFS_BLOCK_GROUP_DATA) 3544 fs_info->avail_data_alloc_bits |= extra_flags; 3545 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3546 fs_info->avail_metadata_alloc_bits |= extra_flags; 3547 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3548 fs_info->avail_system_alloc_bits |= extra_flags; 3549 write_sequnlock(&fs_info->profiles_lock); 3550 } 3551 3552 /* 3553 * returns target flags in extended format or 0 if restripe for this 3554 * chunk_type is not in progress 3555 * 3556 * should be called with either volume_mutex or balance_lock held 3557 */ 3558 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 3559 { 3560 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3561 u64 target = 0; 3562 3563 if (!bctl) 3564 return 0; 3565 3566 if (flags & BTRFS_BLOCK_GROUP_DATA && 3567 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3568 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 3569 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 3570 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3571 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 3572 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 3573 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3574 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 3575 } 3576 3577 return target; 3578 } 3579 3580 /* 3581 * @flags: available profiles in extended format (see ctree.h) 3582 * 3583 * Returns reduced profile in chunk format. If profile changing is in 3584 * progress (either running or paused) picks the target profile (if it's 3585 * already available), otherwise falls back to plain reducing. 3586 */ 3587 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3588 { 3589 /* 3590 * we add in the count of missing devices because we want 3591 * to make sure that any RAID levels on a degraded FS 3592 * continue to be honored. 
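 * e.g. with rw_devices + missing_devices == 2, the masking below keeps
 * DUP/RAID0/RAID1/RAID5 as candidates but drops RAID6 and RAID10.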
3593 */ 3594 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3595 root->fs_info->fs_devices->missing_devices; 3596 u64 target; 3597 u64 tmp; 3598 3599 /* 3600 * see if restripe for this chunk_type is in progress, if so 3601 * try to reduce to the target profile 3602 */ 3603 spin_lock(&root->fs_info->balance_lock); 3604 target = get_restripe_target(root->fs_info, flags); 3605 if (target) { 3606 /* pick target profile only if it's already available */ 3607 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 3608 spin_unlock(&root->fs_info->balance_lock); 3609 return extended_to_chunk(target); 3610 } 3611 } 3612 spin_unlock(&root->fs_info->balance_lock); 3613 3614 /* First, mask out the RAID levels which aren't possible */ 3615 if (num_devices == 1) 3616 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3617 BTRFS_BLOCK_GROUP_RAID5); 3618 if (num_devices < 3) 3619 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3620 if (num_devices < 4) 3621 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3622 3623 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3624 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3625 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3626 flags &= ~tmp; 3627 3628 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3629 tmp = BTRFS_BLOCK_GROUP_RAID6; 3630 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3631 tmp = BTRFS_BLOCK_GROUP_RAID5; 3632 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3633 tmp = BTRFS_BLOCK_GROUP_RAID10; 3634 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3635 tmp = BTRFS_BLOCK_GROUP_RAID1; 3636 else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3637 tmp = BTRFS_BLOCK_GROUP_RAID0; 3638 3639 return extended_to_chunk(flags | tmp); 3640 } 3641 3642 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 3643 { 3644 unsigned seq; 3645 u64 flags; 3646 3647 do { 3648 flags = orig_flags; 3649 seq = read_seqbegin(&root->fs_info->profiles_lock); 3650 3651 if (flags & BTRFS_BLOCK_GROUP_DATA) 3652 flags |= root->fs_info->avail_data_alloc_bits; 3653 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3654 flags |= root->fs_info->avail_system_alloc_bits; 3655 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3656 flags |= root->fs_info->avail_metadata_alloc_bits; 3657 } while (read_seqretry(&root->fs_info->profiles_lock, seq)); 3658 3659 return btrfs_reduce_alloc_profile(root, flags); 3660 } 3661 3662 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3663 { 3664 u64 flags; 3665 u64 ret; 3666 3667 if (data) 3668 flags = BTRFS_BLOCK_GROUP_DATA; 3669 else if (root == root->fs_info->chunk_root) 3670 flags = BTRFS_BLOCK_GROUP_SYSTEM; 3671 else 3672 flags = BTRFS_BLOCK_GROUP_METADATA; 3673 3674 ret = get_alloc_profile(root, flags); 3675 return ret; 3676 } 3677 3678 /* 3679 * This will check the space that the inode allocates from to make sure we have 3680 * enough space for bytes. 
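 * 'bytes' is rounded up to the sectorsize, and on success that rounded
 * amount is added to data_sinfo->bytes_may_use for ENOSPC accounting.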
3681 */ 3682 int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3683 { 3684 struct btrfs_space_info *data_sinfo; 3685 struct btrfs_root *root = BTRFS_I(inode)->root; 3686 struct btrfs_fs_info *fs_info = root->fs_info; 3687 u64 used; 3688 int ret = 0, committed = 0, alloc_chunk = 1; 3689 3690 /* make sure bytes are sectorsize aligned */ 3691 bytes = ALIGN(bytes, root->sectorsize); 3692 3693 if (btrfs_is_free_space_inode(inode)) { 3694 committed = 1; 3695 ASSERT(current->journal_info); 3696 } 3697 3698 data_sinfo = fs_info->data_sinfo; 3699 if (!data_sinfo) 3700 goto alloc; 3701 3702 again: 3703 /* make sure we have enough space to handle the data first */ 3704 spin_lock(&data_sinfo->lock); 3705 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + 3706 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + 3707 data_sinfo->bytes_may_use; 3708 3709 if (used + bytes > data_sinfo->total_bytes) { 3710 struct btrfs_trans_handle *trans; 3711 3712 /* 3713 * if we don't have enough free bytes in this space then we need 3714 * to alloc a new chunk. 3715 */ 3716 if (!data_sinfo->full && alloc_chunk) { 3717 u64 alloc_target; 3718 3719 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3720 spin_unlock(&data_sinfo->lock); 3721 alloc: 3722 alloc_target = btrfs_get_alloc_profile(root, 1); 3723 /* 3724 * It is ugly that we don't call nolock join 3725 * transaction for the free space inode case here. 3726 * But it is safe because we only do the data space 3727 * reservation for the free space cache in the 3728 * transaction context, the common join transaction 3729 * just increase the counter of the current transaction 3730 * handler, doesn't try to acquire the trans_lock of 3731 * the fs. 3732 */ 3733 trans = btrfs_join_transaction(root); 3734 if (IS_ERR(trans)) 3735 return PTR_ERR(trans); 3736 3737 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3738 alloc_target, 3739 CHUNK_ALLOC_NO_FORCE); 3740 btrfs_end_transaction(trans, root); 3741 if (ret < 0) { 3742 if (ret != -ENOSPC) 3743 return ret; 3744 else 3745 goto commit_trans; 3746 } 3747 3748 if (!data_sinfo) 3749 data_sinfo = fs_info->data_sinfo; 3750 3751 goto again; 3752 } 3753 3754 /* 3755 * If we don't have enough pinned space to deal with this 3756 * allocation don't bother committing the transaction. 3757 */ 3758 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3759 bytes) < 0) 3760 committed = 1; 3761 spin_unlock(&data_sinfo->lock); 3762 3763 /* commit the current transaction and try again */ 3764 commit_trans: 3765 if (!committed && 3766 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3767 committed = 1; 3768 3769 trans = btrfs_join_transaction(root); 3770 if (IS_ERR(trans)) 3771 return PTR_ERR(trans); 3772 ret = btrfs_commit_transaction(trans, root); 3773 if (ret) 3774 return ret; 3775 goto again; 3776 } 3777 3778 trace_btrfs_space_reservation(root->fs_info, 3779 "space_info:enospc", 3780 data_sinfo->flags, bytes, 1); 3781 return -ENOSPC; 3782 } 3783 data_sinfo->bytes_may_use += bytes; 3784 trace_btrfs_space_reservation(root->fs_info, "space_info", 3785 data_sinfo->flags, bytes, 1); 3786 spin_unlock(&data_sinfo->lock); 3787 3788 return 0; 3789 } 3790 3791 /* 3792 * Called if we need to clear a data reservation for this inode. 
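 * The byte count is sectorsize-aligned again here so it mirrors exactly
 * what btrfs_check_data_free_space() charged to bytes_may_use.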
3793 */ 3794 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3795 { 3796 struct btrfs_root *root = BTRFS_I(inode)->root; 3797 struct btrfs_space_info *data_sinfo; 3798 3799 /* make sure bytes are sectorsize aligned */ 3800 bytes = ALIGN(bytes, root->sectorsize); 3801 3802 data_sinfo = root->fs_info->data_sinfo; 3803 spin_lock(&data_sinfo->lock); 3804 WARN_ON(data_sinfo->bytes_may_use < bytes); 3805 data_sinfo->bytes_may_use -= bytes; 3806 trace_btrfs_space_reservation(root->fs_info, "space_info", 3807 data_sinfo->flags, bytes, 0); 3808 spin_unlock(&data_sinfo->lock); 3809 } 3810 3811 static void force_metadata_allocation(struct btrfs_fs_info *info) 3812 { 3813 struct list_head *head = &info->space_info; 3814 struct btrfs_space_info *found; 3815 3816 rcu_read_lock(); 3817 list_for_each_entry_rcu(found, head, list) { 3818 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3819 found->force_alloc = CHUNK_ALLOC_FORCE; 3820 } 3821 rcu_read_unlock(); 3822 } 3823 3824 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 3825 { 3826 return (global->size << 1); 3827 } 3828 3829 static int should_alloc_chunk(struct btrfs_root *root, 3830 struct btrfs_space_info *sinfo, int force) 3831 { 3832 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 3833 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3834 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3835 u64 thresh; 3836 3837 if (force == CHUNK_ALLOC_FORCE) 3838 return 1; 3839 3840 /* 3841 * We need to take into account the global rsv because for all intents 3842 * and purposes it's used space. Don't worry about locking the 3843 * global_rsv, it doesn't change except when the transaction commits. 3844 */ 3845 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 3846 num_allocated += calc_global_rsv_need_space(global_rsv); 3847 3848 /* 3849 * in limited mode, we want to have some free space up to 3850 * about 1% of the FS size. 
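 * i.e. thresh below works out to max(64MB, 1% of the total FS bytes).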
3851 */ 3852 if (force == CHUNK_ALLOC_LIMITED) { 3853 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3854 thresh = max_t(u64, 64 * 1024 * 1024, 3855 div_factor_fine(thresh, 1)); 3856 3857 if (num_bytes - num_allocated < thresh) 3858 return 1; 3859 } 3860 3861 if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) 3862 return 0; 3863 return 1; 3864 } 3865 3866 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) 3867 { 3868 u64 num_dev; 3869 3870 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3871 BTRFS_BLOCK_GROUP_RAID0 | 3872 BTRFS_BLOCK_GROUP_RAID5 | 3873 BTRFS_BLOCK_GROUP_RAID6)) 3874 num_dev = root->fs_info->fs_devices->rw_devices; 3875 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3876 num_dev = 2; 3877 else 3878 num_dev = 1; /* DUP or single */ 3879 3880 /* metadata for updaing devices and chunk tree */ 3881 return btrfs_calc_trans_metadata_size(root, num_dev + 1); 3882 } 3883 3884 static void check_system_chunk(struct btrfs_trans_handle *trans, 3885 struct btrfs_root *root, u64 type) 3886 { 3887 struct btrfs_space_info *info; 3888 u64 left; 3889 u64 thresh; 3890 3891 info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3892 spin_lock(&info->lock); 3893 left = info->total_bytes - info->bytes_used - info->bytes_pinned - 3894 info->bytes_reserved - info->bytes_readonly; 3895 spin_unlock(&info->lock); 3896 3897 thresh = get_system_chunk_thresh(root, type); 3898 if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { 3899 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", 3900 left, thresh, type); 3901 dump_space_info(info, 0, 0); 3902 } 3903 3904 if (left < thresh) { 3905 u64 flags; 3906 3907 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); 3908 btrfs_alloc_chunk(trans, root, flags); 3909 } 3910 } 3911 3912 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3913 struct btrfs_root *extent_root, u64 flags, int force) 3914 { 3915 struct btrfs_space_info *space_info; 3916 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3917 int wait_for_alloc = 0; 3918 int ret = 0; 3919 3920 /* Don't re-enter if we're already allocating a chunk */ 3921 if (trans->allocating_chunk) 3922 return -ENOSPC; 3923 3924 space_info = __find_space_info(extent_root->fs_info, flags); 3925 if (!space_info) { 3926 ret = update_space_info(extent_root->fs_info, flags, 3927 0, 0, &space_info); 3928 BUG_ON(ret); /* -ENOMEM */ 3929 } 3930 BUG_ON(!space_info); /* Logic error */ 3931 3932 again: 3933 spin_lock(&space_info->lock); 3934 if (force < space_info->force_alloc) 3935 force = space_info->force_alloc; 3936 if (space_info->full) { 3937 if (should_alloc_chunk(extent_root, space_info, force)) 3938 ret = -ENOSPC; 3939 else 3940 ret = 0; 3941 spin_unlock(&space_info->lock); 3942 return ret; 3943 } 3944 3945 if (!should_alloc_chunk(extent_root, space_info, force)) { 3946 spin_unlock(&space_info->lock); 3947 return 0; 3948 } else if (space_info->chunk_alloc) { 3949 wait_for_alloc = 1; 3950 } else { 3951 space_info->chunk_alloc = 1; 3952 } 3953 3954 spin_unlock(&space_info->lock); 3955 3956 mutex_lock(&fs_info->chunk_mutex); 3957 3958 /* 3959 * The chunk_mutex is held throughout the entirety of a chunk 3960 * allocation, so once we've acquired the chunk_mutex we know that the 3961 * other guy is done and we need to recheck and see if we should 3962 * allocate. 
3963 */ 3964 if (wait_for_alloc) { 3965 mutex_unlock(&fs_info->chunk_mutex); 3966 wait_for_alloc = 0; 3967 goto again; 3968 } 3969 3970 trans->allocating_chunk = true; 3971 3972 /* 3973 * If we have mixed data/metadata chunks we want to make sure we keep 3974 * allocating mixed chunks instead of individual chunks. 3975 */ 3976 if (btrfs_mixed_space_info(space_info)) 3977 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 3978 3979 /* 3980 * if we're doing a data chunk, go ahead and make sure that 3981 * we keep a reasonable number of metadata chunks allocated in the 3982 * FS as well. 3983 */ 3984 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 3985 fs_info->data_chunk_allocations++; 3986 if (!(fs_info->data_chunk_allocations % 3987 fs_info->metadata_ratio)) 3988 force_metadata_allocation(fs_info); 3989 } 3990 3991 /* 3992 * Check if we have enough space in SYSTEM chunk because we may need 3993 * to update devices. 3994 */ 3995 check_system_chunk(trans, extent_root, flags); 3996 3997 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3998 trans->allocating_chunk = false; 3999 4000 spin_lock(&space_info->lock); 4001 if (ret < 0 && ret != -ENOSPC) 4002 goto out; 4003 if (ret) 4004 space_info->full = 1; 4005 else 4006 ret = 1; 4007 4008 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4009 out: 4010 space_info->chunk_alloc = 0; 4011 spin_unlock(&space_info->lock); 4012 mutex_unlock(&fs_info->chunk_mutex); 4013 return ret; 4014 } 4015 4016 static int can_overcommit(struct btrfs_root *root, 4017 struct btrfs_space_info *space_info, u64 bytes, 4018 enum btrfs_reserve_flush_enum flush) 4019 { 4020 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4021 u64 profile = btrfs_get_alloc_profile(root, 0); 4022 u64 space_size; 4023 u64 avail; 4024 u64 used; 4025 4026 used = space_info->bytes_used + space_info->bytes_reserved + 4027 space_info->bytes_pinned + space_info->bytes_readonly; 4028 4029 /* 4030 * We only want to allow over committing if we have lots of actual space 4031 * free, but if we don't have enough space to handle the global reserve 4032 * space then we could end up having a real enospc problem when trying 4033 * to allocate a chunk or some other such important allocation. 4034 */ 4035 spin_lock(&global_rsv->lock); 4036 space_size = calc_global_rsv_need_space(global_rsv); 4037 spin_unlock(&global_rsv->lock); 4038 if (used + space_size >= space_info->total_bytes) 4039 return 0; 4040 4041 used += space_info->bytes_may_use; 4042 4043 spin_lock(&root->fs_info->free_chunk_lock); 4044 avail = root->fs_info->free_chunk_space; 4045 spin_unlock(&root->fs_info->free_chunk_lock); 4046 4047 /* 4048 * If we have dup, raid1 or raid10 then only half of the free 4049 * space is actually useable. For raid56, the space info used 4050 * doesn't include the parity drive, so we don't have to 4051 * change the math 4052 */ 4053 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4054 BTRFS_BLOCK_GROUP_RAID1 | 4055 BTRFS_BLOCK_GROUP_RAID10)) 4056 avail >>= 1; 4057 4058 /* 4059 * If we aren't flushing all things, let us overcommit up to 4060 * 1/2th of the space. If we can flush, don't let us overcommit 4061 * too much, let it overcommit up to 1/8 of the space. 
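 * Rough worked example (illustrative numbers): with 8 GiB of
 * unallocated device space and a RAID1 profile, avail was halved to
 * 4 GiB above; a BTRFS_RESERVE_FLUSH_ALL caller is then allowed 512 MiB
 * of overcommit headroom (>> 3) while everyone else gets 2 GiB (>> 1).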
4062 */ 4063 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4064 avail >>= 3; 4065 else 4066 avail >>= 1; 4067 4068 if (used + bytes < space_info->total_bytes + avail) 4069 return 1; 4070 return 0; 4071 } 4072 4073 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, 4074 unsigned long nr_pages, int nr_items) 4075 { 4076 struct super_block *sb = root->fs_info->sb; 4077 4078 if (down_read_trylock(&sb->s_umount)) { 4079 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4080 up_read(&sb->s_umount); 4081 } else { 4082 /* 4083 * We needn't worry the filesystem going from r/w to r/o though 4084 * we don't acquire ->s_umount mutex, because the filesystem 4085 * should guarantee the delalloc inodes list be empty after 4086 * the filesystem is readonly(all dirty pages are written to 4087 * the disk). 4088 */ 4089 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4090 if (!current->journal_info) 4091 btrfs_wait_ordered_roots(root->fs_info, nr_items); 4092 } 4093 } 4094 4095 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) 4096 { 4097 u64 bytes; 4098 int nr; 4099 4100 bytes = btrfs_calc_trans_metadata_size(root, 1); 4101 nr = (int)div64_u64(to_reclaim, bytes); 4102 if (!nr) 4103 nr = 1; 4104 return nr; 4105 } 4106 4107 #define EXTENT_SIZE_PER_ITEM (256 * 1024) 4108 4109 /* 4110 * shrink metadata reservation for delalloc 4111 */ 4112 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4113 bool wait_ordered) 4114 { 4115 struct btrfs_block_rsv *block_rsv; 4116 struct btrfs_space_info *space_info; 4117 struct btrfs_trans_handle *trans; 4118 u64 delalloc_bytes; 4119 u64 max_reclaim; 4120 long time_left; 4121 unsigned long nr_pages; 4122 int loops; 4123 int items; 4124 enum btrfs_reserve_flush_enum flush; 4125 4126 /* Calc the number of the pages we need flush for space reservation */ 4127 items = calc_reclaim_items_nr(root, to_reclaim); 4128 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4129 4130 trans = (struct btrfs_trans_handle *)current->journal_info; 4131 block_rsv = &root->fs_info->delalloc_block_rsv; 4132 space_info = block_rsv->space_info; 4133 4134 delalloc_bytes = percpu_counter_sum_positive( 4135 &root->fs_info->delalloc_bytes); 4136 if (delalloc_bytes == 0) { 4137 if (trans) 4138 return; 4139 if (wait_ordered) 4140 btrfs_wait_ordered_roots(root->fs_info, items); 4141 return; 4142 } 4143 4144 loops = 0; 4145 while (delalloc_bytes && loops < 3) { 4146 max_reclaim = min(delalloc_bytes, to_reclaim); 4147 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 4148 btrfs_writeback_inodes_sb_nr(root, nr_pages, items); 4149 /* 4150 * We need to wait for the async pages to actually start before 4151 * we do anything. 
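 * Concretely: we snapshot async_delalloc_pages below and wait for it to
 * drop back to (snapshot - nr_pages), or to zero if the snapshot was
 * smaller than that, so the writeback we just kicked off has really
 * been submitted before we re-check whether the reservation now fits.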
4152 */ 4153 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages); 4154 if (!max_reclaim) 4155 goto skip_async; 4156 4157 if (max_reclaim <= nr_pages) 4158 max_reclaim = 0; 4159 else 4160 max_reclaim -= nr_pages; 4161 4162 wait_event(root->fs_info->async_submit_wait, 4163 atomic_read(&root->fs_info->async_delalloc_pages) <= 4164 (int)max_reclaim); 4165 skip_async: 4166 if (!trans) 4167 flush = BTRFS_RESERVE_FLUSH_ALL; 4168 else 4169 flush = BTRFS_RESERVE_NO_FLUSH; 4170 spin_lock(&space_info->lock); 4171 if (can_overcommit(root, space_info, orig, flush)) { 4172 spin_unlock(&space_info->lock); 4173 break; 4174 } 4175 spin_unlock(&space_info->lock); 4176 4177 loops++; 4178 if (wait_ordered && !trans) { 4179 btrfs_wait_ordered_roots(root->fs_info, items); 4180 } else { 4181 time_left = schedule_timeout_killable(1); 4182 if (time_left) 4183 break; 4184 } 4185 delalloc_bytes = percpu_counter_sum_positive( 4186 &root->fs_info->delalloc_bytes); 4187 } 4188 } 4189 4190 /** 4191 * may_commit_transaction - possibly commit the transaction if it's OK to do so 4192 * @root - the root we're allocating for * @space_info - the space_info we are reserving from 4193 * @bytes - the number of bytes we want to reserve 4194 * @force - force the commit 4195 * 4196 * This will check to make sure that committing the transaction will actually 4197 * get us somewhere and then commit the transaction if it does. Otherwise it 4198 * will return -ENOSPC. 4199 */ 4200 static int may_commit_transaction(struct btrfs_root *root, 4201 struct btrfs_space_info *space_info, 4202 u64 bytes, int force) 4203 { 4204 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; 4205 struct btrfs_trans_handle *trans; 4206 4207 trans = (struct btrfs_trans_handle *)current->journal_info; 4208 if (trans) 4209 return -EAGAIN; 4210 4211 if (force) 4212 goto commit; 4213 4214 /* See if there is enough pinned space to make this reservation */ 4215 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4216 bytes) >= 0) 4217 goto commit; 4218 4219 /* 4220 * See if there is some space in the delayed insertion reservation for 4221 * this reservation.
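 * The delayed rsv can only help if it feeds the same space_info we are
 * reserving from; if it does not, a commit would not release anything
 * useful here and we give up with -ENOSPC instead of forcing one.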
4222 */ 4223 if (space_info != delayed_rsv->space_info) 4224 return -ENOSPC; 4225 4226 spin_lock(&delayed_rsv->lock); 4227 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4228 bytes - delayed_rsv->size) >= 0) { 4229 spin_unlock(&delayed_rsv->lock); 4230 return -ENOSPC; 4231 } 4232 spin_unlock(&delayed_rsv->lock); 4233 4234 commit: 4235 trans = btrfs_join_transaction(root); 4236 if (IS_ERR(trans)) 4237 return -ENOSPC; 4238 4239 return btrfs_commit_transaction(trans, root); 4240 } 4241 4242 enum flush_state { 4243 FLUSH_DELAYED_ITEMS_NR = 1, 4244 FLUSH_DELAYED_ITEMS = 2, 4245 FLUSH_DELALLOC = 3, 4246 FLUSH_DELALLOC_WAIT = 4, 4247 ALLOC_CHUNK = 5, 4248 COMMIT_TRANS = 6, 4249 }; 4250 4251 static int flush_space(struct btrfs_root *root, 4252 struct btrfs_space_info *space_info, u64 num_bytes, 4253 u64 orig_bytes, int state) 4254 { 4255 struct btrfs_trans_handle *trans; 4256 int nr; 4257 int ret = 0; 4258 4259 switch (state) { 4260 case FLUSH_DELAYED_ITEMS_NR: 4261 case FLUSH_DELAYED_ITEMS: 4262 if (state == FLUSH_DELAYED_ITEMS_NR) 4263 nr = calc_reclaim_items_nr(root, num_bytes) * 2; 4264 else 4265 nr = -1; 4266 4267 trans = btrfs_join_transaction(root); 4268 if (IS_ERR(trans)) { 4269 ret = PTR_ERR(trans); 4270 break; 4271 } 4272 ret = btrfs_run_delayed_items_nr(trans, root, nr); 4273 btrfs_end_transaction(trans, root); 4274 break; 4275 case FLUSH_DELALLOC: 4276 case FLUSH_DELALLOC_WAIT: 4277 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4278 state == FLUSH_DELALLOC_WAIT); 4279 break; 4280 case ALLOC_CHUNK: 4281 trans = btrfs_join_transaction(root); 4282 if (IS_ERR(trans)) { 4283 ret = PTR_ERR(trans); 4284 break; 4285 } 4286 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 4287 btrfs_get_alloc_profile(root, 0), 4288 CHUNK_ALLOC_NO_FORCE); 4289 btrfs_end_transaction(trans, root); 4290 if (ret == -ENOSPC) 4291 ret = 0; 4292 break; 4293 case COMMIT_TRANS: 4294 ret = may_commit_transaction(root, space_info, orig_bytes, 0); 4295 break; 4296 default: 4297 ret = -ENOSPC; 4298 break; 4299 } 4300 4301 return ret; 4302 } 4303 4304 static inline u64 4305 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4306 struct btrfs_space_info *space_info) 4307 { 4308 u64 used; 4309 u64 expected; 4310 u64 to_reclaim; 4311 4312 to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, 4313 16 * 1024 * 1024); 4314 spin_lock(&space_info->lock); 4315 if (can_overcommit(root, space_info, to_reclaim, 4316 BTRFS_RESERVE_FLUSH_ALL)) { 4317 to_reclaim = 0; 4318 goto out; 4319 } 4320 4321 used = space_info->bytes_used + space_info->bytes_reserved + 4322 space_info->bytes_pinned + space_info->bytes_readonly + 4323 space_info->bytes_may_use; 4324 if (can_overcommit(root, space_info, 1024 * 1024, 4325 BTRFS_RESERVE_FLUSH_ALL)) 4326 expected = div_factor_fine(space_info->total_bytes, 95); 4327 else 4328 expected = div_factor_fine(space_info->total_bytes, 90); 4329 4330 if (used > expected) 4331 to_reclaim = used - expected; 4332 else 4333 to_reclaim = 0; 4334 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4335 space_info->bytes_reserved); 4336 out: 4337 spin_unlock(&space_info->lock); 4338 4339 return to_reclaim; 4340 } 4341 4342 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4343 struct btrfs_fs_info *fs_info, u64 used) 4344 { 4345 return (used >= div_factor_fine(space_info->total_bytes, 98) && 4346 !btrfs_fs_closing(fs_info) && 4347 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4348 } 4349 4350 static int btrfs_need_do_async_reclaim(struct 
btrfs_space_info *space_info, 4351 struct btrfs_fs_info *fs_info) 4352 { 4353 u64 used; 4354 4355 spin_lock(&space_info->lock); 4356 used = space_info->bytes_used + space_info->bytes_reserved + 4357 space_info->bytes_pinned + space_info->bytes_readonly + 4358 space_info->bytes_may_use; 4359 if (need_do_async_reclaim(space_info, fs_info, used)) { 4360 spin_unlock(&space_info->lock); 4361 return 1; 4362 } 4363 spin_unlock(&space_info->lock); 4364 4365 return 0; 4366 } 4367 4368 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4369 { 4370 struct btrfs_fs_info *fs_info; 4371 struct btrfs_space_info *space_info; 4372 u64 to_reclaim; 4373 int flush_state; 4374 4375 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4376 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4377 4378 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4379 space_info); 4380 if (!to_reclaim) 4381 return; 4382 4383 flush_state = FLUSH_DELAYED_ITEMS_NR; 4384 do { 4385 flush_space(fs_info->fs_root, space_info, to_reclaim, 4386 to_reclaim, flush_state); 4387 flush_state++; 4388 if (!btrfs_need_do_async_reclaim(space_info, fs_info)) 4389 return; 4390 } while (flush_state <= COMMIT_TRANS); 4391 4392 if (btrfs_need_do_async_reclaim(space_info, fs_info)) 4393 queue_work(system_unbound_wq, work); 4394 } 4395 4396 void btrfs_init_async_reclaim_work(struct work_struct *work) 4397 { 4398 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 4399 } 4400 4401 /** 4402 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 4403 * @root - the root we're allocating for 4404 * @block_rsv - the block_rsv we're allocating for 4405 * @orig_bytes - the number of bytes we want 4406 * @flush - whether or not we can flush to make our reservation 4407 * 4408 * This will reserve orig_bytes number of bytes from the space info associated 4409 * with the block_rsv. If there is not enough space it will make an attempt to 4410 * flush out space to make room. It will do this by flushing delalloc if 4411 * possible or committing the transaction. If flush is BTRFS_RESERVE_NO_FLUSH 4412 * then no attempts to regain reservations will be made and this will fail if 4413 * there is not enough space already. 4414 */ 4415 static int reserve_metadata_bytes(struct btrfs_root *root, 4416 struct btrfs_block_rsv *block_rsv, 4417 u64 orig_bytes, 4418 enum btrfs_reserve_flush_enum flush) 4419 { 4420 struct btrfs_space_info *space_info = block_rsv->space_info; 4421 u64 used; 4422 u64 num_bytes = orig_bytes; 4423 int flush_state = FLUSH_DELAYED_ITEMS_NR; 4424 int ret = 0; 4425 bool flushing = false; 4426 4427 again: 4428 ret = 0; 4429 spin_lock(&space_info->lock); 4430 /* 4431 * We only want to wait if somebody other than us is flushing and we 4432 * are actually allowed to flush all things. 4433 */ 4434 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && 4435 space_info->flush) { 4436 spin_unlock(&space_info->lock); 4437 /* 4438 * If we have a trans handle we can't wait because the flusher 4439 * may have to commit the transaction, which would mean we would 4440 * deadlock since we are waiting for the flusher to finish, but 4441 * hold the current transaction open.
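 * (Concrete example of that deadlock: task A holds transaction handle T
 * open and waits here, while the flusher decides the only way to make
 * room is to commit T, which cannot finish until A releases its handle.
 * Hence the -EAGAIN below instead of waiting.)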
4442 */ 4443 if (current->journal_info) 4444 return -EAGAIN; 4445 ret = wait_event_killable(space_info->wait, !space_info->flush); 4446 /* Must have been killed, return */ 4447 if (ret) 4448 return -EINTR; 4449 4450 spin_lock(&space_info->lock); 4451 } 4452 4453 ret = -ENOSPC; 4454 used = space_info->bytes_used + space_info->bytes_reserved + 4455 space_info->bytes_pinned + space_info->bytes_readonly + 4456 space_info->bytes_may_use; 4457 4458 /* 4459 * The idea here is that we've not already over-reserved the block group 4460 * then we can go ahead and save our reservation first and then start 4461 * flushing if we need to. Otherwise if we've already overcommitted 4462 * lets start flushing stuff first and then come back and try to make 4463 * our reservation. 4464 */ 4465 if (used <= space_info->total_bytes) { 4466 if (used + orig_bytes <= space_info->total_bytes) { 4467 space_info->bytes_may_use += orig_bytes; 4468 trace_btrfs_space_reservation(root->fs_info, 4469 "space_info", space_info->flags, orig_bytes, 1); 4470 ret = 0; 4471 } else { 4472 /* 4473 * Ok set num_bytes to orig_bytes since we aren't 4474 * overocmmitted, this way we only try and reclaim what 4475 * we need. 4476 */ 4477 num_bytes = orig_bytes; 4478 } 4479 } else { 4480 /* 4481 * Ok we're over committed, set num_bytes to the overcommitted 4482 * amount plus the amount of bytes that we need for this 4483 * reservation. 4484 */ 4485 num_bytes = used - space_info->total_bytes + 4486 (orig_bytes * 2); 4487 } 4488 4489 if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { 4490 space_info->bytes_may_use += orig_bytes; 4491 trace_btrfs_space_reservation(root->fs_info, "space_info", 4492 space_info->flags, orig_bytes, 4493 1); 4494 ret = 0; 4495 } 4496 4497 /* 4498 * Couldn't make our reservation, save our place so while we're trying 4499 * to reclaim space we can actually use it instead of somebody else 4500 * stealing it from us. 4501 * 4502 * We make the other tasks wait for the flush only when we can flush 4503 * all things. 4504 */ 4505 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 4506 flushing = true; 4507 space_info->flush = 1; 4508 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 4509 used += orig_bytes; 4510 if (need_do_async_reclaim(space_info, root->fs_info, used) && 4511 !work_busy(&root->fs_info->async_reclaim_work)) 4512 queue_work(system_unbound_wq, 4513 &root->fs_info->async_reclaim_work); 4514 } 4515 spin_unlock(&space_info->lock); 4516 4517 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 4518 goto out; 4519 4520 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4521 flush_state); 4522 flush_state++; 4523 4524 /* 4525 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock 4526 * would happen. So skip delalloc flush. 
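 * In effect the FLUSH_LIMIT state machine only runs
 * FLUSH_DELAYED_ITEMS_NR, FLUSH_DELAYED_ITEMS and then ALLOC_CHUNK
 * before giving up, while FLUSH_ALL callers also get the two delalloc
 * states and a final COMMIT_TRANS attempt.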
4527 */ 4528 if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4529 (flush_state == FLUSH_DELALLOC || 4530 flush_state == FLUSH_DELALLOC_WAIT)) 4531 flush_state = ALLOC_CHUNK; 4532 4533 if (!ret) 4534 goto again; 4535 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && 4536 flush_state < COMMIT_TRANS) 4537 goto again; 4538 else if (flush == BTRFS_RESERVE_FLUSH_ALL && 4539 flush_state <= COMMIT_TRANS) 4540 goto again; 4541 4542 out: 4543 if (ret == -ENOSPC && 4544 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 4545 struct btrfs_block_rsv *global_rsv = 4546 &root->fs_info->global_block_rsv; 4547 4548 if (block_rsv != global_rsv && 4549 !block_rsv_use_bytes(global_rsv, orig_bytes)) 4550 ret = 0; 4551 } 4552 if (ret == -ENOSPC) 4553 trace_btrfs_space_reservation(root->fs_info, 4554 "space_info:enospc", 4555 space_info->flags, orig_bytes, 1); 4556 if (flushing) { 4557 spin_lock(&space_info->lock); 4558 space_info->flush = 0; 4559 wake_up_all(&space_info->wait); 4560 spin_unlock(&space_info->lock); 4561 } 4562 return ret; 4563 } 4564 4565 static struct btrfs_block_rsv *get_block_rsv( 4566 const struct btrfs_trans_handle *trans, 4567 const struct btrfs_root *root) 4568 { 4569 struct btrfs_block_rsv *block_rsv = NULL; 4570 4571 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 4572 block_rsv = trans->block_rsv; 4573 4574 if (root == root->fs_info->csum_root && trans->adding_csums) 4575 block_rsv = trans->block_rsv; 4576 4577 if (root == root->fs_info->uuid_root) 4578 block_rsv = trans->block_rsv; 4579 4580 if (!block_rsv) 4581 block_rsv = root->block_rsv; 4582 4583 if (!block_rsv) 4584 block_rsv = &root->fs_info->empty_block_rsv; 4585 4586 return block_rsv; 4587 } 4588 4589 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 4590 u64 num_bytes) 4591 { 4592 int ret = -ENOSPC; 4593 spin_lock(&block_rsv->lock); 4594 if (block_rsv->reserved >= num_bytes) { 4595 block_rsv->reserved -= num_bytes; 4596 if (block_rsv->reserved < block_rsv->size) 4597 block_rsv->full = 0; 4598 ret = 0; 4599 } 4600 spin_unlock(&block_rsv->lock); 4601 return ret; 4602 } 4603 4604 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 4605 u64 num_bytes, int update_size) 4606 { 4607 spin_lock(&block_rsv->lock); 4608 block_rsv->reserved += num_bytes; 4609 if (update_size) 4610 block_rsv->size += num_bytes; 4611 else if (block_rsv->reserved >= block_rsv->size) 4612 block_rsv->full = 1; 4613 spin_unlock(&block_rsv->lock); 4614 } 4615 4616 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 4617 struct btrfs_block_rsv *dest, u64 num_bytes, 4618 int min_factor) 4619 { 4620 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4621 u64 min_bytes; 4622 4623 if (global_rsv->space_info != dest->space_info) 4624 return -ENOSPC; 4625 4626 spin_lock(&global_rsv->lock); 4627 min_bytes = div_factor(global_rsv->size, min_factor); 4628 if (global_rsv->reserved < min_bytes + num_bytes) { 4629 spin_unlock(&global_rsv->lock); 4630 return -ENOSPC; 4631 } 4632 global_rsv->reserved -= num_bytes; 4633 if (global_rsv->reserved < global_rsv->size) 4634 global_rsv->full = 0; 4635 spin_unlock(&global_rsv->lock); 4636 4637 block_rsv_add_bytes(dest, num_bytes, 1); 4638 return 0; 4639 } 4640 4641 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4642 struct btrfs_block_rsv *block_rsv, 4643 struct btrfs_block_rsv *dest, u64 num_bytes) 4644 { 4645 struct btrfs_space_info *space_info = block_rsv->space_info; 4646 4647 spin_lock(&block_rsv->lock); 4648 if (num_bytes == (u64)-1) 4649 num_bytes 
= block_rsv->size; 4650 block_rsv->size -= num_bytes; 4651 if (block_rsv->reserved >= block_rsv->size) { 4652 num_bytes = block_rsv->reserved - block_rsv->size; 4653 block_rsv->reserved = block_rsv->size; 4654 block_rsv->full = 1; 4655 } else { 4656 num_bytes = 0; 4657 } 4658 spin_unlock(&block_rsv->lock); 4659 4660 if (num_bytes > 0) { 4661 if (dest) { 4662 spin_lock(&dest->lock); 4663 if (!dest->full) { 4664 u64 bytes_to_add; 4665 4666 bytes_to_add = dest->size - dest->reserved; 4667 bytes_to_add = min(num_bytes, bytes_to_add); 4668 dest->reserved += bytes_to_add; 4669 if (dest->reserved >= dest->size) 4670 dest->full = 1; 4671 num_bytes -= bytes_to_add; 4672 } 4673 spin_unlock(&dest->lock); 4674 } 4675 if (num_bytes) { 4676 spin_lock(&space_info->lock); 4677 space_info->bytes_may_use -= num_bytes; 4678 trace_btrfs_space_reservation(fs_info, "space_info", 4679 space_info->flags, num_bytes, 0); 4680 spin_unlock(&space_info->lock); 4681 } 4682 } 4683 } 4684 4685 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, 4686 struct btrfs_block_rsv *dst, u64 num_bytes) 4687 { 4688 int ret; 4689 4690 ret = block_rsv_use_bytes(src, num_bytes); 4691 if (ret) 4692 return ret; 4693 4694 block_rsv_add_bytes(dst, num_bytes, 1); 4695 return 0; 4696 } 4697 4698 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 4699 { 4700 memset(rsv, 0, sizeof(*rsv)); 4701 spin_lock_init(&rsv->lock); 4702 rsv->type = type; 4703 } 4704 4705 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, 4706 unsigned short type) 4707 { 4708 struct btrfs_block_rsv *block_rsv; 4709 struct btrfs_fs_info *fs_info = root->fs_info; 4710 4711 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 4712 if (!block_rsv) 4713 return NULL; 4714 4715 btrfs_init_block_rsv(block_rsv, type); 4716 block_rsv->space_info = __find_space_info(fs_info, 4717 BTRFS_BLOCK_GROUP_METADATA); 4718 return block_rsv; 4719 } 4720 4721 void btrfs_free_block_rsv(struct btrfs_root *root, 4722 struct btrfs_block_rsv *rsv) 4723 { 4724 if (!rsv) 4725 return; 4726 btrfs_block_rsv_release(root, rsv, (u64)-1); 4727 kfree(rsv); 4728 } 4729 4730 int btrfs_block_rsv_add(struct btrfs_root *root, 4731 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4732 enum btrfs_reserve_flush_enum flush) 4733 { 4734 int ret; 4735 4736 if (num_bytes == 0) 4737 return 0; 4738 4739 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 4740 if (!ret) { 4741 block_rsv_add_bytes(block_rsv, num_bytes, 1); 4742 return 0; 4743 } 4744 4745 return ret; 4746 } 4747 4748 int btrfs_block_rsv_check(struct btrfs_root *root, 4749 struct btrfs_block_rsv *block_rsv, int min_factor) 4750 { 4751 u64 num_bytes = 0; 4752 int ret = -ENOSPC; 4753 4754 if (!block_rsv) 4755 return 0; 4756 4757 spin_lock(&block_rsv->lock); 4758 num_bytes = div_factor(block_rsv->size, min_factor); 4759 if (block_rsv->reserved >= num_bytes) 4760 ret = 0; 4761 spin_unlock(&block_rsv->lock); 4762 4763 return ret; 4764 } 4765 4766 int btrfs_block_rsv_refill(struct btrfs_root *root, 4767 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 4768 enum btrfs_reserve_flush_enum flush) 4769 { 4770 u64 num_bytes = 0; 4771 int ret = -ENOSPC; 4772 4773 if (!block_rsv) 4774 return 0; 4775 4776 spin_lock(&block_rsv->lock); 4777 num_bytes = min_reserved; 4778 if (block_rsv->reserved >= num_bytes) 4779 ret = 0; 4780 else 4781 num_bytes -= block_rsv->reserved; 4782 spin_unlock(&block_rsv->lock); 4783 4784 if (!ret) 4785 return 0; 4786 4787 ret = reserve_metadata_bytes(root, block_rsv, 
num_bytes, flush); 4788 if (!ret) { 4789 block_rsv_add_bytes(block_rsv, num_bytes, 0); 4790 return 0; 4791 } 4792 4793 return ret; 4794 } 4795 4796 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4797 struct btrfs_block_rsv *dst_rsv, 4798 u64 num_bytes) 4799 { 4800 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4801 } 4802 4803 void btrfs_block_rsv_release(struct btrfs_root *root, 4804 struct btrfs_block_rsv *block_rsv, 4805 u64 num_bytes) 4806 { 4807 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4808 if (global_rsv == block_rsv || 4809 block_rsv->space_info != global_rsv->space_info) 4810 global_rsv = NULL; 4811 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, 4812 num_bytes); 4813 } 4814 4815 /* 4816 * helper to calculate size of global block reservation. 4817 * the desired value is sum of space used by extent tree, 4818 * checksum tree and root tree 4819 */ 4820 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) 4821 { 4822 struct btrfs_space_info *sinfo; 4823 u64 num_bytes; 4824 u64 meta_used; 4825 u64 data_used; 4826 int csum_size = btrfs_super_csum_size(fs_info->super_copy); 4827 4828 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 4829 spin_lock(&sinfo->lock); 4830 data_used = sinfo->bytes_used; 4831 spin_unlock(&sinfo->lock); 4832 4833 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4834 spin_lock(&sinfo->lock); 4835 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) 4836 data_used = 0; 4837 meta_used = sinfo->bytes_used; 4838 spin_unlock(&sinfo->lock); 4839 4840 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 4841 csum_size * 2; 4842 num_bytes += div64_u64(data_used + meta_used, 50); 4843 4844 if (num_bytes * 3 > meta_used) 4845 num_bytes = div64_u64(meta_used, 3); 4846 4847 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); 4848 } 4849 4850 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 4851 { 4852 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 4853 struct btrfs_space_info *sinfo = block_rsv->space_info; 4854 u64 num_bytes; 4855 4856 num_bytes = calc_global_metadata_size(fs_info); 4857 4858 spin_lock(&sinfo->lock); 4859 spin_lock(&block_rsv->lock); 4860 4861 block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); 4862 4863 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 4864 sinfo->bytes_reserved + sinfo->bytes_readonly + 4865 sinfo->bytes_may_use; 4866 4867 if (sinfo->total_bytes > num_bytes) { 4868 num_bytes = sinfo->total_bytes - num_bytes; 4869 block_rsv->reserved += num_bytes; 4870 sinfo->bytes_may_use += num_bytes; 4871 trace_btrfs_space_reservation(fs_info, "space_info", 4872 sinfo->flags, num_bytes, 1); 4873 } 4874 4875 if (block_rsv->reserved >= block_rsv->size) { 4876 num_bytes = block_rsv->reserved - block_rsv->size; 4877 sinfo->bytes_may_use -= num_bytes; 4878 trace_btrfs_space_reservation(fs_info, "space_info", 4879 sinfo->flags, num_bytes, 0); 4880 block_rsv->reserved = block_rsv->size; 4881 block_rsv->full = 1; 4882 } 4883 4884 spin_unlock(&block_rsv->lock); 4885 spin_unlock(&sinfo->lock); 4886 } 4887 4888 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 4889 { 4890 struct btrfs_space_info *space_info; 4891 4892 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4893 fs_info->chunk_block_rsv.space_info = space_info; 4894 4895 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4896 fs_info->global_block_rsv.space_info = space_info; 4897 
fs_info->delalloc_block_rsv.space_info = space_info; 4898 fs_info->trans_block_rsv.space_info = space_info; 4899 fs_info->empty_block_rsv.space_info = space_info; 4900 fs_info->delayed_block_rsv.space_info = space_info; 4901 4902 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4903 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4904 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 4905 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4906 if (fs_info->quota_root) 4907 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 4908 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4909 4910 update_global_block_rsv(fs_info); 4911 } 4912 4913 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4914 { 4915 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 4916 (u64)-1); 4917 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4918 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4919 WARN_ON(fs_info->trans_block_rsv.size > 0); 4920 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4921 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4922 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4923 WARN_ON(fs_info->delayed_block_rsv.size > 0); 4924 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 4925 } 4926 4927 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4928 struct btrfs_root *root) 4929 { 4930 if (!trans->block_rsv) 4931 return; 4932 4933 if (!trans->bytes_reserved) 4934 return; 4935 4936 trace_btrfs_space_reservation(root->fs_info, "transaction", 4937 trans->transid, trans->bytes_reserved, 0); 4938 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4939 trans->bytes_reserved = 0; 4940 } 4941 4942 /* Can only return 0 or -ENOSPC */ 4943 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 4944 struct inode *inode) 4945 { 4946 struct btrfs_root *root = BTRFS_I(inode)->root; 4947 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4948 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 4949 4950 /* 4951 * We need to hold space in order to delete our orphan item once we've 4952 * added it, so this takes the reservation so we can release it later 4953 * when we are truly done with the orphan item. 4954 */ 4955 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4956 trace_btrfs_space_reservation(root->fs_info, "orphan", 4957 btrfs_ino(inode), num_bytes, 1); 4958 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4959 } 4960 4961 void btrfs_orphan_release_metadata(struct inode *inode) 4962 { 4963 struct btrfs_root *root = BTRFS_I(inode)->root; 4964 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4965 trace_btrfs_space_reservation(root->fs_info, "orphan", 4966 btrfs_ino(inode), num_bytes, 0); 4967 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4968 } 4969 4970 /* 4971 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 4972 * root: the root of the parent directory 4973 * rsv: block reservation 4974 * items: the number of items that we need do reservation 4975 * qgroup_reserved: used to return the reserved size in qgroup 4976 * 4977 * This function is used to reserve the space for snapshot/subvolume 4978 * creation and deletion. Those operations are different with the 4979 * common file/directory operations, they change two fs/file trees 4980 * and root tree, the number of items that the qgroup reserves is 4981 * different with the free space reservation. 
So we cannot use 4982 * the space reservation mechanism in start_transaction(). 4983 */ 4984 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 4985 struct btrfs_block_rsv *rsv, 4986 int items, 4987 u64 *qgroup_reserved, 4988 bool use_global_rsv) 4989 { 4990 u64 num_bytes; 4991 int ret; 4992 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 4993 4994 if (root->fs_info->quota_enabled) { 4995 /* One for parent inode, two for dir entries */ 4996 num_bytes = 3 * root->leafsize; 4997 ret = btrfs_qgroup_reserve(root, num_bytes); 4998 if (ret) 4999 return ret; 5000 } else { 5001 num_bytes = 0; 5002 } 5003 5004 *qgroup_reserved = num_bytes; 5005 5006 num_bytes = btrfs_calc_trans_metadata_size(root, items); 5007 rsv->space_info = __find_space_info(root->fs_info, 5008 BTRFS_BLOCK_GROUP_METADATA); 5009 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5010 BTRFS_RESERVE_FLUSH_ALL); 5011 5012 if (ret == -ENOSPC && use_global_rsv) 5013 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); 5014 5015 if (ret) { 5016 if (*qgroup_reserved) 5017 btrfs_qgroup_free(root, *qgroup_reserved); 5018 } 5019 5020 return ret; 5021 } 5022 5023 void btrfs_subvolume_release_metadata(struct btrfs_root *root, 5024 struct btrfs_block_rsv *rsv, 5025 u64 qgroup_reserved) 5026 { 5027 btrfs_block_rsv_release(root, rsv, (u64)-1); 5028 if (qgroup_reserved) 5029 btrfs_qgroup_free(root, qgroup_reserved); 5030 } 5031 5032 /** 5033 * drop_outstanding_extent - drop an outstanding extent 5034 * @inode: the inode we're dropping the extent for 5035 * 5036 * This is called when we are freeing up an outstanding extent, either 5037 * after an error or after an extent is written. This will return the number of 5038 * reserved extents that need to be freed. This must be called with 5039 * BTRFS_I(inode)->lock held. 5040 */ 5041 static unsigned drop_outstanding_extent(struct inode *inode) 5042 { 5043 unsigned drop_inode_space = 0; 5044 unsigned dropped_extents = 0; 5045 5046 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 5047 BTRFS_I(inode)->outstanding_extents--; 5048 5049 if (BTRFS_I(inode)->outstanding_extents == 0 && 5050 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5051 &BTRFS_I(inode)->runtime_flags)) 5052 drop_inode_space = 1; 5053 5054 /* 5055 * If we have the same number of outstanding extents as reserved 5056 * extents, or more, we need to leave the reserved extents count alone. 5057 */ 5058 if (BTRFS_I(inode)->outstanding_extents >= 5059 BTRFS_I(inode)->reserved_extents) 5060 return drop_inode_space; 5061 5062 dropped_extents = BTRFS_I(inode)->reserved_extents - 5063 BTRFS_I(inode)->outstanding_extents; 5064 BTRFS_I(inode)->reserved_extents -= dropped_extents; 5065 return dropped_extents + drop_inode_space; 5066 } 5067 5068 /** 5069 * calc_csum_metadata_size - return the amount of metadata space that must be 5070 * reserved/free'd for the given bytes. 5071 * @inode: the inode we're manipulating 5072 * @num_bytes: the number of bytes in question 5073 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5074 * 5075 * This adjusts the number of csum_bytes in the inode and then returns the 5076 * correct amount of metadata that must either be reserved or freed. We 5077 * calculate how many checksums we can fit into one leaf and then divide the 5078 * number of bytes that will need to be checksummed by this value to figure out 5079 * how many checksums will be required.
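 * As a sketch of the math (assuming 4 KiB sectors and that all of the
 * checksums fit in a single csum leaf): growing csum_bytes from 0 to
 * 1 MiB on a reserve means 256 checksums, i.e. going from 0 to 1 csum
 * leaves, so this returns btrfs_calc_trans_metadata_size(root, 1).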
If we are adding bytes then the number 5080 * may go up and we will return the number of additional bytes that must be 5081 * reserved. If it is going down we will return the number of bytes that must 5082 * be freed. 5083 * 5084 * This must be called with BTRFS_I(inode)->lock held. 5085 */ 5086 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5087 int reserve) 5088 { 5089 struct btrfs_root *root = BTRFS_I(inode)->root; 5090 u64 csum_size; 5091 int num_csums_per_leaf; 5092 int num_csums; 5093 int old_csums; 5094 5095 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5096 BTRFS_I(inode)->csum_bytes == 0) 5097 return 0; 5098 5099 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5100 if (reserve) 5101 BTRFS_I(inode)->csum_bytes += num_bytes; 5102 else 5103 BTRFS_I(inode)->csum_bytes -= num_bytes; 5104 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 5105 num_csums_per_leaf = (int)div64_u64(csum_size, 5106 sizeof(struct btrfs_csum_item) + 5107 sizeof(struct btrfs_disk_key)); 5108 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5109 num_csums = num_csums + num_csums_per_leaf - 1; 5110 num_csums = num_csums / num_csums_per_leaf; 5111 5112 old_csums = old_csums + num_csums_per_leaf - 1; 5113 old_csums = old_csums / num_csums_per_leaf; 5114 5115 /* No change, no need to reserve more */ 5116 if (old_csums == num_csums) 5117 return 0; 5118 5119 if (reserve) 5120 return btrfs_calc_trans_metadata_size(root, 5121 num_csums - old_csums); 5122 5123 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); 5124 } 5125 5126 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5127 { 5128 struct btrfs_root *root = BTRFS_I(inode)->root; 5129 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 5130 u64 to_reserve = 0; 5131 u64 csum_bytes; 5132 unsigned nr_extents = 0; 5133 int extra_reserve = 0; 5134 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5135 int ret = 0; 5136 bool delalloc_lock = true; 5137 u64 to_free = 0; 5138 unsigned dropped; 5139 5140 /* If we are a free space inode we need to not flush since we will be in 5141 * the middle of a transaction commit. We also don't need the delalloc 5142 * mutex since we won't race with anybody. We need this mostly to make 5143 * lockdep shut its filthy mouth. 5144 */ 5145 if (btrfs_is_free_space_inode(inode)) { 5146 flush = BTRFS_RESERVE_NO_FLUSH; 5147 delalloc_lock = false; 5148 } 5149 5150 if (flush != BTRFS_RESERVE_NO_FLUSH && 5151 btrfs_transaction_in_commit(root->fs_info)) 5152 schedule_timeout(1); 5153 5154 if (delalloc_lock) 5155 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5156 5157 num_bytes = ALIGN(num_bytes, root->sectorsize); 5158 5159 spin_lock(&BTRFS_I(inode)->lock); 5160 BTRFS_I(inode)->outstanding_extents++; 5161 5162 if (BTRFS_I(inode)->outstanding_extents > 5163 BTRFS_I(inode)->reserved_extents) 5164 nr_extents = BTRFS_I(inode)->outstanding_extents - 5165 BTRFS_I(inode)->reserved_extents; 5166 5167 /* 5168 * Add an item to reserve for updating the inode when we complete the 5169 * delalloc io. 
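 * That inode-update item is reserved only once at a time: the
 * BTRFS_INODE_DELALLOC_META_RESERVED runtime flag records that it is
 * already part of the reservation, so nr_extents is bumped for it below
 * only when the flag is not yet set.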
5170 */ 5171 if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5172 &BTRFS_I(inode)->runtime_flags)) { 5173 nr_extents++; 5174 extra_reserve = 1; 5175 } 5176 5177 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 5178 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5179 csum_bytes = BTRFS_I(inode)->csum_bytes; 5180 spin_unlock(&BTRFS_I(inode)->lock); 5181 5182 if (root->fs_info->quota_enabled) { 5183 ret = btrfs_qgroup_reserve(root, num_bytes + 5184 nr_extents * root->leafsize); 5185 if (ret) 5186 goto out_fail; 5187 } 5188 5189 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5190 if (unlikely(ret)) { 5191 if (root->fs_info->quota_enabled) 5192 btrfs_qgroup_free(root, num_bytes + 5193 nr_extents * root->leafsize); 5194 goto out_fail; 5195 } 5196 5197 spin_lock(&BTRFS_I(inode)->lock); 5198 if (extra_reserve) { 5199 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5200 &BTRFS_I(inode)->runtime_flags); 5201 nr_extents--; 5202 } 5203 BTRFS_I(inode)->reserved_extents += nr_extents; 5204 spin_unlock(&BTRFS_I(inode)->lock); 5205 5206 if (delalloc_lock) 5207 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5208 5209 if (to_reserve) 5210 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5211 btrfs_ino(inode), to_reserve, 1); 5212 block_rsv_add_bytes(block_rsv, to_reserve, 1); 5213 5214 return 0; 5215 5216 out_fail: 5217 spin_lock(&BTRFS_I(inode)->lock); 5218 dropped = drop_outstanding_extent(inode); 5219 /* 5220 * If the inodes csum_bytes is the same as the original 5221 * csum_bytes then we know we haven't raced with any free()ers 5222 * so we can just reduce our inodes csum bytes and carry on. 5223 */ 5224 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 5225 calc_csum_metadata_size(inode, num_bytes, 0); 5226 } else { 5227 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 5228 u64 bytes; 5229 5230 /* 5231 * This is tricky, but first we need to figure out how much we 5232 * free'd from any free-ers that occured during this 5233 * reservation, so we reset ->csum_bytes to the csum_bytes 5234 * before we dropped our lock, and then call the free for the 5235 * number of bytes that were freed while we were trying our 5236 * reservation. 5237 */ 5238 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 5239 BTRFS_I(inode)->csum_bytes = csum_bytes; 5240 to_free = calc_csum_metadata_size(inode, bytes, 0); 5241 5242 5243 /* 5244 * Now we need to see how much we would have freed had we not 5245 * been making this reservation and our ->csum_bytes were not 5246 * artificially inflated. 5247 */ 5248 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 5249 bytes = csum_bytes - orig_csum_bytes; 5250 bytes = calc_csum_metadata_size(inode, bytes, 0); 5251 5252 /* 5253 * Now reset ->csum_bytes to what it should be. If bytes is 5254 * more than to_free then we would have free'd more space had we 5255 * not had an artificially high ->csum_bytes, so we need to free 5256 * the remainder. If bytes is the same or less then we don't 5257 * need to do anything, the other free-ers did the correct 5258 * thing. 
5259 */ 5260 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 5261 if (bytes > to_free) 5262 to_free = bytes - to_free; 5263 else 5264 to_free = 0; 5265 } 5266 spin_unlock(&BTRFS_I(inode)->lock); 5267 if (dropped) 5268 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5269 5270 if (to_free) { 5271 btrfs_block_rsv_release(root, block_rsv, to_free); 5272 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5273 btrfs_ino(inode), to_free, 0); 5274 } 5275 if (delalloc_lock) 5276 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 5277 return ret; 5278 } 5279 5280 /** 5281 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 5282 * @inode: the inode to release the reservation for 5283 * @num_bytes: the number of bytes we're releasing 5284 * 5285 * This will release the metadata reservation for an inode. This can be called 5286 * once we complete IO for a given set of bytes to release their metadata 5287 * reservations. 5288 */ 5289 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 5290 { 5291 struct btrfs_root *root = BTRFS_I(inode)->root; 5292 u64 to_free = 0; 5293 unsigned dropped; 5294 5295 num_bytes = ALIGN(num_bytes, root->sectorsize); 5296 spin_lock(&BTRFS_I(inode)->lock); 5297 dropped = drop_outstanding_extent(inode); 5298 5299 if (num_bytes) 5300 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5301 spin_unlock(&BTRFS_I(inode)->lock); 5302 if (dropped > 0) 5303 to_free += btrfs_calc_trans_metadata_size(root, dropped); 5304 5305 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5306 btrfs_ino(inode), to_free, 0); 5307 if (root->fs_info->quota_enabled) { 5308 btrfs_qgroup_free(root, num_bytes + 5309 dropped * root->leafsize); 5310 } 5311 5312 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5313 to_free); 5314 } 5315 5316 /** 5317 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 5318 * @inode: inode we're writing to 5319 * @num_bytes: the number of bytes we want to allocate 5320 * 5321 * This will do the following things 5322 * 5323 * o reserve space in the data space info for num_bytes 5324 * o reserve space in the metadata space info based on number of outstanding 5325 * extents and how much csums will be needed 5326 * o add to the inodes ->delalloc_bytes 5327 * o add it to the fs_info's delalloc inodes list. 5328 * 5329 * This will return 0 for success and -ENOSPC if there is no space left. 5330 */ 5331 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 5332 { 5333 int ret; 5334 5335 ret = btrfs_check_data_free_space(inode, num_bytes); 5336 if (ret) 5337 return ret; 5338 5339 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); 5340 if (ret) { 5341 btrfs_free_reserved_data_space(inode, num_bytes); 5342 return ret; 5343 } 5344 5345 return 0; 5346 } 5347 5348 /** 5349 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5350 * @inode: inode we're releasing space for 5351 * @num_bytes: the number of bytes we want to free up 5352 * 5353 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5354 * called in the case that we don't need the metadata AND data reservations 5355 * anymore. So if there is an error or we insert an inline extent. 5356 * 5357 * This function will release the metadata space that was not used and will 5358 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5359 * list if there are no delalloc bytes left. 
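 * A minimal illustrative pairing, e.g. in the error path of a write
 * (do_my_write() is a hypothetical stand-in for the caller's real work;
 * only the two btrfs_delalloc_* calls are the real API):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, num_bytes);
 *	if (ret)
 *		return ret;
 *	ret = do_my_write(inode, num_bytes);
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, num_bytes);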
5360 */ 5361 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5362 { 5363 btrfs_delalloc_release_metadata(inode, num_bytes); 5364 btrfs_free_reserved_data_space(inode, num_bytes); 5365 } 5366 5367 static int update_block_group(struct btrfs_root *root, 5368 u64 bytenr, u64 num_bytes, int alloc) 5369 { 5370 struct btrfs_block_group_cache *cache = NULL; 5371 struct btrfs_fs_info *info = root->fs_info; 5372 u64 total = num_bytes; 5373 u64 old_val; 5374 u64 byte_in_group; 5375 int factor; 5376 5377 /* block accounting for super block */ 5378 spin_lock(&info->delalloc_root_lock); 5379 old_val = btrfs_super_bytes_used(info->super_copy); 5380 if (alloc) 5381 old_val += num_bytes; 5382 else 5383 old_val -= num_bytes; 5384 btrfs_set_super_bytes_used(info->super_copy, old_val); 5385 spin_unlock(&info->delalloc_root_lock); 5386 5387 while (total) { 5388 cache = btrfs_lookup_block_group(info, bytenr); 5389 if (!cache) 5390 return -ENOENT; 5391 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 5392 BTRFS_BLOCK_GROUP_RAID1 | 5393 BTRFS_BLOCK_GROUP_RAID10)) 5394 factor = 2; 5395 else 5396 factor = 1; 5397 /* 5398 * If this block group has free space cache written out, we 5399 * need to make sure to load it if we are removing space. This 5400 * is because we need the unpinning stage to actually add the 5401 * space back to the block group, otherwise we will leak space. 5402 */ 5403 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5404 cache_block_group(cache, 1); 5405 5406 byte_in_group = bytenr - cache->key.objectid; 5407 WARN_ON(byte_in_group > cache->key.offset); 5408 5409 spin_lock(&cache->space_info->lock); 5410 spin_lock(&cache->lock); 5411 5412 if (btrfs_test_opt(root, SPACE_CACHE) && 5413 cache->disk_cache_state < BTRFS_DC_CLEAR) 5414 cache->disk_cache_state = BTRFS_DC_CLEAR; 5415 5416 cache->dirty = 1; 5417 old_val = btrfs_block_group_used(&cache->item); 5418 num_bytes = min(total, cache->key.offset - byte_in_group); 5419 if (alloc) { 5420 old_val += num_bytes; 5421 btrfs_set_block_group_used(&cache->item, old_val); 5422 cache->reserved -= num_bytes; 5423 cache->space_info->bytes_reserved -= num_bytes; 5424 cache->space_info->bytes_used += num_bytes; 5425 cache->space_info->disk_used += num_bytes * factor; 5426 spin_unlock(&cache->lock); 5427 spin_unlock(&cache->space_info->lock); 5428 } else { 5429 old_val -= num_bytes; 5430 btrfs_set_block_group_used(&cache->item, old_val); 5431 cache->pinned += num_bytes; 5432 cache->space_info->bytes_pinned += num_bytes; 5433 cache->space_info->bytes_used -= num_bytes; 5434 cache->space_info->disk_used -= num_bytes * factor; 5435 spin_unlock(&cache->lock); 5436 spin_unlock(&cache->space_info->lock); 5437 5438 set_extent_dirty(info->pinned_extents, 5439 bytenr, bytenr + num_bytes - 1, 5440 GFP_NOFS | __GFP_NOFAIL); 5441 } 5442 btrfs_put_block_group(cache); 5443 total -= num_bytes; 5444 bytenr += num_bytes; 5445 } 5446 return 0; 5447 } 5448 5449 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) 5450 { 5451 struct btrfs_block_group_cache *cache; 5452 u64 bytenr; 5453 5454 spin_lock(&root->fs_info->block_group_cache_lock); 5455 bytenr = root->fs_info->first_logical_byte; 5456 spin_unlock(&root->fs_info->block_group_cache_lock); 5457 5458 if (bytenr < (u64)-1) 5459 return bytenr; 5460 5461 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 5462 if (!cache) 5463 return 0; 5464 5465 bytenr = cache->key.objectid; 5466 btrfs_put_block_group(cache); 5467 5468 return bytenr; 5469 } 5470 5471 static int 
pin_down_extent(struct btrfs_root *root, 5472 struct btrfs_block_group_cache *cache, 5473 u64 bytenr, u64 num_bytes, int reserved) 5474 { 5475 spin_lock(&cache->space_info->lock); 5476 spin_lock(&cache->lock); 5477 cache->pinned += num_bytes; 5478 cache->space_info->bytes_pinned += num_bytes; 5479 if (reserved) { 5480 cache->reserved -= num_bytes; 5481 cache->space_info->bytes_reserved -= num_bytes; 5482 } 5483 spin_unlock(&cache->lock); 5484 spin_unlock(&cache->space_info->lock); 5485 5486 set_extent_dirty(root->fs_info->pinned_extents, bytenr, 5487 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 5488 if (reserved) 5489 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); 5490 return 0; 5491 } 5492 5493 /* 5494 * this function must be called within transaction 5495 */ 5496 int btrfs_pin_extent(struct btrfs_root *root, 5497 u64 bytenr, u64 num_bytes, int reserved) 5498 { 5499 struct btrfs_block_group_cache *cache; 5500 5501 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5502 BUG_ON(!cache); /* Logic error */ 5503 5504 pin_down_extent(root, cache, bytenr, num_bytes, reserved); 5505 5506 btrfs_put_block_group(cache); 5507 return 0; 5508 } 5509 5510 /* 5511 * this function must be called within transaction 5512 */ 5513 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, 5514 u64 bytenr, u64 num_bytes) 5515 { 5516 struct btrfs_block_group_cache *cache; 5517 int ret; 5518 5519 cache = btrfs_lookup_block_group(root->fs_info, bytenr); 5520 if (!cache) 5521 return -EINVAL; 5522 5523 /* 5524 * pull in the free space cache (if any) so that our pin 5525 * removes the free space from the cache. We have load_only set 5526 * to one because the slow code to read in the free extents does check 5527 * the pinned extents. 5528 */ 5529 cache_block_group(cache, 1); 5530 5531 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5532 5533 /* remove us from the free space cache (if we're there at all) */ 5534 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 5535 btrfs_put_block_group(cache); 5536 return ret; 5537 } 5538 5539 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) 5540 { 5541 int ret; 5542 struct btrfs_block_group_cache *block_group; 5543 struct btrfs_caching_control *caching_ctl; 5544 5545 block_group = btrfs_lookup_block_group(root->fs_info, start); 5546 if (!block_group) 5547 return -EINVAL; 5548 5549 cache_block_group(block_group, 0); 5550 caching_ctl = get_caching_control(block_group); 5551 5552 if (!caching_ctl) { 5553 /* Logic error */ 5554 BUG_ON(!block_group_cache_done(block_group)); 5555 ret = btrfs_remove_free_space(block_group, start, num_bytes); 5556 } else { 5557 mutex_lock(&caching_ctl->mutex); 5558 5559 if (start >= caching_ctl->progress) { 5560 ret = add_excluded_extent(root, start, num_bytes); 5561 } else if (start + num_bytes <= caching_ctl->progress) { 5562 ret = btrfs_remove_free_space(block_group, 5563 start, num_bytes); 5564 } else { 5565 num_bytes = caching_ctl->progress - start; 5566 ret = btrfs_remove_free_space(block_group, 5567 start, num_bytes); 5568 if (ret) 5569 goto out_lock; 5570 5571 num_bytes = (start + num_bytes) - 5572 caching_ctl->progress; 5573 start = caching_ctl->progress; 5574 ret = add_excluded_extent(root, start, num_bytes); 5575 } 5576 out_lock: 5577 mutex_unlock(&caching_ctl->mutex); 5578 put_caching_control(caching_ctl); 5579 } 5580 btrfs_put_block_group(block_group); 5581 return ret; 5582 } 5583 5584 int btrfs_exclude_logged_extents(struct btrfs_root *log, 5585 struct extent_buffer *eb) 
5586 { 5587 struct btrfs_file_extent_item *item; 5588 struct btrfs_key key; 5589 int found_type; 5590 int i; 5591 5592 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) 5593 return 0; 5594 5595 for (i = 0; i < btrfs_header_nritems(eb); i++) { 5596 btrfs_item_key_to_cpu(eb, &key, i); 5597 if (key.type != BTRFS_EXTENT_DATA_KEY) 5598 continue; 5599 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 5600 found_type = btrfs_file_extent_type(eb, item); 5601 if (found_type == BTRFS_FILE_EXTENT_INLINE) 5602 continue; 5603 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 5604 continue; 5605 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 5606 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 5607 __exclude_logged_extent(log, key.objectid, key.offset); 5608 } 5609 5610 return 0; 5611 } 5612 5613 /** 5614 * btrfs_update_reserved_bytes - update the block_group and space info counters 5615 * @cache: The cache we are manipulating 5616 * @num_bytes: The number of bytes in question 5617 * @reserve: One of the reservation enums 5618 * @delalloc: The blocks are allocated for the delalloc write 5619 * 5620 * This is called by the allocator when it reserves space, or by somebody who is 5621 * freeing space that was never actually used on disk. For example if you 5622 * reserve some space for a new leaf in transaction A and before transaction A 5623 * commits you free that leaf, you call this with reserve set to 0 in order to 5624 * clear the reservation. 5625 * 5626 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper 5627 * ENOSPC accounting. For data we handle the reservation through clearing the 5628 * delalloc bits in the io_tree. We have to do this since we could end up 5629 * allocating less disk space for the amount of data we have reserved in the 5630 * case of compression. 5631 * 5632 * If this is a reservation and the block group has become read only we cannot 5633 * make the reservation and return -EAGAIN, otherwise this function always 5634 * succeeds. 
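 * For example: the allocator reserves an extent with RESERVE_ALLOC,
 * moving the bytes from bytes_may_use into bytes_reserved; if that
 * extent ends up unused before the transaction commits, the same byte
 * count is handed back with RESERVE_FREE.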
5635 */ 5636 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 5637 u64 num_bytes, int reserve, int delalloc) 5638 { 5639 struct btrfs_space_info *space_info = cache->space_info; 5640 int ret = 0; 5641 5642 spin_lock(&space_info->lock); 5643 spin_lock(&cache->lock); 5644 if (reserve != RESERVE_FREE) { 5645 if (cache->ro) { 5646 ret = -EAGAIN; 5647 } else { 5648 cache->reserved += num_bytes; 5649 space_info->bytes_reserved += num_bytes; 5650 if (reserve == RESERVE_ALLOC) { 5651 trace_btrfs_space_reservation(cache->fs_info, 5652 "space_info", space_info->flags, 5653 num_bytes, 0); 5654 space_info->bytes_may_use -= num_bytes; 5655 } 5656 5657 if (delalloc) 5658 cache->delalloc_bytes += num_bytes; 5659 } 5660 } else { 5661 if (cache->ro) 5662 space_info->bytes_readonly += num_bytes; 5663 cache->reserved -= num_bytes; 5664 space_info->bytes_reserved -= num_bytes; 5665 5666 if (delalloc) 5667 cache->delalloc_bytes -= num_bytes; 5668 } 5669 spin_unlock(&cache->lock); 5670 spin_unlock(&space_info->lock); 5671 return ret; 5672 } 5673 5674 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 5675 struct btrfs_root *root) 5676 { 5677 struct btrfs_fs_info *fs_info = root->fs_info; 5678 struct btrfs_caching_control *next; 5679 struct btrfs_caching_control *caching_ctl; 5680 struct btrfs_block_group_cache *cache; 5681 5682 down_write(&fs_info->commit_root_sem); 5683 5684 list_for_each_entry_safe(caching_ctl, next, 5685 &fs_info->caching_block_groups, list) { 5686 cache = caching_ctl->block_group; 5687 if (block_group_cache_done(cache)) { 5688 cache->last_byte_to_unpin = (u64)-1; 5689 list_del_init(&caching_ctl->list); 5690 put_caching_control(caching_ctl); 5691 } else { 5692 cache->last_byte_to_unpin = caching_ctl->progress; 5693 } 5694 } 5695 5696 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5697 fs_info->pinned_extents = &fs_info->freed_extents[1]; 5698 else 5699 fs_info->pinned_extents = &fs_info->freed_extents[0]; 5700 5701 up_write(&fs_info->commit_root_sem); 5702 5703 update_global_block_rsv(fs_info); 5704 } 5705 5706 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 5707 { 5708 struct btrfs_fs_info *fs_info = root->fs_info; 5709 struct btrfs_block_group_cache *cache = NULL; 5710 struct btrfs_space_info *space_info; 5711 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5712 u64 len; 5713 bool readonly; 5714 5715 while (start <= end) { 5716 readonly = false; 5717 if (!cache || 5718 start >= cache->key.objectid + cache->key.offset) { 5719 if (cache) 5720 btrfs_put_block_group(cache); 5721 cache = btrfs_lookup_block_group(fs_info, start); 5722 BUG_ON(!cache); /* Logic error */ 5723 } 5724 5725 len = cache->key.objectid + cache->key.offset - start; 5726 len = min(len, end + 1 - start); 5727 5728 if (start < cache->last_byte_to_unpin) { 5729 len = min(len, cache->last_byte_to_unpin - start); 5730 btrfs_add_free_space(cache, start, len); 5731 } 5732 5733 start += len; 5734 space_info = cache->space_info; 5735 5736 spin_lock(&space_info->lock); 5737 spin_lock(&cache->lock); 5738 cache->pinned -= len; 5739 space_info->bytes_pinned -= len; 5740 percpu_counter_add(&space_info->total_bytes_pinned, -len); 5741 if (cache->ro) { 5742 space_info->bytes_readonly += len; 5743 readonly = true; 5744 } 5745 spin_unlock(&cache->lock); 5746 if (!readonly && global_rsv->space_info == space_info) { 5747 spin_lock(&global_rsv->lock); 5748 if (!global_rsv->full) { 5749 len = min(len, global_rsv->size - 5750 
global_rsv->reserved); 5751 global_rsv->reserved += len; 5752 space_info->bytes_may_use += len; 5753 if (global_rsv->reserved >= global_rsv->size) 5754 global_rsv->full = 1; 5755 } 5756 spin_unlock(&global_rsv->lock); 5757 } 5758 spin_unlock(&space_info->lock); 5759 } 5760 5761 if (cache) 5762 btrfs_put_block_group(cache); 5763 return 0; 5764 } 5765 5766 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 5767 struct btrfs_root *root) 5768 { 5769 struct btrfs_fs_info *fs_info = root->fs_info; 5770 struct extent_io_tree *unpin; 5771 u64 start; 5772 u64 end; 5773 int ret; 5774 5775 if (trans->aborted) 5776 return 0; 5777 5778 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 5779 unpin = &fs_info->freed_extents[1]; 5780 else 5781 unpin = &fs_info->freed_extents[0]; 5782 5783 while (1) { 5784 ret = find_first_extent_bit(unpin, 0, &start, &end, 5785 EXTENT_DIRTY, NULL); 5786 if (ret) 5787 break; 5788 5789 if (btrfs_test_opt(root, DISCARD)) 5790 ret = btrfs_discard_extent(root, start, 5791 end + 1 - start, NULL); 5792 5793 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5794 unpin_extent_range(root, start, end); 5795 cond_resched(); 5796 } 5797 5798 return 0; 5799 } 5800 5801 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 5802 u64 owner, u64 root_objectid) 5803 { 5804 struct btrfs_space_info *space_info; 5805 u64 flags; 5806 5807 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5808 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 5809 flags = BTRFS_BLOCK_GROUP_SYSTEM; 5810 else 5811 flags = BTRFS_BLOCK_GROUP_METADATA; 5812 } else { 5813 flags = BTRFS_BLOCK_GROUP_DATA; 5814 } 5815 5816 space_info = __find_space_info(fs_info, flags); 5817 BUG_ON(!space_info); /* Logic bug */ 5818 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 5819 } 5820 5821 5822 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5823 struct btrfs_root *root, 5824 u64 bytenr, u64 num_bytes, u64 parent, 5825 u64 root_objectid, u64 owner_objectid, 5826 u64 owner_offset, int refs_to_drop, 5827 struct btrfs_delayed_extent_op *extent_op, 5828 int no_quota) 5829 { 5830 struct btrfs_key key; 5831 struct btrfs_path *path; 5832 struct btrfs_fs_info *info = root->fs_info; 5833 struct btrfs_root *extent_root = info->extent_root; 5834 struct extent_buffer *leaf; 5835 struct btrfs_extent_item *ei; 5836 struct btrfs_extent_inline_ref *iref; 5837 int ret; 5838 int is_data; 5839 int extent_slot = 0; 5840 int found_extent = 0; 5841 int num_to_del = 1; 5842 u32 item_size; 5843 u64 refs; 5844 int last_ref = 0; 5845 enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; 5846 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 5847 SKINNY_METADATA); 5848 5849 if (!info->quota_enabled || !is_fstree(root_objectid)) 5850 no_quota = 1; 5851 5852 path = btrfs_alloc_path(); 5853 if (!path) 5854 return -ENOMEM; 5855 5856 path->reada = 1; 5857 path->leave_spinning = 1; 5858 5859 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 5860 BUG_ON(!is_data && refs_to_drop != 1); 5861 5862 if (is_data) 5863 skinny_metadata = 0; 5864 5865 ret = lookup_extent_backref(trans, extent_root, path, &iref, 5866 bytenr, num_bytes, parent, 5867 root_objectid, owner_objectid, 5868 owner_offset); 5869 if (ret == 0) { 5870 extent_slot = path->slots[0]; 5871 while (extent_slot >= 0) { 5872 btrfs_item_key_to_cpu(path->nodes[0], &key, 5873 extent_slot); 5874 if (key.objectid != bytenr) 5875 break; 5876 if (key.type == BTRFS_EXTENT_ITEM_KEY && 5877 key.offset == num_bytes) { 5878 
found_extent = 1; 5879 break; 5880 } 5881 if (key.type == BTRFS_METADATA_ITEM_KEY && 5882 key.offset == owner_objectid) { 5883 found_extent = 1; 5884 break; 5885 } 5886 if (path->slots[0] - extent_slot > 5) 5887 break; 5888 extent_slot--; 5889 } 5890 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5891 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 5892 if (found_extent && item_size < sizeof(*ei)) 5893 found_extent = 0; 5894 #endif 5895 if (!found_extent) { 5896 BUG_ON(iref); 5897 ret = remove_extent_backref(trans, extent_root, path, 5898 NULL, refs_to_drop, 5899 is_data, &last_ref); 5900 if (ret) { 5901 btrfs_abort_transaction(trans, extent_root, ret); 5902 goto out; 5903 } 5904 btrfs_release_path(path); 5905 path->leave_spinning = 1; 5906 5907 key.objectid = bytenr; 5908 key.type = BTRFS_EXTENT_ITEM_KEY; 5909 key.offset = num_bytes; 5910 5911 if (!is_data && skinny_metadata) { 5912 key.type = BTRFS_METADATA_ITEM_KEY; 5913 key.offset = owner_objectid; 5914 } 5915 5916 ret = btrfs_search_slot(trans, extent_root, 5917 &key, path, -1, 1); 5918 if (ret > 0 && skinny_metadata && path->slots[0]) { 5919 /* 5920 * Couldn't find our skinny metadata item, 5921 * see if we have ye olde extent item. 5922 */ 5923 path->slots[0]--; 5924 btrfs_item_key_to_cpu(path->nodes[0], &key, 5925 path->slots[0]); 5926 if (key.objectid == bytenr && 5927 key.type == BTRFS_EXTENT_ITEM_KEY && 5928 key.offset == num_bytes) 5929 ret = 0; 5930 } 5931 5932 if (ret > 0 && skinny_metadata) { 5933 skinny_metadata = false; 5934 key.objectid = bytenr; 5935 key.type = BTRFS_EXTENT_ITEM_KEY; 5936 key.offset = num_bytes; 5937 btrfs_release_path(path); 5938 ret = btrfs_search_slot(trans, extent_root, 5939 &key, path, -1, 1); 5940 } 5941 5942 if (ret) { 5943 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5944 ret, bytenr); 5945 if (ret > 0) 5946 btrfs_print_leaf(extent_root, 5947 path->nodes[0]); 5948 } 5949 if (ret < 0) { 5950 btrfs_abort_transaction(trans, extent_root, ret); 5951 goto out; 5952 } 5953 extent_slot = path->slots[0]; 5954 } 5955 } else if (WARN_ON(ret == -ENOENT)) { 5956 btrfs_print_leaf(extent_root, path->nodes[0]); 5957 btrfs_err(info, 5958 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5959 bytenr, parent, root_objectid, owner_objectid, 5960 owner_offset); 5961 btrfs_abort_transaction(trans, extent_root, ret); 5962 goto out; 5963 } else { 5964 btrfs_abort_transaction(trans, extent_root, ret); 5965 goto out; 5966 } 5967 5968 leaf = path->nodes[0]; 5969 item_size = btrfs_item_size_nr(leaf, extent_slot); 5970 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 5971 if (item_size < sizeof(*ei)) { 5972 BUG_ON(found_extent || extent_slot != path->slots[0]); 5973 ret = convert_extent_item_v0(trans, extent_root, path, 5974 owner_objectid, 0); 5975 if (ret < 0) { 5976 btrfs_abort_transaction(trans, extent_root, ret); 5977 goto out; 5978 } 5979 5980 btrfs_release_path(path); 5981 path->leave_spinning = 1; 5982 5983 key.objectid = bytenr; 5984 key.type = BTRFS_EXTENT_ITEM_KEY; 5985 key.offset = num_bytes; 5986 5987 ret = btrfs_search_slot(trans, extent_root, &key, path, 5988 -1, 1); 5989 if (ret) { 5990 btrfs_err(info, "umm, got %d back from search, was looking for %llu", 5991 ret, bytenr); 5992 btrfs_print_leaf(extent_root, path->nodes[0]); 5993 } 5994 if (ret < 0) { 5995 btrfs_abort_transaction(trans, extent_root, ret); 5996 goto out; 5997 } 5998 5999 extent_slot = path->slots[0]; 6000 leaf = path->nodes[0]; 6001 item_size = btrfs_item_size_nr(leaf, extent_slot); 6002 } 6003 #endif 
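	/*
	 * At this point extent_slot points at the extent item itself.  Below
	 * we read its reference count and subtract refs_to_drop: if refs
	 * remain, the item is updated in place (and the backref we found is
	 * removed); if the count hits zero, the extent item (plus the keyed
	 * backref sitting next to it, if any) is deleted, data csums are
	 * dropped and the block group counters are updated.
	 */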
6004 BUG_ON(item_size < sizeof(*ei)); 6005 ei = btrfs_item_ptr(leaf, extent_slot, 6006 struct btrfs_extent_item); 6007 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6008 key.type == BTRFS_EXTENT_ITEM_KEY) { 6009 struct btrfs_tree_block_info *bi; 6010 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6011 bi = (struct btrfs_tree_block_info *)(ei + 1); 6012 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6013 } 6014 6015 refs = btrfs_extent_refs(leaf, ei); 6016 if (refs < refs_to_drop) { 6017 btrfs_err(info, "trying to drop %d refs but we only have %Lu " 6018 "for bytenr %Lu", refs_to_drop, refs, bytenr); 6019 ret = -EINVAL; 6020 btrfs_abort_transaction(trans, extent_root, ret); 6021 goto out; 6022 } 6023 refs -= refs_to_drop; 6024 6025 if (refs > 0) { 6026 type = BTRFS_QGROUP_OPER_SUB_SHARED; 6027 if (extent_op) 6028 __run_delayed_extent_op(extent_op, leaf, ei); 6029 /* 6030 * In the case of inline back ref, reference count will 6031 * be updated by remove_extent_backref 6032 */ 6033 if (iref) { 6034 BUG_ON(!found_extent); 6035 } else { 6036 btrfs_set_extent_refs(leaf, ei, refs); 6037 btrfs_mark_buffer_dirty(leaf); 6038 } 6039 if (found_extent) { 6040 ret = remove_extent_backref(trans, extent_root, path, 6041 iref, refs_to_drop, 6042 is_data, &last_ref); 6043 if (ret) { 6044 btrfs_abort_transaction(trans, extent_root, ret); 6045 goto out; 6046 } 6047 } 6048 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, 6049 root_objectid); 6050 } else { 6051 if (found_extent) { 6052 BUG_ON(is_data && refs_to_drop != 6053 extent_data_ref_count(root, path, iref)); 6054 if (iref) { 6055 BUG_ON(path->slots[0] != extent_slot); 6056 } else { 6057 BUG_ON(path->slots[0] != extent_slot + 1); 6058 path->slots[0] = extent_slot; 6059 num_to_del = 2; 6060 } 6061 } 6062 6063 last_ref = 1; 6064 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 6065 num_to_del); 6066 if (ret) { 6067 btrfs_abort_transaction(trans, extent_root, ret); 6068 goto out; 6069 } 6070 btrfs_release_path(path); 6071 6072 if (is_data) { 6073 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 6074 if (ret) { 6075 btrfs_abort_transaction(trans, extent_root, ret); 6076 goto out; 6077 } 6078 } 6079 6080 ret = update_block_group(root, bytenr, num_bytes, 0); 6081 if (ret) { 6082 btrfs_abort_transaction(trans, extent_root, ret); 6083 goto out; 6084 } 6085 } 6086 btrfs_release_path(path); 6087 6088 /* Deal with the quota accounting */ 6089 if (!ret && last_ref && !no_quota) { 6090 int mod_seq = 0; 6091 6092 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && 6093 type == BTRFS_QGROUP_OPER_SUB_SHARED) 6094 mod_seq = 1; 6095 6096 ret = btrfs_qgroup_record_ref(trans, info, root_objectid, 6097 bytenr, num_bytes, type, 6098 mod_seq); 6099 } 6100 out: 6101 btrfs_free_path(path); 6102 return ret; 6103 } 6104 6105 /* 6106 * when we free an block, it is possible (and likely) that we free the last 6107 * delayed ref for that extent as well. This searches the delayed ref tree for 6108 * a given extent, and if there are no other delayed refs to be processed, it 6109 * removes it from the tree. 
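 *
 * Returns 1 when the head could be removed and must_insert_reserved was
 * set, in which case the caller goes on to free the extent itself;
 * returns 0 in every other case.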
6110 */ 6111 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 6112 struct btrfs_root *root, u64 bytenr) 6113 { 6114 struct btrfs_delayed_ref_head *head; 6115 struct btrfs_delayed_ref_root *delayed_refs; 6116 int ret = 0; 6117 6118 delayed_refs = &trans->transaction->delayed_refs; 6119 spin_lock(&delayed_refs->lock); 6120 head = btrfs_find_delayed_ref_head(trans, bytenr); 6121 if (!head) 6122 goto out_delayed_unlock; 6123 6124 spin_lock(&head->lock); 6125 if (rb_first(&head->ref_root)) 6126 goto out; 6127 6128 if (head->extent_op) { 6129 if (!head->must_insert_reserved) 6130 goto out; 6131 btrfs_free_delayed_extent_op(head->extent_op); 6132 head->extent_op = NULL; 6133 } 6134 6135 /* 6136 * waiting for the lock here would deadlock. If someone else has it 6137 * locked they are already in the process of dropping it anyway 6138 */ 6139 if (!mutex_trylock(&head->mutex)) 6140 goto out; 6141 6142 /* 6143 * at this point we have a head with no other entries. Go 6144 * ahead and process it. 6145 */ 6146 head->node.in_tree = 0; 6147 rb_erase(&head->href_node, &delayed_refs->href_root); 6148 6149 atomic_dec(&delayed_refs->num_entries); 6150 6151 /* 6152 * we don't take a ref on the node because we're removing it from the 6153 * tree, so we just steal the ref the tree was holding. 6154 */ 6155 delayed_refs->num_heads--; 6156 if (head->processing == 0) 6157 delayed_refs->num_heads_ready--; 6158 head->processing = 0; 6159 spin_unlock(&head->lock); 6160 spin_unlock(&delayed_refs->lock); 6161 6162 BUG_ON(head->extent_op); 6163 if (head->must_insert_reserved) 6164 ret = 1; 6165 6166 mutex_unlock(&head->mutex); 6167 btrfs_put_delayed_ref(&head->node); 6168 return ret; 6169 out: 6170 spin_unlock(&head->lock); 6171 6172 out_delayed_unlock: 6173 spin_unlock(&delayed_refs->lock); 6174 return 0; 6175 } 6176 6177 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 6178 struct btrfs_root *root, 6179 struct extent_buffer *buf, 6180 u64 parent, int last_ref) 6181 { 6182 struct btrfs_block_group_cache *cache = NULL; 6183 int pin = 1; 6184 int ret; 6185 6186 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6187 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 6188 buf->start, buf->len, 6189 parent, root->root_key.objectid, 6190 btrfs_header_level(buf), 6191 BTRFS_DROP_DELAYED_REF, NULL, 0); 6192 BUG_ON(ret); /* -ENOMEM */ 6193 } 6194 6195 if (!last_ref) 6196 return; 6197 6198 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 6199 6200 if (btrfs_header_generation(buf) == trans->transid) { 6201 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6202 ret = check_ref_cleanup(trans, root, buf->start); 6203 if (!ret) 6204 goto out; 6205 } 6206 6207 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6208 pin_down_extent(root, cache, buf->start, buf->len, 1); 6209 goto out; 6210 } 6211 6212 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 6213 6214 btrfs_add_free_space(cache, buf->start, buf->len); 6215 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 6216 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6217 pin = 0; 6218 } 6219 out: 6220 if (pin) 6221 add_pinned_bytes(root->fs_info, buf->len, 6222 btrfs_header_level(buf), 6223 root->root_key.objectid); 6224 6225 /* 6226 * Deleting the buffer, clear the corrupt flag since it doesn't matter 6227 * anymore. 
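 * By this point the extent has either been pinned until the transaction
 * commits or handed straight back to the free-space cache above.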
6228 */ 6229 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6230 btrfs_put_block_group(cache); 6231 } 6232 6233 /* Can return -ENOMEM */ 6234 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6235 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6236 u64 owner, u64 offset, int no_quota) 6237 { 6238 int ret; 6239 struct btrfs_fs_info *fs_info = root->fs_info; 6240 6241 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6242 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) 6243 return 0; 6244 #endif 6245 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6246 6247 /* 6248 * tree log blocks never actually go into the extent allocation 6249 * tree, just update pinning info and exit early. 6250 */ 6251 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 6252 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 6253 /* unlocks the pinned mutex */ 6254 btrfs_pin_extent(root, bytenr, num_bytes, 1); 6255 ret = 0; 6256 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6257 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6258 num_bytes, 6259 parent, root_objectid, (int)owner, 6260 BTRFS_DROP_DELAYED_REF, NULL, no_quota); 6261 } else { 6262 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6263 num_bytes, 6264 parent, root_objectid, owner, 6265 offset, BTRFS_DROP_DELAYED_REF, 6266 NULL, no_quota); 6267 } 6268 return ret; 6269 } 6270 6271 static u64 stripe_align(struct btrfs_root *root, 6272 struct btrfs_block_group_cache *cache, 6273 u64 val, u64 num_bytes) 6274 { 6275 u64 ret = ALIGN(val, root->stripesize); 6276 return ret; 6277 } 6278 6279 /* 6280 * when we wait for progress in the block group caching, its because 6281 * our allocation attempt failed at least once. So, we must sleep 6282 * and let some progress happen before we try again. 6283 * 6284 * This function will sleep at least once waiting for new free space to 6285 * show up, and then it will check the block group free space numbers 6286 * for our min num_bytes. Another option is to have it go ahead 6287 * and look in the rbtree for a free extent of a given size, but this 6288 * is a good start. 6289 * 6290 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 6291 * any of the information in this block group. 6292 */ 6293 static noinline void 6294 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 6295 u64 num_bytes) 6296 { 6297 struct btrfs_caching_control *caching_ctl; 6298 6299 caching_ctl = get_caching_control(cache); 6300 if (!caching_ctl) 6301 return; 6302 6303 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 6304 (cache->free_space_ctl->free_space >= num_bytes)); 6305 6306 put_caching_control(caching_ctl); 6307 } 6308 6309 static noinline int 6310 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 6311 { 6312 struct btrfs_caching_control *caching_ctl; 6313 int ret = 0; 6314 6315 caching_ctl = get_caching_control(cache); 6316 if (!caching_ctl) 6317 return (cache->cached == BTRFS_CACHE_ERROR) ? 
-EIO : 0; 6318 6319 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 6320 if (cache->cached == BTRFS_CACHE_ERROR) 6321 ret = -EIO; 6322 put_caching_control(caching_ctl); 6323 return ret; 6324 } 6325 6326 int __get_raid_index(u64 flags) 6327 { 6328 if (flags & BTRFS_BLOCK_GROUP_RAID10) 6329 return BTRFS_RAID_RAID10; 6330 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 6331 return BTRFS_RAID_RAID1; 6332 else if (flags & BTRFS_BLOCK_GROUP_DUP) 6333 return BTRFS_RAID_DUP; 6334 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 6335 return BTRFS_RAID_RAID0; 6336 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 6337 return BTRFS_RAID_RAID5; 6338 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 6339 return BTRFS_RAID_RAID6; 6340 6341 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 6342 } 6343 6344 int get_block_group_index(struct btrfs_block_group_cache *cache) 6345 { 6346 return __get_raid_index(cache->flags); 6347 } 6348 6349 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 6350 [BTRFS_RAID_RAID10] = "raid10", 6351 [BTRFS_RAID_RAID1] = "raid1", 6352 [BTRFS_RAID_DUP] = "dup", 6353 [BTRFS_RAID_RAID0] = "raid0", 6354 [BTRFS_RAID_SINGLE] = "single", 6355 [BTRFS_RAID_RAID5] = "raid5", 6356 [BTRFS_RAID_RAID6] = "raid6", 6357 }; 6358 6359 static const char *get_raid_name(enum btrfs_raid_types type) 6360 { 6361 if (type >= BTRFS_NR_RAID_TYPES) 6362 return NULL; 6363 6364 return btrfs_raid_type_names[type]; 6365 } 6366 6367 enum btrfs_loop_type { 6368 LOOP_CACHING_NOWAIT = 0, 6369 LOOP_CACHING_WAIT = 1, 6370 LOOP_ALLOC_CHUNK = 2, 6371 LOOP_NO_EMPTY_SIZE = 3, 6372 }; 6373 6374 static inline void 6375 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 6376 int delalloc) 6377 { 6378 if (delalloc) 6379 down_read(&cache->data_rwsem); 6380 } 6381 6382 static inline void 6383 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 6384 int delalloc) 6385 { 6386 btrfs_get_block_group(cache); 6387 if (delalloc) 6388 down_read(&cache->data_rwsem); 6389 } 6390 6391 static struct btrfs_block_group_cache * 6392 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 6393 struct btrfs_free_cluster *cluster, 6394 int delalloc) 6395 { 6396 struct btrfs_block_group_cache *used_bg; 6397 bool locked = false; 6398 again: 6399 spin_lock(&cluster->refill_lock); 6400 if (locked) { 6401 if (used_bg == cluster->block_group) 6402 return used_bg; 6403 6404 up_read(&used_bg->data_rwsem); 6405 btrfs_put_block_group(used_bg); 6406 } 6407 6408 used_bg = cluster->block_group; 6409 if (!used_bg) 6410 return NULL; 6411 6412 if (used_bg == block_group) 6413 return used_bg; 6414 6415 btrfs_get_block_group(used_bg); 6416 6417 if (!delalloc) 6418 return used_bg; 6419 6420 if (down_read_trylock(&used_bg->data_rwsem)) 6421 return used_bg; 6422 6423 spin_unlock(&cluster->refill_lock); 6424 down_read(&used_bg->data_rwsem); 6425 locked = true; 6426 goto again; 6427 } 6428 6429 static inline void 6430 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 6431 int delalloc) 6432 { 6433 if (delalloc) 6434 up_read(&cache->data_rwsem); 6435 btrfs_put_block_group(cache); 6436 } 6437 6438 /* 6439 * walks the btree of allocated extents and find a hole of a given size. 6440 * The key ins is changed to record the hole: 6441 * ins->objectid == start position 6442 * ins->flags = BTRFS_EXTENT_ITEM_KEY 6443 * ins->offset == the size of the hole. 6444 * Any available blocks before search_start are skipped. 
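 *
 * The search escalates through the btrfs_loop_type stages: partially
 * cached block groups are tried first without waiting (kicking the
 * caching threads along), then we wait for caching to make progress,
 * then a chunk allocation is forced, and finally empty_size and
 * empty_cluster are dropped to zero before giving up with -ENOSPC.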
6445 * 6446 * If there is no suitable free space, we will record the max size of 6447 * the free space extent currently. 6448 */ 6449 static noinline int find_free_extent(struct btrfs_root *orig_root, 6450 u64 num_bytes, u64 empty_size, 6451 u64 hint_byte, struct btrfs_key *ins, 6452 u64 flags, int delalloc) 6453 { 6454 int ret = 0; 6455 struct btrfs_root *root = orig_root->fs_info->extent_root; 6456 struct btrfs_free_cluster *last_ptr = NULL; 6457 struct btrfs_block_group_cache *block_group = NULL; 6458 u64 search_start = 0; 6459 u64 max_extent_size = 0; 6460 int empty_cluster = 2 * 1024 * 1024; 6461 struct btrfs_space_info *space_info; 6462 int loop = 0; 6463 int index = __get_raid_index(flags); 6464 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? 6465 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; 6466 bool failed_cluster_refill = false; 6467 bool failed_alloc = false; 6468 bool use_cluster = true; 6469 bool have_caching_bg = false; 6470 6471 WARN_ON(num_bytes < root->sectorsize); 6472 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 6473 ins->objectid = 0; 6474 ins->offset = 0; 6475 6476 trace_find_free_extent(orig_root, num_bytes, empty_size, flags); 6477 6478 space_info = __find_space_info(root->fs_info, flags); 6479 if (!space_info) { 6480 btrfs_err(root->fs_info, "No space info for %llu", flags); 6481 return -ENOSPC; 6482 } 6483 6484 /* 6485 * If the space info is for both data and metadata it means we have a 6486 * small filesystem and we can't use the clustering stuff. 6487 */ 6488 if (btrfs_mixed_space_info(space_info)) 6489 use_cluster = false; 6490 6491 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 6492 last_ptr = &root->fs_info->meta_alloc_cluster; 6493 if (!btrfs_test_opt(root, SSD)) 6494 empty_cluster = 64 * 1024; 6495 } 6496 6497 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 6498 btrfs_test_opt(root, SSD)) { 6499 last_ptr = &root->fs_info->data_alloc_cluster; 6500 } 6501 6502 if (last_ptr) { 6503 spin_lock(&last_ptr->lock); 6504 if (last_ptr->block_group) 6505 hint_byte = last_ptr->window_start; 6506 spin_unlock(&last_ptr->lock); 6507 } 6508 6509 search_start = max(search_start, first_logical_byte(root, 0)); 6510 search_start = max(search_start, hint_byte); 6511 6512 if (!last_ptr) 6513 empty_cluster = 0; 6514 6515 if (search_start == hint_byte) { 6516 block_group = btrfs_lookup_block_group(root->fs_info, 6517 search_start); 6518 /* 6519 * we don't want to use the block group if it doesn't match our 6520 * allocation bits, or if its not cached. 6521 * 6522 * However if we are re-searching with an ideal block group 6523 * picked out then we don't care that the block group is cached. 
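 * The hint usually comes from the allocation cluster's window_start
 * grabbed above, so the group it points into is worth trying before we
 * scan the per-raid-index lists.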
6524 */ 6525 if (block_group && block_group_bits(block_group, flags) && 6526 block_group->cached != BTRFS_CACHE_NO) { 6527 down_read(&space_info->groups_sem); 6528 if (list_empty(&block_group->list) || 6529 block_group->ro) { 6530 /* 6531 * someone is removing this block group, 6532 * we can't jump into the have_block_group 6533 * target because our list pointers are not 6534 * valid 6535 */ 6536 btrfs_put_block_group(block_group); 6537 up_read(&space_info->groups_sem); 6538 } else { 6539 index = get_block_group_index(block_group); 6540 btrfs_lock_block_group(block_group, delalloc); 6541 goto have_block_group; 6542 } 6543 } else if (block_group) { 6544 btrfs_put_block_group(block_group); 6545 } 6546 } 6547 search: 6548 have_caching_bg = false; 6549 down_read(&space_info->groups_sem); 6550 list_for_each_entry(block_group, &space_info->block_groups[index], 6551 list) { 6552 u64 offset; 6553 int cached; 6554 6555 btrfs_grab_block_group(block_group, delalloc); 6556 search_start = block_group->key.objectid; 6557 6558 /* 6559 * this can happen if we end up cycling through all the 6560 * raid types, but we want to make sure we only allocate 6561 * for the proper type. 6562 */ 6563 if (!block_group_bits(block_group, flags)) { 6564 u64 extra = BTRFS_BLOCK_GROUP_DUP | 6565 BTRFS_BLOCK_GROUP_RAID1 | 6566 BTRFS_BLOCK_GROUP_RAID5 | 6567 BTRFS_BLOCK_GROUP_RAID6 | 6568 BTRFS_BLOCK_GROUP_RAID10; 6569 6570 /* 6571 * if they asked for extra copies and this block group 6572 * doesn't provide them, bail. This does allow us to 6573 * fill raid0 from raid1. 6574 */ 6575 if ((flags & extra) && !(block_group->flags & extra)) 6576 goto loop; 6577 } 6578 6579 have_block_group: 6580 cached = block_group_cache_done(block_group); 6581 if (unlikely(!cached)) { 6582 ret = cache_block_group(block_group, 0); 6583 BUG_ON(ret < 0); 6584 ret = 0; 6585 } 6586 6587 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 6588 goto loop; 6589 if (unlikely(block_group->ro)) 6590 goto loop; 6591 6592 /* 6593 * Ok we want to try and use the cluster allocator, so 6594 * lets look there 6595 */ 6596 if (last_ptr) { 6597 struct btrfs_block_group_cache *used_block_group; 6598 unsigned long aligned_cluster; 6599 /* 6600 * the refill lock keeps out other 6601 * people trying to start a new cluster 6602 */ 6603 used_block_group = btrfs_lock_cluster(block_group, 6604 last_ptr, 6605 delalloc); 6606 if (!used_block_group) 6607 goto refill_cluster; 6608 6609 if (used_block_group != block_group && 6610 (used_block_group->ro || 6611 !block_group_bits(used_block_group, flags))) 6612 goto release_cluster; 6613 6614 offset = btrfs_alloc_from_cluster(used_block_group, 6615 last_ptr, 6616 num_bytes, 6617 used_block_group->key.objectid, 6618 &max_extent_size); 6619 if (offset) { 6620 /* we have a block, we're done */ 6621 spin_unlock(&last_ptr->refill_lock); 6622 trace_btrfs_reserve_extent_cluster(root, 6623 used_block_group, 6624 search_start, num_bytes); 6625 if (used_block_group != block_group) { 6626 btrfs_release_block_group(block_group, 6627 delalloc); 6628 block_group = used_block_group; 6629 } 6630 goto checks; 6631 } 6632 6633 WARN_ON(last_ptr->block_group != used_block_group); 6634 release_cluster: 6635 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6636 * set up a new clusters, so lets just skip it 6637 * and let the allocator find whatever block 6638 * it can find. 
If we reach this point, we 6639 * will have tried the cluster allocator 6640 * plenty of times and not have found 6641 * anything, so we are likely way too 6642 * fragmented for the clustering stuff to find 6643 * anything. 6644 * 6645 * However, if the cluster is taken from the 6646 * current block group, release the cluster 6647 * first, so that we stand a better chance of 6648 * succeeding in the unclustered 6649 * allocation. */ 6650 if (loop >= LOOP_NO_EMPTY_SIZE && 6651 used_block_group != block_group) { 6652 spin_unlock(&last_ptr->refill_lock); 6653 btrfs_release_block_group(used_block_group, 6654 delalloc); 6655 goto unclustered_alloc; 6656 } 6657 6658 /* 6659 * this cluster didn't work out, free it and 6660 * start over 6661 */ 6662 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6663 6664 if (used_block_group != block_group) 6665 btrfs_release_block_group(used_block_group, 6666 delalloc); 6667 refill_cluster: 6668 if (loop >= LOOP_NO_EMPTY_SIZE) { 6669 spin_unlock(&last_ptr->refill_lock); 6670 goto unclustered_alloc; 6671 } 6672 6673 aligned_cluster = max_t(unsigned long, 6674 empty_cluster + empty_size, 6675 block_group->full_stripe_len); 6676 6677 /* allocate a cluster in this block group */ 6678 ret = btrfs_find_space_cluster(root, block_group, 6679 last_ptr, search_start, 6680 num_bytes, 6681 aligned_cluster); 6682 if (ret == 0) { 6683 /* 6684 * now pull our allocation out of this 6685 * cluster 6686 */ 6687 offset = btrfs_alloc_from_cluster(block_group, 6688 last_ptr, 6689 num_bytes, 6690 search_start, 6691 &max_extent_size); 6692 if (offset) { 6693 /* we found one, proceed */ 6694 spin_unlock(&last_ptr->refill_lock); 6695 trace_btrfs_reserve_extent_cluster(root, 6696 block_group, search_start, 6697 num_bytes); 6698 goto checks; 6699 } 6700 } else if (!cached && loop > LOOP_CACHING_NOWAIT 6701 && !failed_cluster_refill) { 6702 spin_unlock(&last_ptr->refill_lock); 6703 6704 failed_cluster_refill = true; 6705 wait_block_group_cache_progress(block_group, 6706 num_bytes + empty_cluster + empty_size); 6707 goto have_block_group; 6708 } 6709 6710 /* 6711 * at this point we either didn't find a cluster 6712 * or we weren't able to allocate a block from our 6713 * cluster. Free the cluster we've been trying 6714 * to use, and go to the next block group 6715 */ 6716 btrfs_return_cluster_to_free_space(NULL, last_ptr); 6717 spin_unlock(&last_ptr->refill_lock); 6718 goto loop; 6719 } 6720 6721 unclustered_alloc: 6722 spin_lock(&block_group->free_space_ctl->tree_lock); 6723 if (cached && 6724 block_group->free_space_ctl->free_space < 6725 num_bytes + empty_cluster + empty_size) { 6726 if (block_group->free_space_ctl->free_space > 6727 max_extent_size) 6728 max_extent_size = 6729 block_group->free_space_ctl->free_space; 6730 spin_unlock(&block_group->free_space_ctl->tree_lock); 6731 goto loop; 6732 } 6733 spin_unlock(&block_group->free_space_ctl->tree_lock); 6734 6735 offset = btrfs_find_space_for_alloc(block_group, search_start, 6736 num_bytes, empty_size, 6737 &max_extent_size); 6738 /* 6739 * If we didn't find a chunk, and we haven't failed on this 6740 * block group before, and this block group is in the middle of 6741 * caching and we are ok with waiting, then go ahead and wait 6742 * for progress to be made, and set failed_alloc to true. 6743 * 6744 * If failed_alloc is true then we've already waited on this 6745 * block group once and should move on to the next block group. 
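 *
 * When the group is still caching and we do not end up waiting on it
 * here, have_caching_bg is set below so that a later pass (once loop
 * reaches LOOP_CACHING_WAIT) comes back to it.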
6746 */ 6747 if (!offset && !failed_alloc && !cached && 6748 loop > LOOP_CACHING_NOWAIT) { 6749 wait_block_group_cache_progress(block_group, 6750 num_bytes + empty_size); 6751 failed_alloc = true; 6752 goto have_block_group; 6753 } else if (!offset) { 6754 if (!cached) 6755 have_caching_bg = true; 6756 goto loop; 6757 } 6758 checks: 6759 search_start = stripe_align(root, block_group, 6760 offset, num_bytes); 6761 6762 /* move on to the next group */ 6763 if (search_start + num_bytes > 6764 block_group->key.objectid + block_group->key.offset) { 6765 btrfs_add_free_space(block_group, offset, num_bytes); 6766 goto loop; 6767 } 6768 6769 if (offset < search_start) 6770 btrfs_add_free_space(block_group, offset, 6771 search_start - offset); 6772 BUG_ON(offset > search_start); 6773 6774 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 6775 alloc_type, delalloc); 6776 if (ret == -EAGAIN) { 6777 btrfs_add_free_space(block_group, offset, num_bytes); 6778 goto loop; 6779 } 6780 6781 /* we are all good, lets return */ 6782 ins->objectid = search_start; 6783 ins->offset = num_bytes; 6784 6785 trace_btrfs_reserve_extent(orig_root, block_group, 6786 search_start, num_bytes); 6787 btrfs_release_block_group(block_group, delalloc); 6788 break; 6789 loop: 6790 failed_cluster_refill = false; 6791 failed_alloc = false; 6792 BUG_ON(index != get_block_group_index(block_group)); 6793 btrfs_release_block_group(block_group, delalloc); 6794 } 6795 up_read(&space_info->groups_sem); 6796 6797 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 6798 goto search; 6799 6800 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 6801 goto search; 6802 6803 /* 6804 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 6805 * caching kthreads as we move along 6806 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 6807 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 6808 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 6809 * again 6810 */ 6811 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 6812 index = 0; 6813 loop++; 6814 if (loop == LOOP_ALLOC_CHUNK) { 6815 struct btrfs_trans_handle *trans; 6816 int exist = 0; 6817 6818 trans = current->journal_info; 6819 if (trans) 6820 exist = 1; 6821 else 6822 trans = btrfs_join_transaction(root); 6823 6824 if (IS_ERR(trans)) { 6825 ret = PTR_ERR(trans); 6826 goto out; 6827 } 6828 6829 ret = do_chunk_alloc(trans, root, flags, 6830 CHUNK_ALLOC_FORCE); 6831 /* 6832 * Do not bail out on ENOSPC since we 6833 * can do more things. 6834 */ 6835 if (ret < 0 && ret != -ENOSPC) 6836 btrfs_abort_transaction(trans, 6837 root, ret); 6838 else 6839 ret = 0; 6840 if (!exist) 6841 btrfs_end_transaction(trans, root); 6842 if (ret) 6843 goto out; 6844 } 6845 6846 if (loop == LOOP_NO_EMPTY_SIZE) { 6847 empty_size = 0; 6848 empty_cluster = 0; 6849 } 6850 6851 goto search; 6852 } else if (!ins->objectid) { 6853 ret = -ENOSPC; 6854 } else if (ins->objectid) { 6855 ret = 0; 6856 } 6857 out: 6858 if (ret == -ENOSPC) 6859 ins->offset = max_extent_size; 6860 return ret; 6861 } 6862 6863 static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 6864 int dump_block_groups) 6865 { 6866 struct btrfs_block_group_cache *cache; 6867 int index = 0; 6868 6869 spin_lock(&info->lock); 6870 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", 6871 info->flags, 6872 info->total_bytes - info->bytes_used - info->bytes_pinned - 6873 info->bytes_reserved - info->bytes_readonly, 6874 (info->full) ? 
"" : "not "); 6875 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " 6876 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6877 info->total_bytes, info->bytes_used, info->bytes_pinned, 6878 info->bytes_reserved, info->bytes_may_use, 6879 info->bytes_readonly); 6880 spin_unlock(&info->lock); 6881 6882 if (!dump_block_groups) 6883 return; 6884 6885 down_read(&info->groups_sem); 6886 again: 6887 list_for_each_entry(cache, &info->block_groups[index], list) { 6888 spin_lock(&cache->lock); 6889 printk(KERN_INFO "BTRFS: " 6890 "block group %llu has %llu bytes, " 6891 "%llu used %llu pinned %llu reserved %s\n", 6892 cache->key.objectid, cache->key.offset, 6893 btrfs_block_group_used(&cache->item), cache->pinned, 6894 cache->reserved, cache->ro ? "[readonly]" : ""); 6895 btrfs_dump_free_space(cache, bytes); 6896 spin_unlock(&cache->lock); 6897 } 6898 if (++index < BTRFS_NR_RAID_TYPES) 6899 goto again; 6900 up_read(&info->groups_sem); 6901 } 6902 6903 int btrfs_reserve_extent(struct btrfs_root *root, 6904 u64 num_bytes, u64 min_alloc_size, 6905 u64 empty_size, u64 hint_byte, 6906 struct btrfs_key *ins, int is_data, int delalloc) 6907 { 6908 bool final_tried = false; 6909 u64 flags; 6910 int ret; 6911 6912 flags = btrfs_get_alloc_profile(root, is_data); 6913 again: 6914 WARN_ON(num_bytes < root->sectorsize); 6915 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 6916 flags, delalloc); 6917 6918 if (ret == -ENOSPC) { 6919 if (!final_tried && ins->offset) { 6920 num_bytes = min(num_bytes >> 1, ins->offset); 6921 num_bytes = round_down(num_bytes, root->sectorsize); 6922 num_bytes = max(num_bytes, min_alloc_size); 6923 if (num_bytes == min_alloc_size) 6924 final_tried = true; 6925 goto again; 6926 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 6927 struct btrfs_space_info *sinfo; 6928 6929 sinfo = __find_space_info(root->fs_info, flags); 6930 btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", 6931 flags, num_bytes); 6932 if (sinfo) 6933 dump_space_info(sinfo, num_bytes, 1); 6934 } 6935 } 6936 6937 return ret; 6938 } 6939 6940 static int __btrfs_free_reserved_extent(struct btrfs_root *root, 6941 u64 start, u64 len, 6942 int pin, int delalloc) 6943 { 6944 struct btrfs_block_group_cache *cache; 6945 int ret = 0; 6946 6947 cache = btrfs_lookup_block_group(root->fs_info, start); 6948 if (!cache) { 6949 btrfs_err(root->fs_info, "Unable to find block group for %llu", 6950 start); 6951 return -ENOSPC; 6952 } 6953 6954 if (btrfs_test_opt(root, DISCARD)) 6955 ret = btrfs_discard_extent(root, start, len, NULL); 6956 6957 if (pin) 6958 pin_down_extent(root, cache, start, len, 1); 6959 else { 6960 btrfs_add_free_space(cache, start, len); 6961 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 6962 } 6963 btrfs_put_block_group(cache); 6964 6965 trace_btrfs_reserved_extent_free(root, start, len); 6966 6967 return ret; 6968 } 6969 6970 int btrfs_free_reserved_extent(struct btrfs_root *root, 6971 u64 start, u64 len, int delalloc) 6972 { 6973 return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); 6974 } 6975 6976 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 6977 u64 start, u64 len) 6978 { 6979 return __btrfs_free_reserved_extent(root, start, len, 1, 0); 6980 } 6981 6982 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 6983 struct btrfs_root *root, 6984 u64 parent, u64 root_objectid, 6985 u64 flags, u64 owner, u64 offset, 6986 struct btrfs_key *ins, int ref_mod) 6987 { 6988 int ret; 6989 struct 
btrfs_fs_info *fs_info = root->fs_info; 6990 struct btrfs_extent_item *extent_item; 6991 struct btrfs_extent_inline_ref *iref; 6992 struct btrfs_path *path; 6993 struct extent_buffer *leaf; 6994 int type; 6995 u32 size; 6996 6997 if (parent > 0) 6998 type = BTRFS_SHARED_DATA_REF_KEY; 6999 else 7000 type = BTRFS_EXTENT_DATA_REF_KEY; 7001 7002 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 7003 7004 path = btrfs_alloc_path(); 7005 if (!path) 7006 return -ENOMEM; 7007 7008 path->leave_spinning = 1; 7009 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7010 ins, size); 7011 if (ret) { 7012 btrfs_free_path(path); 7013 return ret; 7014 } 7015 7016 leaf = path->nodes[0]; 7017 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7018 struct btrfs_extent_item); 7019 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 7020 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7021 btrfs_set_extent_flags(leaf, extent_item, 7022 flags | BTRFS_EXTENT_FLAG_DATA); 7023 7024 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7025 btrfs_set_extent_inline_ref_type(leaf, iref, type); 7026 if (parent > 0) { 7027 struct btrfs_shared_data_ref *ref; 7028 ref = (struct btrfs_shared_data_ref *)(iref + 1); 7029 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7030 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 7031 } else { 7032 struct btrfs_extent_data_ref *ref; 7033 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 7034 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 7035 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 7036 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 7037 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 7038 } 7039 7040 btrfs_mark_buffer_dirty(path->nodes[0]); 7041 btrfs_free_path(path); 7042 7043 /* Always set parent to 0 here since its exclusive anyway. 
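 * (the record below uses BTRFS_QGROUP_OPER_ADD_EXCL, i.e. the new data
 * extent is accounted as exclusive to root_objectid)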
*/ 7044 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7045 ins->objectid, ins->offset, 7046 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7047 if (ret) 7048 return ret; 7049 7050 ret = update_block_group(root, ins->objectid, ins->offset, 1); 7051 if (ret) { /* -ENOENT, logic error */ 7052 btrfs_err(fs_info, "update block group failed for %llu %llu", 7053 ins->objectid, ins->offset); 7054 BUG(); 7055 } 7056 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 7057 return ret; 7058 } 7059 7060 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 7061 struct btrfs_root *root, 7062 u64 parent, u64 root_objectid, 7063 u64 flags, struct btrfs_disk_key *key, 7064 int level, struct btrfs_key *ins, 7065 int no_quota) 7066 { 7067 int ret; 7068 struct btrfs_fs_info *fs_info = root->fs_info; 7069 struct btrfs_extent_item *extent_item; 7070 struct btrfs_tree_block_info *block_info; 7071 struct btrfs_extent_inline_ref *iref; 7072 struct btrfs_path *path; 7073 struct extent_buffer *leaf; 7074 u32 size = sizeof(*extent_item) + sizeof(*iref); 7075 u64 num_bytes = ins->offset; 7076 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7077 SKINNY_METADATA); 7078 7079 if (!skinny_metadata) 7080 size += sizeof(*block_info); 7081 7082 path = btrfs_alloc_path(); 7083 if (!path) { 7084 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7085 root->leafsize); 7086 return -ENOMEM; 7087 } 7088 7089 path->leave_spinning = 1; 7090 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7091 ins, size); 7092 if (ret) { 7093 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7094 root->leafsize); 7095 btrfs_free_path(path); 7096 return ret; 7097 } 7098 7099 leaf = path->nodes[0]; 7100 extent_item = btrfs_item_ptr(leaf, path->slots[0], 7101 struct btrfs_extent_item); 7102 btrfs_set_extent_refs(leaf, extent_item, 1); 7103 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 7104 btrfs_set_extent_flags(leaf, extent_item, 7105 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 7106 7107 if (skinny_metadata) { 7108 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7109 num_bytes = root->leafsize; 7110 } else { 7111 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7112 btrfs_set_tree_block_key(leaf, block_info, key); 7113 btrfs_set_tree_block_level(leaf, block_info, level); 7114 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 7115 } 7116 7117 if (parent > 0) { 7118 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 7119 btrfs_set_extent_inline_ref_type(leaf, iref, 7120 BTRFS_SHARED_BLOCK_REF_KEY); 7121 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 7122 } else { 7123 btrfs_set_extent_inline_ref_type(leaf, iref, 7124 BTRFS_TREE_BLOCK_REF_KEY); 7125 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 7126 } 7127 7128 btrfs_mark_buffer_dirty(leaf); 7129 btrfs_free_path(path); 7130 7131 if (!no_quota) { 7132 ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, 7133 ins->objectid, num_bytes, 7134 BTRFS_QGROUP_OPER_ADD_EXCL, 0); 7135 if (ret) 7136 return ret; 7137 } 7138 7139 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7140 if (ret) { /* -ENOENT, logic error */ 7141 btrfs_err(fs_info, "update block group failed for %llu %llu", 7142 ins->objectid, ins->offset); 7143 BUG(); 7144 } 7145 7146 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); 7147 return ret; 7148 } 7149 7150 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7151 struct btrfs_root *root, 
7152 u64 root_objectid, u64 owner, 7153 u64 offset, struct btrfs_key *ins) 7154 { 7155 int ret; 7156 7157 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 7158 7159 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 7160 ins->offset, 0, 7161 root_objectid, owner, offset, 7162 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 7163 return ret; 7164 } 7165 7166 /* 7167 * this is used by the tree logging recovery code. It records that 7168 * an extent has been allocated and makes sure to clear the free 7169 * space cache bits as well 7170 */ 7171 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 7172 struct btrfs_root *root, 7173 u64 root_objectid, u64 owner, u64 offset, 7174 struct btrfs_key *ins) 7175 { 7176 int ret; 7177 struct btrfs_block_group_cache *block_group; 7178 7179 /* 7180 * Mixed block groups will exclude before processing the log so we only 7181 * need to do the exlude dance if this fs isn't mixed. 7182 */ 7183 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { 7184 ret = __exclude_logged_extent(root, ins->objectid, ins->offset); 7185 if (ret) 7186 return ret; 7187 } 7188 7189 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 7190 if (!block_group) 7191 return -EINVAL; 7192 7193 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 7194 RESERVE_ALLOC_NO_ACCOUNT, 0); 7195 BUG_ON(ret); /* logic error */ 7196 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 7197 0, owner, offset, ins, 1); 7198 btrfs_put_block_group(block_group); 7199 return ret; 7200 } 7201 7202 static struct extent_buffer * 7203 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 7204 u64 bytenr, u32 blocksize, int level) 7205 { 7206 struct extent_buffer *buf; 7207 7208 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 7209 if (!buf) 7210 return ERR_PTR(-ENOMEM); 7211 btrfs_set_header_generation(buf, trans->transid); 7212 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 7213 btrfs_tree_lock(buf); 7214 clean_tree_block(trans, root, buf); 7215 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 7216 7217 btrfs_set_lock_blocking(buf); 7218 btrfs_set_buffer_uptodate(buf); 7219 7220 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 7221 /* 7222 * we allow two log transactions at a time, use different 7223 * EXENT bit to differentiate dirty pages. 
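 * Even log transids mark the range with EXTENT_DIRTY, odd ones with
 * EXTENT_NEW, as the branches below show.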
7224 */ 7225 if (root->log_transid % 2 == 0) 7226 set_extent_dirty(&root->dirty_log_pages, buf->start, 7227 buf->start + buf->len - 1, GFP_NOFS); 7228 else 7229 set_extent_new(&root->dirty_log_pages, buf->start, 7230 buf->start + buf->len - 1, GFP_NOFS); 7231 } else { 7232 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7233 buf->start + buf->len - 1, GFP_NOFS); 7234 } 7235 trans->blocks_used++; 7236 /* this returns a buffer locked for blocking */ 7237 return buf; 7238 } 7239 7240 static struct btrfs_block_rsv * 7241 use_block_rsv(struct btrfs_trans_handle *trans, 7242 struct btrfs_root *root, u32 blocksize) 7243 { 7244 struct btrfs_block_rsv *block_rsv; 7245 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; 7246 int ret; 7247 bool global_updated = false; 7248 7249 block_rsv = get_block_rsv(trans, root); 7250 7251 if (unlikely(block_rsv->size == 0)) 7252 goto try_reserve; 7253 again: 7254 ret = block_rsv_use_bytes(block_rsv, blocksize); 7255 if (!ret) 7256 return block_rsv; 7257 7258 if (block_rsv->failfast) 7259 return ERR_PTR(ret); 7260 7261 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 7262 global_updated = true; 7263 update_global_block_rsv(root->fs_info); 7264 goto again; 7265 } 7266 7267 if (btrfs_test_opt(root, ENOSPC_DEBUG)) { 7268 static DEFINE_RATELIMIT_STATE(_rs, 7269 DEFAULT_RATELIMIT_INTERVAL * 10, 7270 /*DEFAULT_RATELIMIT_BURST*/ 1); 7271 if (__ratelimit(&_rs)) 7272 WARN(1, KERN_DEBUG 7273 "BTRFS: block rsv returned %d\n", ret); 7274 } 7275 try_reserve: 7276 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 7277 BTRFS_RESERVE_NO_FLUSH); 7278 if (!ret) 7279 return block_rsv; 7280 /* 7281 * If we couldn't reserve metadata bytes try and use some from 7282 * the global reserve if its space type is the same as the global 7283 * reservation. 7284 */ 7285 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 7286 block_rsv->space_info == global_rsv->space_info) { 7287 ret = block_rsv_use_bytes(global_rsv, blocksize); 7288 if (!ret) 7289 return global_rsv; 7290 } 7291 return ERR_PTR(ret); 7292 } 7293 7294 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 7295 struct btrfs_block_rsv *block_rsv, u32 blocksize) 7296 { 7297 block_rsv_add_bytes(block_rsv, blocksize, 0); 7298 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 7299 } 7300 7301 /* 7302 * finds a free extent and does all the dirty work required for allocation 7303 * returns the key for the extent through ins, and a tree buffer for 7304 * the first block of the extent through buf. 7305 * 7306 * returns the tree buffer or NULL. 
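 *
 * Note that the error paths below actually hand back an ERR_PTR() rather
 * than a bare NULL, so callers should test the result with IS_ERR().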
7307 */ 7308 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 7309 struct btrfs_root *root, u32 blocksize, 7310 u64 parent, u64 root_objectid, 7311 struct btrfs_disk_key *key, int level, 7312 u64 hint, u64 empty_size) 7313 { 7314 struct btrfs_key ins; 7315 struct btrfs_block_rsv *block_rsv; 7316 struct extent_buffer *buf; 7317 u64 flags = 0; 7318 int ret; 7319 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7320 SKINNY_METADATA); 7321 7322 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7323 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { 7324 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7325 blocksize, level); 7326 if (!IS_ERR(buf)) 7327 root->alloc_bytenr += blocksize; 7328 return buf; 7329 } 7330 #endif 7331 block_rsv = use_block_rsv(trans, root, blocksize); 7332 if (IS_ERR(block_rsv)) 7333 return ERR_CAST(block_rsv); 7334 7335 ret = btrfs_reserve_extent(root, blocksize, blocksize, 7336 empty_size, hint, &ins, 0, 0); 7337 if (ret) { 7338 unuse_block_rsv(root->fs_info, block_rsv, blocksize); 7339 return ERR_PTR(ret); 7340 } 7341 7342 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 7343 blocksize, level); 7344 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7345 7346 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7347 if (parent == 0) 7348 parent = ins.objectid; 7349 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 7350 } else 7351 BUG_ON(parent > 0); 7352 7353 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 7354 struct btrfs_delayed_extent_op *extent_op; 7355 extent_op = btrfs_alloc_delayed_extent_op(); 7356 BUG_ON(!extent_op); /* -ENOMEM */ 7357 if (key) 7358 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 7359 else 7360 memset(&extent_op->key, 0, sizeof(extent_op->key)); 7361 extent_op->flags_to_set = flags; 7362 if (skinny_metadata) 7363 extent_op->update_key = 0; 7364 else 7365 extent_op->update_key = 1; 7366 extent_op->update_flags = 1; 7367 extent_op->is_data = 0; 7368 extent_op->level = level; 7369 7370 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, 7371 ins.objectid, 7372 ins.offset, parent, root_objectid, 7373 level, BTRFS_ADD_DELAYED_EXTENT, 7374 extent_op, 0); 7375 BUG_ON(ret); /* -ENOMEM */ 7376 } 7377 return buf; 7378 } 7379 7380 struct walk_control { 7381 u64 refs[BTRFS_MAX_LEVEL]; 7382 u64 flags[BTRFS_MAX_LEVEL]; 7383 struct btrfs_key update_progress; 7384 int stage; 7385 int level; 7386 int shared_level; 7387 int update_ref; 7388 int keep_locks; 7389 int reada_slot; 7390 int reada_count; 7391 int for_reloc; 7392 }; 7393 7394 #define DROP_REFERENCE 1 7395 #define UPDATE_BACKREF 2 7396 7397 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 7398 struct btrfs_root *root, 7399 struct walk_control *wc, 7400 struct btrfs_path *path) 7401 { 7402 u64 bytenr; 7403 u64 generation; 7404 u64 refs; 7405 u64 flags; 7406 u32 nritems; 7407 u32 blocksize; 7408 struct btrfs_key key; 7409 struct extent_buffer *eb; 7410 int ret; 7411 int slot; 7412 int nread = 0; 7413 7414 if (path->slots[wc->level] < wc->reada_slot) { 7415 wc->reada_count = wc->reada_count * 2 / 3; 7416 wc->reada_count = max(wc->reada_count, 2); 7417 } else { 7418 wc->reada_count = wc->reada_count * 3 / 2; 7419 wc->reada_count = min_t(int, wc->reada_count, 7420 BTRFS_NODEPTRS_PER_BLOCK(root)); 7421 } 7422 7423 eb = path->nodes[wc->level]; 7424 nritems = btrfs_header_nritems(eb); 7425 blocksize = btrfs_level_size(root, wc->level - 1); 7426 7427 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7428 if (nread >= wc->reada_count) 
7429 break; 7430 7431 cond_resched(); 7432 bytenr = btrfs_node_blockptr(eb, slot); 7433 generation = btrfs_node_ptr_generation(eb, slot); 7434 7435 if (slot == path->slots[wc->level]) 7436 goto reada; 7437 7438 if (wc->stage == UPDATE_BACKREF && 7439 generation <= root->root_key.offset) 7440 continue; 7441 7442 /* We don't lock the tree block, it's OK to be racy here */ 7443 ret = btrfs_lookup_extent_info(trans, root, bytenr, 7444 wc->level - 1, 1, &refs, 7445 &flags); 7446 /* We don't care about errors in readahead. */ 7447 if (ret < 0) 7448 continue; 7449 BUG_ON(refs == 0); 7450 7451 if (wc->stage == DROP_REFERENCE) { 7452 if (refs == 1) 7453 goto reada; 7454 7455 if (wc->level == 1 && 7456 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7457 continue; 7458 if (!wc->update_ref || 7459 generation <= root->root_key.offset) 7460 continue; 7461 btrfs_node_key_to_cpu(eb, &key, slot); 7462 ret = btrfs_comp_cpu_keys(&key, 7463 &wc->update_progress); 7464 if (ret < 0) 7465 continue; 7466 } else { 7467 if (wc->level == 1 && 7468 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7469 continue; 7470 } 7471 reada: 7472 ret = readahead_tree_block(root, bytenr, blocksize, 7473 generation); 7474 if (ret) 7475 break; 7476 nread++; 7477 } 7478 wc->reada_slot = slot; 7479 } 7480 7481 static int account_leaf_items(struct btrfs_trans_handle *trans, 7482 struct btrfs_root *root, 7483 struct extent_buffer *eb) 7484 { 7485 int nr = btrfs_header_nritems(eb); 7486 int i, extent_type, ret; 7487 struct btrfs_key key; 7488 struct btrfs_file_extent_item *fi; 7489 u64 bytenr, num_bytes; 7490 7491 for (i = 0; i < nr; i++) { 7492 btrfs_item_key_to_cpu(eb, &key, i); 7493 7494 if (key.type != BTRFS_EXTENT_DATA_KEY) 7495 continue; 7496 7497 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 7498 /* filter out non qgroup-accountable extents */ 7499 extent_type = btrfs_file_extent_type(eb, fi); 7500 7501 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 7502 continue; 7503 7504 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 7505 if (!bytenr) 7506 continue; 7507 7508 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 7509 7510 ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7511 root->objectid, 7512 bytenr, num_bytes, 7513 BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); 7514 if (ret) 7515 return ret; 7516 } 7517 return 0; 7518 } 7519 7520 /* 7521 * Walk up the tree from the bottom, freeing leaves and any interior 7522 * nodes which have had all slots visited. If a node (leaf or 7523 * interior) is freed, the node above it will have it's slot 7524 * incremented. The root node will never be freed. 7525 * 7526 * At the end of this function, we should have a path which has all 7527 * slots incremented to the next position for a search. If we need to 7528 * read a new node it will be NULL and the node above it will have the 7529 * correct slot selected for a later read. 7530 * 7531 * If we increment the root nodes slot counter past the number of 7532 * elements, 1 is returned to signal completion of the search. 
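 *
 * For example: after a leaf has been processed it is dropped from the
 * path and its parent's slot is advanced; if that parent is exhausted
 * too it is dropped as well, and so on upwards.  The root node itself
 * is never dropped.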
7533 */ 7534 static int adjust_slots_upwards(struct btrfs_root *root, 7535 struct btrfs_path *path, int root_level) 7536 { 7537 int level = 0; 7538 int nr, slot; 7539 struct extent_buffer *eb; 7540 7541 if (root_level == 0) 7542 return 1; 7543 7544 while (level <= root_level) { 7545 eb = path->nodes[level]; 7546 nr = btrfs_header_nritems(eb); 7547 path->slots[level]++; 7548 slot = path->slots[level]; 7549 if (slot >= nr || level == 0) { 7550 /* 7551 * Don't free the root - we will detect this 7552 * condition after our loop and return a 7553 * positive value for caller to stop walking the tree. 7554 */ 7555 if (level != root_level) { 7556 btrfs_tree_unlock_rw(eb, path->locks[level]); 7557 path->locks[level] = 0; 7558 7559 free_extent_buffer(eb); 7560 path->nodes[level] = NULL; 7561 path->slots[level] = 0; 7562 } 7563 } else { 7564 /* 7565 * We have a valid slot to walk back down 7566 * from. Stop here so caller can process these 7567 * new nodes. 7568 */ 7569 break; 7570 } 7571 7572 level++; 7573 } 7574 7575 eb = path->nodes[root_level]; 7576 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 7577 return 1; 7578 7579 return 0; 7580 } 7581 7582 /* 7583 * root_eb is the subtree root and is locked before this function is called. 7584 */ 7585 static int account_shared_subtree(struct btrfs_trans_handle *trans, 7586 struct btrfs_root *root, 7587 struct extent_buffer *root_eb, 7588 u64 root_gen, 7589 int root_level) 7590 { 7591 int ret = 0; 7592 int level; 7593 struct extent_buffer *eb = root_eb; 7594 struct btrfs_path *path = NULL; 7595 7596 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); 7597 BUG_ON(root_eb == NULL); 7598 7599 if (!root->fs_info->quota_enabled) 7600 return 0; 7601 7602 if (!extent_buffer_uptodate(root_eb)) { 7603 ret = btrfs_read_buffer(root_eb, root_gen); 7604 if (ret) 7605 goto out; 7606 } 7607 7608 if (root_level == 0) { 7609 ret = account_leaf_items(trans, root, root_eb); 7610 goto out; 7611 } 7612 7613 path = btrfs_alloc_path(); 7614 if (!path) 7615 return -ENOMEM; 7616 7617 /* 7618 * Walk down the tree. Missing extent blocks are filled in as 7619 * we go. Metadata is accounted every time we read a new 7620 * extent block. 7621 * 7622 * When we reach a leaf, we account for file extent items in it, 7623 * walk back up the tree (adjusting slot pointers as we go) 7624 * and restart the search process. 7625 */ 7626 extent_buffer_get(root_eb); /* For path */ 7627 path->nodes[root_level] = root_eb; 7628 path->slots[root_level] = 0; 7629 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 7630 walk_down: 7631 level = root_level; 7632 while (level >= 0) { 7633 if (path->nodes[level] == NULL) { 7634 int child_bsize = root->nodesize; 7635 int parent_slot; 7636 u64 child_gen; 7637 u64 child_bytenr; 7638 7639 /* We need to get child blockptr/gen from 7640 * parent before we can read it. 
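 * The freshly read node is then read locked and a qgroup subtree
 * record is added for it before we descend into it.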
*/ 7641 eb = path->nodes[level + 1]; 7642 parent_slot = path->slots[level + 1]; 7643 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 7644 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7645 7646 eb = read_tree_block(root, child_bytenr, child_bsize, 7647 child_gen); 7648 if (!eb || !extent_buffer_uptodate(eb)) { 7649 ret = -EIO; 7650 goto out; 7651 } 7652 7653 path->nodes[level] = eb; 7654 path->slots[level] = 0; 7655 7656 btrfs_tree_read_lock(eb); 7657 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 7658 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 7659 7660 ret = btrfs_qgroup_record_ref(trans, root->fs_info, 7661 root->objectid, 7662 child_bytenr, 7663 child_bsize, 7664 BTRFS_QGROUP_OPER_SUB_SUBTREE, 7665 0); 7666 if (ret) 7667 goto out; 7668 7669 } 7670 7671 if (level == 0) { 7672 ret = account_leaf_items(trans, root, path->nodes[level]); 7673 if (ret) 7674 goto out; 7675 7676 /* Nonzero return here means we completed our search */ 7677 ret = adjust_slots_upwards(root, path, root_level); 7678 if (ret) 7679 break; 7680 7681 /* Restart search with new slots */ 7682 goto walk_down; 7683 } 7684 7685 level--; 7686 } 7687 7688 ret = 0; 7689 out: 7690 btrfs_free_path(path); 7691 7692 return ret; 7693 } 7694 7695 /* 7696 * helper to process tree block while walking down the tree. 7697 * 7698 * when wc->stage == UPDATE_BACKREF, this function updates 7699 * back refs for pointers in the block. 7700 * 7701 * NOTE: return value 1 means we should stop walking down. 7702 */ 7703 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 7704 struct btrfs_root *root, 7705 struct btrfs_path *path, 7706 struct walk_control *wc, int lookup_info) 7707 { 7708 int level = wc->level; 7709 struct extent_buffer *eb = path->nodes[level]; 7710 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 7711 int ret; 7712 7713 if (wc->stage == UPDATE_BACKREF && 7714 btrfs_header_owner(eb) != root->root_key.objectid) 7715 return 1; 7716 7717 /* 7718 * when reference count of tree block is 1, it won't increase 7719 * again. once full backref flag is set, we never clear it. 7720 */ 7721 if (lookup_info && 7722 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 7723 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 7724 BUG_ON(!path->locks[level]); 7725 ret = btrfs_lookup_extent_info(trans, root, 7726 eb->start, level, 1, 7727 &wc->refs[level], 7728 &wc->flags[level]); 7729 BUG_ON(ret == -ENOMEM); 7730 if (ret) 7731 return ret; 7732 BUG_ON(wc->refs[level] == 0); 7733 } 7734 7735 if (wc->stage == DROP_REFERENCE) { 7736 if (wc->refs[level] > 1) 7737 return 1; 7738 7739 if (path->locks[level] && !wc->keep_locks) { 7740 btrfs_tree_unlock_rw(eb, path->locks[level]); 7741 path->locks[level] = 0; 7742 } 7743 return 0; 7744 } 7745 7746 /* wc->stage == UPDATE_BACKREF */ 7747 if (!(wc->flags[level] & flag)) { 7748 BUG_ON(!path->locks[level]); 7749 ret = btrfs_inc_ref(trans, root, eb, 1); 7750 BUG_ON(ret); /* -ENOMEM */ 7751 ret = btrfs_dec_ref(trans, root, eb, 0); 7752 BUG_ON(ret); /* -ENOMEM */ 7753 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7754 eb->len, flag, 7755 btrfs_header_level(eb), 0); 7756 BUG_ON(ret); /* -ENOMEM */ 7757 wc->flags[level] |= flag; 7758 } 7759 7760 /* 7761 * the block is shared by multiple trees, so it's not good to 7762 * keep the tree lock 7763 */ 7764 if (path->locks[level] && level > 0) { 7765 btrfs_tree_unlock_rw(eb, path->locks[level]); 7766 path->locks[level] = 0; 7767 } 7768 return 0; 7769 } 7770 7771 /* 7772 * helper to process tree block pointer. 
7773 * 7774 * when wc->stage == DROP_REFERENCE, this function checks 7775 * reference count of the block pointed to. if the block 7776 * is shared and we need update back refs for the subtree 7777 * rooted at the block, this function changes wc->stage to 7778 * UPDATE_BACKREF. if the block is shared and there is no 7779 * need to update back, this function drops the reference 7780 * to the block. 7781 * 7782 * NOTE: return value 1 means we should stop walking down. 7783 */ 7784 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 7785 struct btrfs_root *root, 7786 struct btrfs_path *path, 7787 struct walk_control *wc, int *lookup_info) 7788 { 7789 u64 bytenr; 7790 u64 generation; 7791 u64 parent; 7792 u32 blocksize; 7793 struct btrfs_key key; 7794 struct extent_buffer *next; 7795 int level = wc->level; 7796 int reada = 0; 7797 int ret = 0; 7798 bool need_account = false; 7799 7800 generation = btrfs_node_ptr_generation(path->nodes[level], 7801 path->slots[level]); 7802 /* 7803 * if the lower level block was created before the snapshot 7804 * was created, we know there is no need to update back refs 7805 * for the subtree 7806 */ 7807 if (wc->stage == UPDATE_BACKREF && 7808 generation <= root->root_key.offset) { 7809 *lookup_info = 1; 7810 return 1; 7811 } 7812 7813 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 7814 blocksize = btrfs_level_size(root, level - 1); 7815 7816 next = btrfs_find_tree_block(root, bytenr, blocksize); 7817 if (!next) { 7818 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7819 if (!next) 7820 return -ENOMEM; 7821 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7822 level - 1); 7823 reada = 1; 7824 } 7825 btrfs_tree_lock(next); 7826 btrfs_set_lock_blocking(next); 7827 7828 ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, 7829 &wc->refs[level - 1], 7830 &wc->flags[level - 1]); 7831 if (ret < 0) { 7832 btrfs_tree_unlock(next); 7833 return ret; 7834 } 7835 7836 if (unlikely(wc->refs[level - 1] == 0)) { 7837 btrfs_err(root->fs_info, "Missing references."); 7838 BUG(); 7839 } 7840 *lookup_info = 0; 7841 7842 if (wc->stage == DROP_REFERENCE) { 7843 if (wc->refs[level - 1] > 1) { 7844 need_account = true; 7845 if (level == 1 && 7846 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7847 goto skip; 7848 7849 if (!wc->update_ref || 7850 generation <= root->root_key.offset) 7851 goto skip; 7852 7853 btrfs_node_key_to_cpu(path->nodes[level], &key, 7854 path->slots[level]); 7855 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 7856 if (ret < 0) 7857 goto skip; 7858 7859 wc->stage = UPDATE_BACKREF; 7860 wc->shared_level = level - 1; 7861 } 7862 } else { 7863 if (level == 1 && 7864 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7865 goto skip; 7866 } 7867 7868 if (!btrfs_buffer_uptodate(next, generation, 0)) { 7869 btrfs_tree_unlock(next); 7870 free_extent_buffer(next); 7871 next = NULL; 7872 *lookup_info = 1; 7873 } 7874 7875 if (!next) { 7876 if (reada && level == 1) 7877 reada_walk_down(trans, root, wc, path); 7878 next = read_tree_block(root, bytenr, blocksize, generation); 7879 if (!next || !extent_buffer_uptodate(next)) { 7880 free_extent_buffer(next); 7881 return -EIO; 7882 } 7883 btrfs_tree_lock(next); 7884 btrfs_set_lock_blocking(next); 7885 } 7886 7887 level--; 7888 BUG_ON(level != btrfs_header_level(next)); 7889 path->nodes[level] = next; 7890 path->slots[level] = 0; 7891 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7892 wc->level = level; 7893 if (wc->level == 1) 7894 
wc->reada_slot = 0; 7895 return 0; 7896 skip: 7897 wc->refs[level - 1] = 0; 7898 wc->flags[level - 1] = 0; 7899 if (wc->stage == DROP_REFERENCE) { 7900 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 7901 parent = path->nodes[level]->start; 7902 } else { 7903 BUG_ON(root->root_key.objectid != 7904 btrfs_header_owner(path->nodes[level])); 7905 parent = 0; 7906 } 7907 7908 if (need_account) { 7909 ret = account_shared_subtree(trans, root, next, 7910 generation, level - 1); 7911 if (ret) { 7912 printk_ratelimited(KERN_ERR "BTRFS: %s Error " 7913 "%d accounting shared subtree. Quota " 7914 "is out of sync, rescan required.\n", 7915 root->fs_info->sb->s_id, ret); 7916 } 7917 } 7918 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7919 root->root_key.objectid, level - 1, 0, 0); 7920 BUG_ON(ret); /* -ENOMEM */ 7921 } 7922 btrfs_tree_unlock(next); 7923 free_extent_buffer(next); 7924 *lookup_info = 1; 7925 return 1; 7926 } 7927 7928 /* 7929 * helper to process tree block while walking up the tree. 7930 * 7931 * when wc->stage == DROP_REFERENCE, this function drops 7932 * reference count on the block. 7933 * 7934 * when wc->stage == UPDATE_BACKREF, this function changes 7935 * wc->stage back to DROP_REFERENCE if we changed wc->stage 7936 * to UPDATE_BACKREF previously while processing the block. 7937 * 7938 * NOTE: return value 1 means we should stop walking up. 7939 */ 7940 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 7941 struct btrfs_root *root, 7942 struct btrfs_path *path, 7943 struct walk_control *wc) 7944 { 7945 int ret; 7946 int level = wc->level; 7947 struct extent_buffer *eb = path->nodes[level]; 7948 u64 parent = 0; 7949 7950 if (wc->stage == UPDATE_BACKREF) { 7951 BUG_ON(wc->shared_level < level); 7952 if (level < wc->shared_level) 7953 goto out; 7954 7955 ret = find_next_key(path, level + 1, &wc->update_progress); 7956 if (ret > 0) 7957 wc->update_ref = 0; 7958 7959 wc->stage = DROP_REFERENCE; 7960 wc->shared_level = -1; 7961 path->slots[level] = 0; 7962 7963 /* 7964 * check reference count again if the block isn't locked. 7965 * we should start walking down the tree again if reference 7966 * count is one. 7967 */ 7968 if (!path->locks[level]) { 7969 BUG_ON(level == 0); 7970 btrfs_tree_lock(eb); 7971 btrfs_set_lock_blocking(eb); 7972 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 7973 7974 ret = btrfs_lookup_extent_info(trans, root, 7975 eb->start, level, 1, 7976 &wc->refs[level], 7977 &wc->flags[level]); 7978 if (ret < 0) { 7979 btrfs_tree_unlock_rw(eb, path->locks[level]); 7980 path->locks[level] = 0; 7981 return ret; 7982 } 7983 BUG_ON(wc->refs[level] == 0); 7984 if (wc->refs[level] == 1) { 7985 btrfs_tree_unlock_rw(eb, path->locks[level]); 7986 path->locks[level] = 0; 7987 return 1; 7988 } 7989 } 7990 } 7991 7992 /* wc->stage == DROP_REFERENCE */ 7993 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 7994 7995 if (wc->refs[level] == 1) { 7996 if (level == 0) { 7997 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7998 ret = btrfs_dec_ref(trans, root, eb, 1); 7999 else 8000 ret = btrfs_dec_ref(trans, root, eb, 0); 8001 BUG_ON(ret); /* -ENOMEM */ 8002 ret = account_leaf_items(trans, root, eb); 8003 if (ret) { 8004 printk_ratelimited(KERN_ERR "BTRFS: %s Error " 8005 "%d accounting leaf items. 
Quota "
					"is out of sync, rescan required.\n",
					root->fs_info->sb->s_id, ret);
			}
		}
		/* make block locked assertion in clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		}
		clean_tree_block(trans, root, eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(eb));
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(path->nodes[level + 1]));
	}

	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;
}

static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
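 *
 * the walk alternates between walk_down_tree() and walk_up_tree(), and
 * periodically records drop_progress/drop_level in the root item so the
 * drop can resume where it left off after the transaction is ended (or
 * after an unmount/power failure).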
8116 * 8117 * If called with for_reloc == 0, may exit early with -EAGAIN 8118 */ 8119 int btrfs_drop_snapshot(struct btrfs_root *root, 8120 struct btrfs_block_rsv *block_rsv, int update_ref, 8121 int for_reloc) 8122 { 8123 struct btrfs_path *path; 8124 struct btrfs_trans_handle *trans; 8125 struct btrfs_root *tree_root = root->fs_info->tree_root; 8126 struct btrfs_root_item *root_item = &root->root_item; 8127 struct walk_control *wc; 8128 struct btrfs_key key; 8129 int err = 0; 8130 int ret; 8131 int level; 8132 bool root_dropped = false; 8133 8134 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); 8135 8136 path = btrfs_alloc_path(); 8137 if (!path) { 8138 err = -ENOMEM; 8139 goto out; 8140 } 8141 8142 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8143 if (!wc) { 8144 btrfs_free_path(path); 8145 err = -ENOMEM; 8146 goto out; 8147 } 8148 8149 trans = btrfs_start_transaction(tree_root, 0); 8150 if (IS_ERR(trans)) { 8151 err = PTR_ERR(trans); 8152 goto out_free; 8153 } 8154 8155 if (block_rsv) 8156 trans->block_rsv = block_rsv; 8157 8158 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 8159 level = btrfs_header_level(root->node); 8160 path->nodes[level] = btrfs_lock_root_node(root); 8161 btrfs_set_lock_blocking(path->nodes[level]); 8162 path->slots[level] = 0; 8163 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8164 memset(&wc->update_progress, 0, 8165 sizeof(wc->update_progress)); 8166 } else { 8167 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 8168 memcpy(&wc->update_progress, &key, 8169 sizeof(wc->update_progress)); 8170 8171 level = root_item->drop_level; 8172 BUG_ON(level == 0); 8173 path->lowest_level = level; 8174 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 8175 path->lowest_level = 0; 8176 if (ret < 0) { 8177 err = ret; 8178 goto out_end_trans; 8179 } 8180 WARN_ON(ret > 0); 8181 8182 /* 8183 * unlock our path, this is safe because only this 8184 * function is allowed to delete this snapshot 8185 */ 8186 btrfs_unlock_up_safe(path, 0); 8187 8188 level = btrfs_header_level(root->node); 8189 while (1) { 8190 btrfs_tree_lock(path->nodes[level]); 8191 btrfs_set_lock_blocking(path->nodes[level]); 8192 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8193 8194 ret = btrfs_lookup_extent_info(trans, root, 8195 path->nodes[level]->start, 8196 level, 1, &wc->refs[level], 8197 &wc->flags[level]); 8198 if (ret < 0) { 8199 err = ret; 8200 goto out_end_trans; 8201 } 8202 BUG_ON(wc->refs[level] == 0); 8203 8204 if (level == root_item->drop_level) 8205 break; 8206 8207 btrfs_tree_unlock(path->nodes[level]); 8208 path->locks[level] = 0; 8209 WARN_ON(wc->refs[level] != 1); 8210 level--; 8211 } 8212 } 8213 8214 wc->level = level; 8215 wc->shared_level = -1; 8216 wc->stage = DROP_REFERENCE; 8217 wc->update_ref = update_ref; 8218 wc->keep_locks = 0; 8219 wc->for_reloc = for_reloc; 8220 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8221 8222 while (1) { 8223 8224 ret = walk_down_tree(trans, root, path, wc); 8225 if (ret < 0) { 8226 err = ret; 8227 break; 8228 } 8229 8230 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 8231 if (ret < 0) { 8232 err = ret; 8233 break; 8234 } 8235 8236 if (ret > 0) { 8237 BUG_ON(wc->stage != DROP_REFERENCE); 8238 break; 8239 } 8240 8241 if (wc->stage == DROP_REFERENCE) { 8242 level = wc->level; 8243 btrfs_node_key(path->nodes[level], 8244 &root_item->drop_progress, 8245 path->slots[level]); 8246 root_item->drop_level = level; 8247 } 8248 8249 BUG_ON(wc->level == 0); 8250 if (btrfs_should_end_transaction(trans, tree_root) 
|| 8251 (!for_reloc && btrfs_need_cleaner_sleep(root))) { 8252 ret = btrfs_update_root(trans, tree_root, 8253 &root->root_key, 8254 root_item); 8255 if (ret) { 8256 btrfs_abort_transaction(trans, tree_root, ret); 8257 err = ret; 8258 goto out_end_trans; 8259 } 8260 8261 /* 8262 * Qgroup update accounting is run from 8263 * delayed ref handling. This usually works 8264 * out because delayed refs are normally the 8265 * only way qgroup updates are added. However, 8266 * we may have added updates during our tree 8267 * walk so run qgroups here to make sure we 8268 * don't lose any updates. 8269 */ 8270 ret = btrfs_delayed_qgroup_accounting(trans, 8271 root->fs_info); 8272 if (ret) 8273 printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8274 "running qgroup updates " 8275 "during snapshot delete. " 8276 "Quota is out of sync, " 8277 "rescan required.\n", ret); 8278 8279 btrfs_end_transaction_throttle(trans, tree_root); 8280 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8281 pr_debug("BTRFS: drop snapshot early exit\n"); 8282 err = -EAGAIN; 8283 goto out_free; 8284 } 8285 8286 trans = btrfs_start_transaction(tree_root, 0); 8287 if (IS_ERR(trans)) { 8288 err = PTR_ERR(trans); 8289 goto out_free; 8290 } 8291 if (block_rsv) 8292 trans->block_rsv = block_rsv; 8293 } 8294 } 8295 btrfs_release_path(path); 8296 if (err) 8297 goto out_end_trans; 8298 8299 ret = btrfs_del_root(trans, tree_root, &root->root_key); 8300 if (ret) { 8301 btrfs_abort_transaction(trans, tree_root, ret); 8302 goto out_end_trans; 8303 } 8304 8305 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 8306 ret = btrfs_find_root(tree_root, &root->root_key, path, 8307 NULL, NULL); 8308 if (ret < 0) { 8309 btrfs_abort_transaction(trans, tree_root, ret); 8310 err = ret; 8311 goto out_end_trans; 8312 } else if (ret > 0) { 8313 /* if we fail to delete the orphan item this time 8314 * around, it'll get picked up the next time. 8315 * 8316 * The most common failure here is just -ENOENT. 8317 */ 8318 btrfs_del_orphan_item(trans, tree_root, 8319 root->root_key.objectid); 8320 } 8321 } 8322 8323 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 8324 btrfs_drop_and_free_fs_root(tree_root->fs_info, root); 8325 } else { 8326 free_extent_buffer(root->node); 8327 free_extent_buffer(root->commit_root); 8328 btrfs_put_fs_root(root); 8329 } 8330 root_dropped = true; 8331 out_end_trans: 8332 ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); 8333 if (ret) 8334 printk_ratelimited(KERN_ERR "BTRFS: Failure %d " 8335 "running qgroup updates " 8336 "during snapshot delete. " 8337 "Quota is out of sync, " 8338 "rescan required.\n", ret); 8339 8340 btrfs_end_transaction_throttle(trans, tree_root); 8341 out_free: 8342 kfree(wc); 8343 btrfs_free_path(path); 8344 out: 8345 /* 8346 * So if we need to stop dropping the snapshot for whatever reason we 8347 * need to make sure to add it back to the dead root list so that we 8348 * keep trying to do the work later. This also cleans up roots if we 8349 * don't have it in the radix (like when we recover after a power fail 8350 * or unmount) so we don't leak memory. 8351 */ 8352 if (!for_reloc && root_dropped == false) 8353 btrfs_add_dead_root(root); 8354 if (err && err != -EAGAIN) 8355 btrfs_std_error(root->fs_info, err); 8356 return err; 8357 } 8358 8359 /* 8360 * drop subtree rooted at tree block 'node'. 
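 * (the subtree is walked with wc->stage == DROP_REFERENCE and keep_locks
 * set; 'parent' is assumed to carry a full backref, see the
 * wc->flags[parent_level] setup below.)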
8361 * 8362 * NOTE: this function will unlock and release tree block 'node' 8363 * only used by relocation code 8364 */ 8365 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 8366 struct btrfs_root *root, 8367 struct extent_buffer *node, 8368 struct extent_buffer *parent) 8369 { 8370 struct btrfs_path *path; 8371 struct walk_control *wc; 8372 int level; 8373 int parent_level; 8374 int ret = 0; 8375 int wret; 8376 8377 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 8378 8379 path = btrfs_alloc_path(); 8380 if (!path) 8381 return -ENOMEM; 8382 8383 wc = kzalloc(sizeof(*wc), GFP_NOFS); 8384 if (!wc) { 8385 btrfs_free_path(path); 8386 return -ENOMEM; 8387 } 8388 8389 btrfs_assert_tree_locked(parent); 8390 parent_level = btrfs_header_level(parent); 8391 extent_buffer_get(parent); 8392 path->nodes[parent_level] = parent; 8393 path->slots[parent_level] = btrfs_header_nritems(parent); 8394 8395 btrfs_assert_tree_locked(node); 8396 level = btrfs_header_level(node); 8397 path->nodes[level] = node; 8398 path->slots[level] = 0; 8399 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8400 8401 wc->refs[parent_level] = 1; 8402 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8403 wc->level = level; 8404 wc->shared_level = -1; 8405 wc->stage = DROP_REFERENCE; 8406 wc->update_ref = 0; 8407 wc->keep_locks = 1; 8408 wc->for_reloc = 1; 8409 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 8410 8411 while (1) { 8412 wret = walk_down_tree(trans, root, path, wc); 8413 if (wret < 0) { 8414 ret = wret; 8415 break; 8416 } 8417 8418 wret = walk_up_tree(trans, root, path, wc, parent_level); 8419 if (wret < 0) 8420 ret = wret; 8421 if (wret != 0) 8422 break; 8423 } 8424 8425 kfree(wc); 8426 btrfs_free_path(path); 8427 return ret; 8428 } 8429 8430 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 8431 { 8432 u64 num_devices; 8433 u64 stripped; 8434 8435 /* 8436 * if restripe for this chunk_type is on pick target profile and 8437 * return, otherwise do the usual balance 8438 */ 8439 stripped = get_restripe_target(root->fs_info, flags); 8440 if (stripped) 8441 return extended_to_chunk(stripped); 8442 8443 /* 8444 * we add in the count of missing devices because we want 8445 * to make sure that any RAID levels on a degraded FS 8446 * continue to be honored. 
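	 *
	 * for example: when num_devices works out to 1, RAID1/RAID10 chunks
	 * are converted to DUP and RAID0 chunks to single; with more devices,
	 * DUP chunks are converted to RAID1 and chunks that already carry a
	 * RAID profile are left alone.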
8447 */ 8448 num_devices = root->fs_info->fs_devices->rw_devices + 8449 root->fs_info->fs_devices->missing_devices; 8450 8451 stripped = BTRFS_BLOCK_GROUP_RAID0 | 8452 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 8453 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 8454 8455 if (num_devices == 1) { 8456 stripped |= BTRFS_BLOCK_GROUP_DUP; 8457 stripped = flags & ~stripped; 8458 8459 /* turn raid0 into single device chunks */ 8460 if (flags & BTRFS_BLOCK_GROUP_RAID0) 8461 return stripped; 8462 8463 /* turn mirroring into duplication */ 8464 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 8465 BTRFS_BLOCK_GROUP_RAID10)) 8466 return stripped | BTRFS_BLOCK_GROUP_DUP; 8467 } else { 8468 /* they already had raid on here, just return */ 8469 if (flags & stripped) 8470 return flags; 8471 8472 stripped |= BTRFS_BLOCK_GROUP_DUP; 8473 stripped = flags & ~stripped; 8474 8475 /* switch duplicated blocks with raid1 */ 8476 if (flags & BTRFS_BLOCK_GROUP_DUP) 8477 return stripped | BTRFS_BLOCK_GROUP_RAID1; 8478 8479 /* this is drive concat, leave it alone */ 8480 } 8481 8482 return flags; 8483 } 8484 8485 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) 8486 { 8487 struct btrfs_space_info *sinfo = cache->space_info; 8488 u64 num_bytes; 8489 u64 min_allocable_bytes; 8490 int ret = -ENOSPC; 8491 8492 8493 /* 8494 * We need some metadata space and system metadata space for 8495 * allocating chunks in some corner cases until we force to set 8496 * it to be readonly. 8497 */ 8498 if ((sinfo->flags & 8499 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 8500 !force) 8501 min_allocable_bytes = 1 * 1024 * 1024; 8502 else 8503 min_allocable_bytes = 0; 8504 8505 spin_lock(&sinfo->lock); 8506 spin_lock(&cache->lock); 8507 8508 if (cache->ro) { 8509 ret = 0; 8510 goto out; 8511 } 8512 8513 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8514 cache->bytes_super - btrfs_block_group_used(&cache->item); 8515 8516 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 8517 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + 8518 min_allocable_bytes <= sinfo->total_bytes) { 8519 sinfo->bytes_readonly += num_bytes; 8520 cache->ro = 1; 8521 ret = 0; 8522 } 8523 out: 8524 spin_unlock(&cache->lock); 8525 spin_unlock(&sinfo->lock); 8526 return ret; 8527 } 8528 8529 int btrfs_set_block_group_ro(struct btrfs_root *root, 8530 struct btrfs_block_group_cache *cache) 8531 8532 { 8533 struct btrfs_trans_handle *trans; 8534 u64 alloc_flags; 8535 int ret; 8536 8537 BUG_ON(cache->ro); 8538 8539 trans = btrfs_join_transaction(root); 8540 if (IS_ERR(trans)) 8541 return PTR_ERR(trans); 8542 8543 alloc_flags = update_block_group_flags(root, cache->flags); 8544 if (alloc_flags != cache->flags) { 8545 ret = do_chunk_alloc(trans, root, alloc_flags, 8546 CHUNK_ALLOC_FORCE); 8547 if (ret < 0) 8548 goto out; 8549 } 8550 8551 ret = set_block_group_ro(cache, 0); 8552 if (!ret) 8553 goto out; 8554 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8555 ret = do_chunk_alloc(trans, root, alloc_flags, 8556 CHUNK_ALLOC_FORCE); 8557 if (ret < 0) 8558 goto out; 8559 ret = set_block_group_ro(cache, 0); 8560 out: 8561 btrfs_end_transaction(trans, root); 8562 return ret; 8563 } 8564 8565 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 8566 struct btrfs_root *root, u64 type) 8567 { 8568 u64 alloc_flags = get_alloc_profile(root, type); 8569 return do_chunk_alloc(trans, root, alloc_flags, 8570 CHUNK_ALLOC_FORCE); 8571 } 8572 8573 /* 8574 * helper to 
account the unused space of all the readonly block group in the 8575 * list. takes mirrors into account. 8576 */ 8577 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) 8578 { 8579 struct btrfs_block_group_cache *block_group; 8580 u64 free_bytes = 0; 8581 int factor; 8582 8583 list_for_each_entry(block_group, groups_list, list) { 8584 spin_lock(&block_group->lock); 8585 8586 if (!block_group->ro) { 8587 spin_unlock(&block_group->lock); 8588 continue; 8589 } 8590 8591 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 8592 BTRFS_BLOCK_GROUP_RAID10 | 8593 BTRFS_BLOCK_GROUP_DUP)) 8594 factor = 2; 8595 else 8596 factor = 1; 8597 8598 free_bytes += (block_group->key.offset - 8599 btrfs_block_group_used(&block_group->item)) * 8600 factor; 8601 8602 spin_unlock(&block_group->lock); 8603 } 8604 8605 return free_bytes; 8606 } 8607 8608 /* 8609 * helper to account the unused space of all the readonly block group in the 8610 * space_info. takes mirrors into account. 8611 */ 8612 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 8613 { 8614 int i; 8615 u64 free_bytes = 0; 8616 8617 spin_lock(&sinfo->lock); 8618 8619 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 8620 if (!list_empty(&sinfo->block_groups[i])) 8621 free_bytes += __btrfs_get_ro_block_group_free_space( 8622 &sinfo->block_groups[i]); 8623 8624 spin_unlock(&sinfo->lock); 8625 8626 return free_bytes; 8627 } 8628 8629 void btrfs_set_block_group_rw(struct btrfs_root *root, 8630 struct btrfs_block_group_cache *cache) 8631 { 8632 struct btrfs_space_info *sinfo = cache->space_info; 8633 u64 num_bytes; 8634 8635 BUG_ON(!cache->ro); 8636 8637 spin_lock(&sinfo->lock); 8638 spin_lock(&cache->lock); 8639 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 8640 cache->bytes_super - btrfs_block_group_used(&cache->item); 8641 sinfo->bytes_readonly -= num_bytes; 8642 cache->ro = 0; 8643 spin_unlock(&cache->lock); 8644 spin_unlock(&sinfo->lock); 8645 } 8646 8647 /* 8648 * checks to see if its even possible to relocate this block group. 8649 * 8650 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 8651 * ok to go ahead and try. 8652 */ 8653 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) 8654 { 8655 struct btrfs_block_group_cache *block_group; 8656 struct btrfs_space_info *space_info; 8657 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 8658 struct btrfs_device *device; 8659 struct btrfs_trans_handle *trans; 8660 u64 min_free; 8661 u64 dev_min = 1; 8662 u64 dev_nr = 0; 8663 u64 target; 8664 int index; 8665 int full = 0; 8666 int ret = 0; 8667 8668 block_group = btrfs_lookup_block_group(root->fs_info, bytenr); 8669 8670 /* odd, couldn't find the block group, leave it alone */ 8671 if (!block_group) 8672 return -1; 8673 8674 min_free = btrfs_block_group_used(&block_group->item); 8675 8676 /* no bytes used, we're good */ 8677 if (!min_free) 8678 goto out; 8679 8680 space_info = block_group->space_info; 8681 spin_lock(&space_info->lock); 8682 8683 full = space_info->full; 8684 8685 /* 8686 * if this is the last block group we have in this space, we can't 8687 * relocate it unless we're able to allocate a new chunk below. 8688 * 8689 * Otherwise, we need to make sure we have room in the space to handle 8690 * all of the extents from this block group. 
If we can, we're good 8691 */ 8692 if ((space_info->total_bytes != block_group->key.offset) && 8693 (space_info->bytes_used + space_info->bytes_reserved + 8694 space_info->bytes_pinned + space_info->bytes_readonly + 8695 min_free < space_info->total_bytes)) { 8696 spin_unlock(&space_info->lock); 8697 goto out; 8698 } 8699 spin_unlock(&space_info->lock); 8700 8701 /* 8702 * ok we don't have enough space, but maybe we have free space on our 8703 * devices to allocate new chunks for relocation, so loop through our 8704 * alloc devices and guess if we have enough space. if this block 8705 * group is going to be restriped, run checks against the target 8706 * profile instead of the current one. 8707 */ 8708 ret = -1; 8709 8710 /* 8711 * index: 8712 * 0: raid10 8713 * 1: raid1 8714 * 2: dup 8715 * 3: raid0 8716 * 4: single 8717 */ 8718 target = get_restripe_target(root->fs_info, block_group->flags); 8719 if (target) { 8720 index = __get_raid_index(extended_to_chunk(target)); 8721 } else { 8722 /* 8723 * this is just a balance, so if we were marked as full 8724 * we know there is no space for a new chunk 8725 */ 8726 if (full) 8727 goto out; 8728 8729 index = get_block_group_index(block_group); 8730 } 8731 8732 if (index == BTRFS_RAID_RAID10) { 8733 dev_min = 4; 8734 /* Divide by 2 */ 8735 min_free >>= 1; 8736 } else if (index == BTRFS_RAID_RAID1) { 8737 dev_min = 2; 8738 } else if (index == BTRFS_RAID_DUP) { 8739 /* Multiply by 2 */ 8740 min_free <<= 1; 8741 } else if (index == BTRFS_RAID_RAID0) { 8742 dev_min = fs_devices->rw_devices; 8743 do_div(min_free, dev_min); 8744 } 8745 8746 /* We need to do this so that we can look at pending chunks */ 8747 trans = btrfs_join_transaction(root); 8748 if (IS_ERR(trans)) { 8749 ret = PTR_ERR(trans); 8750 goto out; 8751 } 8752 8753 mutex_lock(&root->fs_info->chunk_mutex); 8754 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8755 u64 dev_offset; 8756 8757 /* 8758 * check to make sure we can actually find a chunk with enough 8759 * space to fit our block group in. 
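		 *
		 * the per-profile requirements computed above work out to:
		 *   raid10: free >= min_free / 2 on at least 4 devices
		 *   raid1:  free >= min_free     on at least 2 devices
		 *   dup:    free >= min_free * 2 on at least 1 device
		 *   raid0:  free >= min_free / rw_devices on rw_devices devices
		 *   single: free >= min_free     on at least 1 device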
8760 */ 8761 if (device->total_bytes > device->bytes_used + min_free && 8762 !device->is_tgtdev_for_dev_replace) { 8763 ret = find_free_dev_extent(trans, device, min_free, 8764 &dev_offset, NULL); 8765 if (!ret) 8766 dev_nr++; 8767 8768 if (dev_nr >= dev_min) 8769 break; 8770 8771 ret = -1; 8772 } 8773 } 8774 mutex_unlock(&root->fs_info->chunk_mutex); 8775 btrfs_end_transaction(trans, root); 8776 out: 8777 btrfs_put_block_group(block_group); 8778 return ret; 8779 } 8780 8781 static int find_first_block_group(struct btrfs_root *root, 8782 struct btrfs_path *path, struct btrfs_key *key) 8783 { 8784 int ret = 0; 8785 struct btrfs_key found_key; 8786 struct extent_buffer *leaf; 8787 int slot; 8788 8789 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 8790 if (ret < 0) 8791 goto out; 8792 8793 while (1) { 8794 slot = path->slots[0]; 8795 leaf = path->nodes[0]; 8796 if (slot >= btrfs_header_nritems(leaf)) { 8797 ret = btrfs_next_leaf(root, path); 8798 if (ret == 0) 8799 continue; 8800 if (ret < 0) 8801 goto out; 8802 break; 8803 } 8804 btrfs_item_key_to_cpu(leaf, &found_key, slot); 8805 8806 if (found_key.objectid >= key->objectid && 8807 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 8808 ret = 0; 8809 goto out; 8810 } 8811 path->slots[0]++; 8812 } 8813 out: 8814 return ret; 8815 } 8816 8817 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 8818 { 8819 struct btrfs_block_group_cache *block_group; 8820 u64 last = 0; 8821 8822 while (1) { 8823 struct inode *inode; 8824 8825 block_group = btrfs_lookup_first_block_group(info, last); 8826 while (block_group) { 8827 spin_lock(&block_group->lock); 8828 if (block_group->iref) 8829 break; 8830 spin_unlock(&block_group->lock); 8831 block_group = next_block_group(info->tree_root, 8832 block_group); 8833 } 8834 if (!block_group) { 8835 if (last == 0) 8836 break; 8837 last = 0; 8838 continue; 8839 } 8840 8841 inode = block_group->inode; 8842 block_group->iref = 0; 8843 block_group->inode = NULL; 8844 spin_unlock(&block_group->lock); 8845 iput(inode); 8846 last = block_group->key.objectid + block_group->key.offset; 8847 btrfs_put_block_group(block_group); 8848 } 8849 } 8850 8851 int btrfs_free_block_groups(struct btrfs_fs_info *info) 8852 { 8853 struct btrfs_block_group_cache *block_group; 8854 struct btrfs_space_info *space_info; 8855 struct btrfs_caching_control *caching_ctl; 8856 struct rb_node *n; 8857 8858 down_write(&info->commit_root_sem); 8859 while (!list_empty(&info->caching_block_groups)) { 8860 caching_ctl = list_entry(info->caching_block_groups.next, 8861 struct btrfs_caching_control, list); 8862 list_del(&caching_ctl->list); 8863 put_caching_control(caching_ctl); 8864 } 8865 up_write(&info->commit_root_sem); 8866 8867 spin_lock(&info->block_group_cache_lock); 8868 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 8869 block_group = rb_entry(n, struct btrfs_block_group_cache, 8870 cache_node); 8871 rb_erase(&block_group->cache_node, 8872 &info->block_group_cache_tree); 8873 spin_unlock(&info->block_group_cache_lock); 8874 8875 down_write(&block_group->space_info->groups_sem); 8876 list_del(&block_group->list); 8877 up_write(&block_group->space_info->groups_sem); 8878 8879 if (block_group->cached == BTRFS_CACHE_STARTED) 8880 wait_block_group_cache_done(block_group); 8881 8882 /* 8883 * We haven't cached this block group, which means we could 8884 * possibly have excluded extents on this block group. 
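		 * (free_excluded_extents() below drops those exclusions for
		 * groups whose caching never started or ended in error.)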
8885 */ 8886 if (block_group->cached == BTRFS_CACHE_NO || 8887 block_group->cached == BTRFS_CACHE_ERROR) 8888 free_excluded_extents(info->extent_root, block_group); 8889 8890 btrfs_remove_free_space_cache(block_group); 8891 btrfs_put_block_group(block_group); 8892 8893 spin_lock(&info->block_group_cache_lock); 8894 } 8895 spin_unlock(&info->block_group_cache_lock); 8896 8897 /* now that all the block groups are freed, go through and 8898 * free all the space_info structs. This is only called during 8899 * the final stages of unmount, and so we know nobody is 8900 * using them. We call synchronize_rcu() once before we start, 8901 * just to be on the safe side. 8902 */ 8903 synchronize_rcu(); 8904 8905 release_global_block_rsv(info); 8906 8907 while (!list_empty(&info->space_info)) { 8908 int i; 8909 8910 space_info = list_entry(info->space_info.next, 8911 struct btrfs_space_info, 8912 list); 8913 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { 8914 if (WARN_ON(space_info->bytes_pinned > 0 || 8915 space_info->bytes_reserved > 0 || 8916 space_info->bytes_may_use > 0)) { 8917 dump_space_info(space_info, 0, 0); 8918 } 8919 } 8920 list_del(&space_info->list); 8921 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 8922 struct kobject *kobj; 8923 kobj = space_info->block_group_kobjs[i]; 8924 space_info->block_group_kobjs[i] = NULL; 8925 if (kobj) { 8926 kobject_del(kobj); 8927 kobject_put(kobj); 8928 } 8929 } 8930 kobject_del(&space_info->kobj); 8931 kobject_put(&space_info->kobj); 8932 } 8933 return 0; 8934 } 8935 8936 static void __link_block_group(struct btrfs_space_info *space_info, 8937 struct btrfs_block_group_cache *cache) 8938 { 8939 int index = get_block_group_index(cache); 8940 bool first = false; 8941 8942 down_write(&space_info->groups_sem); 8943 if (list_empty(&space_info->block_groups[index])) 8944 first = true; 8945 list_add_tail(&cache->list, &space_info->block_groups[index]); 8946 up_write(&space_info->groups_sem); 8947 8948 if (first) { 8949 struct raid_kobject *rkobj; 8950 int ret; 8951 8952 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 8953 if (!rkobj) 8954 goto out_err; 8955 rkobj->raid_type = index; 8956 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 8957 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 8958 "%s", get_raid_name(index)); 8959 if (ret) { 8960 kobject_put(&rkobj->kobj); 8961 goto out_err; 8962 } 8963 space_info->block_group_kobjs[index] = &rkobj->kobj; 8964 } 8965 8966 return; 8967 out_err: 8968 pr_warn("BTRFS: failed to add kobject for block cache. 
ignoring.\n"); 8969 } 8970 8971 static struct btrfs_block_group_cache * 8972 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) 8973 { 8974 struct btrfs_block_group_cache *cache; 8975 8976 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8977 if (!cache) 8978 return NULL; 8979 8980 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 8981 GFP_NOFS); 8982 if (!cache->free_space_ctl) { 8983 kfree(cache); 8984 return NULL; 8985 } 8986 8987 cache->key.objectid = start; 8988 cache->key.offset = size; 8989 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8990 8991 cache->sectorsize = root->sectorsize; 8992 cache->fs_info = root->fs_info; 8993 cache->full_stripe_len = btrfs_full_stripe_len(root, 8994 &root->fs_info->mapping_tree, 8995 start); 8996 atomic_set(&cache->count, 1); 8997 spin_lock_init(&cache->lock); 8998 init_rwsem(&cache->data_rwsem); 8999 INIT_LIST_HEAD(&cache->list); 9000 INIT_LIST_HEAD(&cache->cluster_list); 9001 INIT_LIST_HEAD(&cache->new_bg_list); 9002 btrfs_init_free_space_ctl(cache); 9003 9004 return cache; 9005 } 9006 9007 int btrfs_read_block_groups(struct btrfs_root *root) 9008 { 9009 struct btrfs_path *path; 9010 int ret; 9011 struct btrfs_block_group_cache *cache; 9012 struct btrfs_fs_info *info = root->fs_info; 9013 struct btrfs_space_info *space_info; 9014 struct btrfs_key key; 9015 struct btrfs_key found_key; 9016 struct extent_buffer *leaf; 9017 int need_clear = 0; 9018 u64 cache_gen; 9019 9020 root = info->extent_root; 9021 key.objectid = 0; 9022 key.offset = 0; 9023 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); 9024 path = btrfs_alloc_path(); 9025 if (!path) 9026 return -ENOMEM; 9027 path->reada = 1; 9028 9029 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 9030 if (btrfs_test_opt(root, SPACE_CACHE) && 9031 btrfs_super_generation(root->fs_info->super_copy) != cache_gen) 9032 need_clear = 1; 9033 if (btrfs_test_opt(root, CLEAR_CACHE)) 9034 need_clear = 1; 9035 9036 while (1) { 9037 ret = find_first_block_group(root, path, &key); 9038 if (ret > 0) 9039 break; 9040 if (ret != 0) 9041 goto error; 9042 9043 leaf = path->nodes[0]; 9044 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 9045 9046 cache = btrfs_create_block_group_cache(root, found_key.objectid, 9047 found_key.offset); 9048 if (!cache) { 9049 ret = -ENOMEM; 9050 goto error; 9051 } 9052 9053 if (need_clear) { 9054 /* 9055 * When we mount with old space cache, we need to 9056 * set BTRFS_DC_CLEAR and set dirty flag. 9057 * 9058 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 9059 * truncate the old free space cache inode and 9060 * setup a new one. 9061 * b) Setting 'dirty flag' makes sure that we flush 9062 * the new space cache info onto disk. 9063 */ 9064 cache->disk_cache_state = BTRFS_DC_CLEAR; 9065 if (btrfs_test_opt(root, SPACE_CACHE)) 9066 cache->dirty = 1; 9067 } 9068 9069 read_extent_buffer(leaf, &cache->item, 9070 btrfs_item_ptr_offset(leaf, path->slots[0]), 9071 sizeof(cache->item)); 9072 cache->flags = btrfs_block_group_flags(&cache->item); 9073 9074 key.objectid = found_key.objectid + found_key.offset; 9075 btrfs_release_path(path); 9076 9077 /* 9078 * We need to exclude the super stripes now so that the space 9079 * info has super bytes accounted for, otherwise we'll think 9080 * we have more space than we actually do. 9081 */ 9082 ret = exclude_super_stripes(root, cache); 9083 if (ret) { 9084 /* 9085 * We may have excluded something, so call this just in 9086 * case. 
 */
			free_excluded_extents(root, cache);
			btrfs_put_block_group(cache);
			goto error;
		}

		/*
		 * check for two cases: either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space, or we are empty, and we can just add all
		 * the space in and be done with it.  This saves us a lot of
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			free_excluded_extents(root, cache);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, root->fs_info,
					   found_key.objectid,
					   found_key.objectid +
					   found_key.offset);
			free_excluded_extents(root, cache);
		}

		ret = btrfs_add_block_group_cache(root->fs_info, cache);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			btrfs_put_block_group(cache);
			goto error;
		}

		ret = update_space_info(info, cache->flags, found_key.offset,
					btrfs_block_group_used(&cache->item),
					&space_info);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			spin_lock(&info->block_group_cache_lock);
			rb_erase(&cache->cache_node,
				 &info->block_group_cache_tree);
			spin_unlock(&info->block_group_cache_lock);
			btrfs_put_block_group(cache);
			goto error;
		}

		cache->space_info = space_info;
		spin_lock(&cache->space_info->lock);
		cache->space_info->bytes_readonly += cache->bytes_super;
		spin_unlock(&cache->space_info->lock);

		__link_block_group(space_info, cache);

		set_avail_alloc_bits(root->fs_info, cache->flags);
		if (btrfs_chunk_readonly(root, cache->key.objectid))
			set_block_group_ro(cache, 1);
	}

	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
		if (!(get_alloc_profile(root, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_RAID5 |
		       BTRFS_BLOCK_GROUP_RAID6 |
		       BTRFS_BLOCK_GROUP_DUP)))
			continue;
		/*
		 * avoid allocating from un-mirrored block groups if there
		 * are mirrored block groups.
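		 * (i.e. if this space_info has any RAID1/RAID10/RAID5/RAID6/
		 * DUP profile, force every RAID0 and SINGLE block group in it
		 * read-only.)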
9157 */ 9158 list_for_each_entry(cache, 9159 &space_info->block_groups[BTRFS_RAID_RAID0], 9160 list) 9161 set_block_group_ro(cache, 1); 9162 list_for_each_entry(cache, 9163 &space_info->block_groups[BTRFS_RAID_SINGLE], 9164 list) 9165 set_block_group_ro(cache, 1); 9166 } 9167 9168 init_global_block_rsv(info); 9169 ret = 0; 9170 error: 9171 btrfs_free_path(path); 9172 return ret; 9173 } 9174 9175 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 9176 struct btrfs_root *root) 9177 { 9178 struct btrfs_block_group_cache *block_group, *tmp; 9179 struct btrfs_root *extent_root = root->fs_info->extent_root; 9180 struct btrfs_block_group_item item; 9181 struct btrfs_key key; 9182 int ret = 0; 9183 9184 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, 9185 new_bg_list) { 9186 list_del_init(&block_group->new_bg_list); 9187 9188 if (ret) 9189 continue; 9190 9191 spin_lock(&block_group->lock); 9192 memcpy(&item, &block_group->item, sizeof(item)); 9193 memcpy(&key, &block_group->key, sizeof(key)); 9194 spin_unlock(&block_group->lock); 9195 9196 ret = btrfs_insert_item(trans, extent_root, &key, &item, 9197 sizeof(item)); 9198 if (ret) 9199 btrfs_abort_transaction(trans, extent_root, ret); 9200 ret = btrfs_finish_chunk_alloc(trans, extent_root, 9201 key.objectid, key.offset); 9202 if (ret) 9203 btrfs_abort_transaction(trans, extent_root, ret); 9204 } 9205 } 9206 9207 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 9208 struct btrfs_root *root, u64 bytes_used, 9209 u64 type, u64 chunk_objectid, u64 chunk_offset, 9210 u64 size) 9211 { 9212 int ret; 9213 struct btrfs_root *extent_root; 9214 struct btrfs_block_group_cache *cache; 9215 9216 extent_root = root->fs_info->extent_root; 9217 9218 btrfs_set_log_full_commit(root->fs_info, trans); 9219 9220 cache = btrfs_create_block_group_cache(root, chunk_offset, size); 9221 if (!cache) 9222 return -ENOMEM; 9223 9224 btrfs_set_block_group_used(&cache->item, bytes_used); 9225 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 9226 btrfs_set_block_group_flags(&cache->item, type); 9227 9228 cache->flags = type; 9229 cache->last_byte_to_unpin = (u64)-1; 9230 cache->cached = BTRFS_CACHE_FINISHED; 9231 ret = exclude_super_stripes(root, cache); 9232 if (ret) { 9233 /* 9234 * We may have excluded something, so call this just in 9235 * case. 
9236 */ 9237 free_excluded_extents(root, cache); 9238 btrfs_put_block_group(cache); 9239 return ret; 9240 } 9241 9242 add_new_free_space(cache, root->fs_info, chunk_offset, 9243 chunk_offset + size); 9244 9245 free_excluded_extents(root, cache); 9246 9247 ret = btrfs_add_block_group_cache(root->fs_info, cache); 9248 if (ret) { 9249 btrfs_remove_free_space_cache(cache); 9250 btrfs_put_block_group(cache); 9251 return ret; 9252 } 9253 9254 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 9255 &cache->space_info); 9256 if (ret) { 9257 btrfs_remove_free_space_cache(cache); 9258 spin_lock(&root->fs_info->block_group_cache_lock); 9259 rb_erase(&cache->cache_node, 9260 &root->fs_info->block_group_cache_tree); 9261 spin_unlock(&root->fs_info->block_group_cache_lock); 9262 btrfs_put_block_group(cache); 9263 return ret; 9264 } 9265 update_global_block_rsv(root->fs_info); 9266 9267 spin_lock(&cache->space_info->lock); 9268 cache->space_info->bytes_readonly += cache->bytes_super; 9269 spin_unlock(&cache->space_info->lock); 9270 9271 __link_block_group(cache->space_info, cache); 9272 9273 list_add_tail(&cache->new_bg_list, &trans->new_bgs); 9274 9275 set_avail_alloc_bits(extent_root->fs_info, type); 9276 9277 return 0; 9278 } 9279 9280 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 9281 { 9282 u64 extra_flags = chunk_to_extended(flags) & 9283 BTRFS_EXTENDED_PROFILE_MASK; 9284 9285 write_seqlock(&fs_info->profiles_lock); 9286 if (flags & BTRFS_BLOCK_GROUP_DATA) 9287 fs_info->avail_data_alloc_bits &= ~extra_flags; 9288 if (flags & BTRFS_BLOCK_GROUP_METADATA) 9289 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 9290 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 9291 fs_info->avail_system_alloc_bits &= ~extra_flags; 9292 write_sequnlock(&fs_info->profiles_lock); 9293 } 9294 9295 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 9296 struct btrfs_root *root, u64 group_start) 9297 { 9298 struct btrfs_path *path; 9299 struct btrfs_block_group_cache *block_group; 9300 struct btrfs_free_cluster *cluster; 9301 struct btrfs_root *tree_root = root->fs_info->tree_root; 9302 struct btrfs_key key; 9303 struct inode *inode; 9304 struct kobject *kobj = NULL; 9305 int ret; 9306 int index; 9307 int factor; 9308 9309 root = root->fs_info->extent_root; 9310 9311 block_group = btrfs_lookup_block_group(root->fs_info, group_start); 9312 BUG_ON(!block_group); 9313 BUG_ON(!block_group->ro); 9314 9315 /* 9316 * Free the reserved super bytes from this block group before 9317 * remove it. 
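	 *
	 * The rest of the removal below: detach the group from any allocation
	 * cluster, orphan and drop the free space cache inode, delete the
	 * BTRFS_FREE_SPACE_OBJECTID item, unlink the group from the rb-tree
	 * and its space_info, adjust the space_info counters and finally
	 * delete the block group item itself.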
9318 */ 9319 free_excluded_extents(root, block_group); 9320 9321 memcpy(&key, &block_group->key, sizeof(key)); 9322 index = get_block_group_index(block_group); 9323 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 9324 BTRFS_BLOCK_GROUP_RAID1 | 9325 BTRFS_BLOCK_GROUP_RAID10)) 9326 factor = 2; 9327 else 9328 factor = 1; 9329 9330 /* make sure this block group isn't part of an allocation cluster */ 9331 cluster = &root->fs_info->data_alloc_cluster; 9332 spin_lock(&cluster->refill_lock); 9333 btrfs_return_cluster_to_free_space(block_group, cluster); 9334 spin_unlock(&cluster->refill_lock); 9335 9336 /* 9337 * make sure this block group isn't part of a metadata 9338 * allocation cluster 9339 */ 9340 cluster = &root->fs_info->meta_alloc_cluster; 9341 spin_lock(&cluster->refill_lock); 9342 btrfs_return_cluster_to_free_space(block_group, cluster); 9343 spin_unlock(&cluster->refill_lock); 9344 9345 path = btrfs_alloc_path(); 9346 if (!path) { 9347 ret = -ENOMEM; 9348 goto out; 9349 } 9350 9351 inode = lookup_free_space_inode(tree_root, block_group, path); 9352 if (!IS_ERR(inode)) { 9353 ret = btrfs_orphan_add(trans, inode); 9354 if (ret) { 9355 btrfs_add_delayed_iput(inode); 9356 goto out; 9357 } 9358 clear_nlink(inode); 9359 /* One for the block groups ref */ 9360 spin_lock(&block_group->lock); 9361 if (block_group->iref) { 9362 block_group->iref = 0; 9363 block_group->inode = NULL; 9364 spin_unlock(&block_group->lock); 9365 iput(inode); 9366 } else { 9367 spin_unlock(&block_group->lock); 9368 } 9369 /* One for our lookup ref */ 9370 btrfs_add_delayed_iput(inode); 9371 } 9372 9373 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 9374 key.offset = block_group->key.objectid; 9375 key.type = 0; 9376 9377 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 9378 if (ret < 0) 9379 goto out; 9380 if (ret > 0) 9381 btrfs_release_path(path); 9382 if (ret == 0) { 9383 ret = btrfs_del_item(trans, tree_root, path); 9384 if (ret) 9385 goto out; 9386 btrfs_release_path(path); 9387 } 9388 9389 spin_lock(&root->fs_info->block_group_cache_lock); 9390 rb_erase(&block_group->cache_node, 9391 &root->fs_info->block_group_cache_tree); 9392 9393 if (root->fs_info->first_logical_byte == block_group->key.objectid) 9394 root->fs_info->first_logical_byte = (u64)-1; 9395 spin_unlock(&root->fs_info->block_group_cache_lock); 9396 9397 down_write(&block_group->space_info->groups_sem); 9398 /* 9399 * we must use list_del_init so people can check to see if they 9400 * are still on the list after taking the semaphore 9401 */ 9402 list_del_init(&block_group->list); 9403 if (list_empty(&block_group->space_info->block_groups[index])) { 9404 kobj = block_group->space_info->block_group_kobjs[index]; 9405 block_group->space_info->block_group_kobjs[index] = NULL; 9406 clear_avail_alloc_bits(root->fs_info, block_group->flags); 9407 } 9408 up_write(&block_group->space_info->groups_sem); 9409 if (kobj) { 9410 kobject_del(kobj); 9411 kobject_put(kobj); 9412 } 9413 9414 if (block_group->cached == BTRFS_CACHE_STARTED) 9415 wait_block_group_cache_done(block_group); 9416 9417 btrfs_remove_free_space_cache(block_group); 9418 9419 spin_lock(&block_group->space_info->lock); 9420 block_group->space_info->total_bytes -= block_group->key.offset; 9421 block_group->space_info->bytes_readonly -= block_group->key.offset; 9422 block_group->space_info->disk_total -= block_group->key.offset * factor; 9423 spin_unlock(&block_group->space_info->lock); 9424 9425 memcpy(&key, &block_group->key, sizeof(key)); 9426 9427 
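	/*
	 * The two btrfs_put_block_group() calls below drop the reference we
	 * took in the lookup at the top of this function and the original
	 * reference that was held for the block group cache rb-tree (the
	 * rb_erase() above did not drop it).
	 */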
btrfs_clear_space_info_full(root->fs_info); 9428 9429 btrfs_put_block_group(block_group); 9430 btrfs_put_block_group(block_group); 9431 9432 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 9433 if (ret > 0) 9434 ret = -EIO; 9435 if (ret < 0) 9436 goto out; 9437 9438 ret = btrfs_del_item(trans, root, path); 9439 out: 9440 btrfs_free_path(path); 9441 return ret; 9442 } 9443 9444 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 9445 { 9446 struct btrfs_space_info *space_info; 9447 struct btrfs_super_block *disk_super; 9448 u64 features; 9449 u64 flags; 9450 int mixed = 0; 9451 int ret; 9452 9453 disk_super = fs_info->super_copy; 9454 if (!btrfs_super_root(disk_super)) 9455 return 1; 9456 9457 features = btrfs_super_incompat_flags(disk_super); 9458 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 9459 mixed = 1; 9460 9461 flags = BTRFS_BLOCK_GROUP_SYSTEM; 9462 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9463 if (ret) 9464 goto out; 9465 9466 if (mixed) { 9467 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 9468 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9469 } else { 9470 flags = BTRFS_BLOCK_GROUP_METADATA; 9471 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9472 if (ret) 9473 goto out; 9474 9475 flags = BTRFS_BLOCK_GROUP_DATA; 9476 ret = update_space_info(fs_info, flags, 0, 0, &space_info); 9477 } 9478 out: 9479 return ret; 9480 } 9481 9482 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 9483 { 9484 return unpin_extent_range(root, start, end); 9485 } 9486 9487 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 9488 u64 num_bytes, u64 *actual_bytes) 9489 { 9490 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); 9491 } 9492 9493 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) 9494 { 9495 struct btrfs_fs_info *fs_info = root->fs_info; 9496 struct btrfs_block_group_cache *cache = NULL; 9497 u64 group_trimmed; 9498 u64 start; 9499 u64 end; 9500 u64 trimmed = 0; 9501 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 9502 int ret = 0; 9503 9504 /* 9505 * try to trim all FS space, our block group may start from non-zero. 
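	 *
	 * each block group overlapping [range->start, range->start + len) is
	 * trimmed over the overlapping part only, and only after its free
	 * space has been cached; the total trimmed byte count is returned in
	 * range->len.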
 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
				ret = wait_block_group_cache_done(cache);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	range->len = trimmed;
	return ret;
}

/*
 * btrfs_{start,end}_write() is similar to mnt_{want,drop}_write(); they are
 * used to prevent some tasks from writing data into the page cache via nocow
 * before the subvolume is snapshotted, but to flush the data to disk after
 * the snapshot creation.
 */
void btrfs_end_nocow_write(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we wake up
	 * waiters.
	 */
	smp_mb();
	if (waitqueue_active(&root->subv_writers->wait))
		wake_up(&root->subv_writers->wait);
}

int btrfs_start_nocow_write(struct btrfs_root *root)
{
	if (unlikely(atomic_read(&root->will_be_snapshoted)))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
		btrfs_end_nocow_write(root);
		return 0;
	}
	return 1;
}
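
/*
 * Illustrative pairing of the two helpers above (a sketch, not an actual
 * call site; fall_back_to_cow() and do_nocow_write() are hypothetical):
 *
 *	if (!btrfs_start_nocow_write(root))
 *		return fall_back_to_cow();	(snapshot pending, must cow)
 *	err = do_nocow_write();
 *	btrfs_end_nocow_write(root);
 *	return err;
 */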