/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * Control flags for do_chunk_alloc's force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				     u64 num_bytes,
int delalloc); 97 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 98 u64 num_bytes); 99 static int __reserve_metadata_bytes(struct btrfs_root *root, 100 struct btrfs_space_info *space_info, 101 u64 orig_bytes, 102 enum btrfs_reserve_flush_enum flush); 103 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 104 struct btrfs_space_info *space_info, 105 u64 num_bytes); 106 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 107 struct btrfs_space_info *space_info, 108 u64 num_bytes); 109 110 static noinline int 111 block_group_cache_done(struct btrfs_block_group_cache *cache) 112 { 113 smp_mb(); 114 return cache->cached == BTRFS_CACHE_FINISHED || 115 cache->cached == BTRFS_CACHE_ERROR; 116 } 117 118 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) 119 { 120 return (cache->flags & bits) == bits; 121 } 122 123 void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 124 { 125 atomic_inc(&cache->count); 126 } 127 128 void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 129 { 130 if (atomic_dec_and_test(&cache->count)) { 131 WARN_ON(cache->pinned > 0); 132 WARN_ON(cache->reserved > 0); 133 kfree(cache->free_space_ctl); 134 kfree(cache); 135 } 136 } 137 138 /* 139 * this adds the block group to the fs_info rb tree for the block group 140 * cache 141 */ 142 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, 143 struct btrfs_block_group_cache *block_group) 144 { 145 struct rb_node **p; 146 struct rb_node *parent = NULL; 147 struct btrfs_block_group_cache *cache; 148 149 spin_lock(&info->block_group_cache_lock); 150 p = &info->block_group_cache_tree.rb_node; 151 152 while (*p) { 153 parent = *p; 154 cache = rb_entry(parent, struct btrfs_block_group_cache, 155 cache_node); 156 if (block_group->key.objectid < cache->key.objectid) { 157 p = &(*p)->rb_left; 158 } else if (block_group->key.objectid > cache->key.objectid) { 159 p = &(*p)->rb_right; 160 } else { 161 spin_unlock(&info->block_group_cache_lock); 162 return -EEXIST; 163 } 164 } 165 166 rb_link_node(&block_group->cache_node, parent, p); 167 rb_insert_color(&block_group->cache_node, 168 &info->block_group_cache_tree); 169 170 if (info->first_logical_byte > block_group->key.objectid) 171 info->first_logical_byte = block_group->key.objectid; 172 173 spin_unlock(&info->block_group_cache_lock); 174 175 return 0; 176 } 177 178 /* 179 * This will return the block group at or after bytenr if contains is 0, else 180 * it will return the block group that contains the bytenr 181 */ 182 static struct btrfs_block_group_cache * 183 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, 184 int contains) 185 { 186 struct btrfs_block_group_cache *cache, *ret = NULL; 187 struct rb_node *n; 188 u64 end, start; 189 190 spin_lock(&info->block_group_cache_lock); 191 n = info->block_group_cache_tree.rb_node; 192 193 while (n) { 194 cache = rb_entry(n, struct btrfs_block_group_cache, 195 cache_node); 196 end = cache->key.objectid + cache->key.offset - 1; 197 start = cache->key.objectid; 198 199 if (bytenr < start) { 200 if (!contains && (!ret || start < ret->key.objectid)) 201 ret = cache; 202 n = n->rb_left; 203 } else if (bytenr > start) { 204 if (contains && bytenr <= end) { 205 ret = cache; 206 break; 207 } 208 n = n->rb_right; 209 } else { 210 ret = cache; 211 break; 212 } 213 } 214 if (ret) { 215 btrfs_get_block_group(ret); 216 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) 217 info->first_logical_byte = 
ret->key.objectid; 218 } 219 spin_unlock(&info->block_group_cache_lock); 220 221 return ret; 222 } 223 224 static int add_excluded_extent(struct btrfs_fs_info *fs_info, 225 u64 start, u64 num_bytes) 226 { 227 u64 end = start + num_bytes - 1; 228 set_extent_bits(&fs_info->freed_extents[0], 229 start, end, EXTENT_UPTODATE); 230 set_extent_bits(&fs_info->freed_extents[1], 231 start, end, EXTENT_UPTODATE); 232 return 0; 233 } 234 235 static void free_excluded_extents(struct btrfs_fs_info *fs_info, 236 struct btrfs_block_group_cache *cache) 237 { 238 u64 start, end; 239 240 start = cache->key.objectid; 241 end = start + cache->key.offset - 1; 242 243 clear_extent_bits(&fs_info->freed_extents[0], 244 start, end, EXTENT_UPTODATE); 245 clear_extent_bits(&fs_info->freed_extents[1], 246 start, end, EXTENT_UPTODATE); 247 } 248 249 static int exclude_super_stripes(struct btrfs_fs_info *fs_info, 250 struct btrfs_block_group_cache *cache) 251 { 252 u64 bytenr; 253 u64 *logical; 254 int stripe_len; 255 int i, nr, ret; 256 257 if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { 258 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; 259 cache->bytes_super += stripe_len; 260 ret = add_excluded_extent(fs_info, cache->key.objectid, 261 stripe_len); 262 if (ret) 263 return ret; 264 } 265 266 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 267 bytenr = btrfs_sb_offset(i); 268 ret = btrfs_rmap_block(fs_info, cache->key.objectid, 269 bytenr, 0, &logical, &nr, &stripe_len); 270 if (ret) 271 return ret; 272 273 while (nr--) { 274 u64 start, len; 275 276 if (logical[nr] > cache->key.objectid + 277 cache->key.offset) 278 continue; 279 280 if (logical[nr] + stripe_len <= cache->key.objectid) 281 continue; 282 283 start = logical[nr]; 284 if (start < cache->key.objectid) { 285 start = cache->key.objectid; 286 len = (logical[nr] + stripe_len) - start; 287 } else { 288 len = min_t(u64, stripe_len, 289 cache->key.objectid + 290 cache->key.offset - start); 291 } 292 293 cache->bytes_super += len; 294 ret = add_excluded_extent(fs_info, start, len); 295 if (ret) { 296 kfree(logical); 297 return ret; 298 } 299 } 300 301 kfree(logical); 302 } 303 return 0; 304 } 305 306 static struct btrfs_caching_control * 307 get_caching_control(struct btrfs_block_group_cache *cache) 308 { 309 struct btrfs_caching_control *ctl; 310 311 spin_lock(&cache->lock); 312 if (!cache->caching_ctl) { 313 spin_unlock(&cache->lock); 314 return NULL; 315 } 316 317 ctl = cache->caching_ctl; 318 atomic_inc(&ctl->count); 319 spin_unlock(&cache->lock); 320 return ctl; 321 } 322 323 static void put_caching_control(struct btrfs_caching_control *ctl) 324 { 325 if (atomic_dec_and_test(&ctl->count)) 326 kfree(ctl); 327 } 328 329 #ifdef CONFIG_BTRFS_DEBUG 330 static void fragment_free_space(struct btrfs_block_group_cache *block_group) 331 { 332 struct btrfs_fs_info *fs_info = block_group->fs_info; 333 u64 start = block_group->key.objectid; 334 u64 len = block_group->key.offset; 335 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? 336 fs_info->nodesize : fs_info->sectorsize; 337 u64 step = chunk << 1; 338 339 while (len > chunk) { 340 btrfs_remove_free_space(block_group, start, chunk); 341 start += step; 342 if (len < step) 343 len = 0; 344 else 345 len -= step; 346 } 347 } 348 #endif 349 350 /* 351 * this is only called by cache_block_group, since we could have freed extents 352 * we need to check the pinned_extents for any extents that can't be used yet 353 * since their free space will be released as soon as the transaction commits. 
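 *
 * For example (illustrative offsets only): if the block group spans
 * [1G, 2G) and the range [1G + 64K, 1G + 128K) is still pinned while the
 * group is being cached, only [1G, 1G + 64K) and [1G + 128K, 2G) are
 * added as free space here; the pinned range becomes usable once the
 * transaction that freed it commits.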
354 */ 355 u64 add_new_free_space(struct btrfs_block_group_cache *block_group, 356 struct btrfs_fs_info *info, u64 start, u64 end) 357 { 358 u64 extent_start, extent_end, size, total_added = 0; 359 int ret; 360 361 while (start < end) { 362 ret = find_first_extent_bit(info->pinned_extents, start, 363 &extent_start, &extent_end, 364 EXTENT_DIRTY | EXTENT_UPTODATE, 365 NULL); 366 if (ret) 367 break; 368 369 if (extent_start <= start) { 370 start = extent_end + 1; 371 } else if (extent_start > start && extent_start < end) { 372 size = extent_start - start; 373 total_added += size; 374 ret = btrfs_add_free_space(block_group, start, 375 size); 376 BUG_ON(ret); /* -ENOMEM or logic error */ 377 start = extent_end + 1; 378 } else { 379 break; 380 } 381 } 382 383 if (start < end) { 384 size = end - start; 385 total_added += size; 386 ret = btrfs_add_free_space(block_group, start, size); 387 BUG_ON(ret); /* -ENOMEM or logic error */ 388 } 389 390 return total_added; 391 } 392 393 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 394 { 395 struct btrfs_block_group_cache *block_group = caching_ctl->block_group; 396 struct btrfs_fs_info *fs_info = block_group->fs_info; 397 struct btrfs_root *extent_root = fs_info->extent_root; 398 struct btrfs_path *path; 399 struct extent_buffer *leaf; 400 struct btrfs_key key; 401 u64 total_found = 0; 402 u64 last = 0; 403 u32 nritems; 404 int ret; 405 bool wakeup = true; 406 407 path = btrfs_alloc_path(); 408 if (!path) 409 return -ENOMEM; 410 411 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 412 413 #ifdef CONFIG_BTRFS_DEBUG 414 /* 415 * If we're fragmenting we don't want to make anybody think we can 416 * allocate from this block group until we've had a chance to fragment 417 * the free space. 418 */ 419 if (btrfs_should_fragment_free_space(block_group)) 420 wakeup = false; 421 #endif 422 /* 423 * We don't want to deadlock with somebody trying to allocate a new 424 * extent for the extent root while also trying to search the extent 425 * root to add free space. 
So we skip locking and search the commit 426 * root, since its read-only 427 */ 428 path->skip_locking = 1; 429 path->search_commit_root = 1; 430 path->reada = READA_FORWARD; 431 432 key.objectid = last; 433 key.offset = 0; 434 key.type = BTRFS_EXTENT_ITEM_KEY; 435 436 next: 437 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 438 if (ret < 0) 439 goto out; 440 441 leaf = path->nodes[0]; 442 nritems = btrfs_header_nritems(leaf); 443 444 while (1) { 445 if (btrfs_fs_closing(fs_info) > 1) { 446 last = (u64)-1; 447 break; 448 } 449 450 if (path->slots[0] < nritems) { 451 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 452 } else { 453 ret = find_next_key(path, 0, &key); 454 if (ret) 455 break; 456 457 if (need_resched() || 458 rwsem_is_contended(&fs_info->commit_root_sem)) { 459 if (wakeup) 460 caching_ctl->progress = last; 461 btrfs_release_path(path); 462 up_read(&fs_info->commit_root_sem); 463 mutex_unlock(&caching_ctl->mutex); 464 cond_resched(); 465 mutex_lock(&caching_ctl->mutex); 466 down_read(&fs_info->commit_root_sem); 467 goto next; 468 } 469 470 ret = btrfs_next_leaf(extent_root, path); 471 if (ret < 0) 472 goto out; 473 if (ret) 474 break; 475 leaf = path->nodes[0]; 476 nritems = btrfs_header_nritems(leaf); 477 continue; 478 } 479 480 if (key.objectid < last) { 481 key.objectid = last; 482 key.offset = 0; 483 key.type = BTRFS_EXTENT_ITEM_KEY; 484 485 if (wakeup) 486 caching_ctl->progress = last; 487 btrfs_release_path(path); 488 goto next; 489 } 490 491 if (key.objectid < block_group->key.objectid) { 492 path->slots[0]++; 493 continue; 494 } 495 496 if (key.objectid >= block_group->key.objectid + 497 block_group->key.offset) 498 break; 499 500 if (key.type == BTRFS_EXTENT_ITEM_KEY || 501 key.type == BTRFS_METADATA_ITEM_KEY) { 502 total_found += add_new_free_space(block_group, 503 fs_info, last, 504 key.objectid); 505 if (key.type == BTRFS_METADATA_ITEM_KEY) 506 last = key.objectid + 507 fs_info->nodesize; 508 else 509 last = key.objectid + key.offset; 510 511 if (total_found > CACHING_CTL_WAKE_UP) { 512 total_found = 0; 513 if (wakeup) 514 wake_up(&caching_ctl->wait); 515 } 516 } 517 path->slots[0]++; 518 } 519 ret = 0; 520 521 total_found += add_new_free_space(block_group, fs_info, last, 522 block_group->key.objectid + 523 block_group->key.offset); 524 caching_ctl->progress = (u64)-1; 525 526 out: 527 btrfs_free_path(path); 528 return ret; 529 } 530 531 static noinline void caching_thread(struct btrfs_work *work) 532 { 533 struct btrfs_block_group_cache *block_group; 534 struct btrfs_fs_info *fs_info; 535 struct btrfs_caching_control *caching_ctl; 536 struct btrfs_root *extent_root; 537 int ret; 538 539 caching_ctl = container_of(work, struct btrfs_caching_control, work); 540 block_group = caching_ctl->block_group; 541 fs_info = block_group->fs_info; 542 extent_root = fs_info->extent_root; 543 544 mutex_lock(&caching_ctl->mutex); 545 down_read(&fs_info->commit_root_sem); 546 547 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 548 ret = load_free_space_tree(caching_ctl); 549 else 550 ret = load_extent_tree_free(caching_ctl); 551 552 spin_lock(&block_group->lock); 553 block_group->caching_ctl = NULL; 554 block_group->cached = ret ? 
				    BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(fs_info, block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info and then some other thread
	 * starts a transaction commit which tries to do an allocation while
	 * the first thread is still loading the space cache info.  The
	 * previous loop should have kept us from choosing this block group,
	 * but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load
	 * here, so we can wait for it to finish, otherwise we could end up
	 * allocating from a block group whose cache gets evicted for one
	 * reason or another.
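	 *
	 * Concretely (an illustrative scenario): one task may be part way
	 * through load_free_space_cache(), with cache->cached set to
	 * BTRFS_CACHE_FAST, when another task committing a transaction
	 * calls in here to allocate; the second task must wait in the loop
	 * below for the fast load to finish rather than treat the block
	 * group as uncached.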
617 */ 618 while (cache->cached == BTRFS_CACHE_FAST) { 619 struct btrfs_caching_control *ctl; 620 621 ctl = cache->caching_ctl; 622 atomic_inc(&ctl->count); 623 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); 624 spin_unlock(&cache->lock); 625 626 schedule(); 627 628 finish_wait(&ctl->wait, &wait); 629 put_caching_control(ctl); 630 spin_lock(&cache->lock); 631 } 632 633 if (cache->cached != BTRFS_CACHE_NO) { 634 spin_unlock(&cache->lock); 635 kfree(caching_ctl); 636 return 0; 637 } 638 WARN_ON(cache->caching_ctl); 639 cache->caching_ctl = caching_ctl; 640 cache->cached = BTRFS_CACHE_FAST; 641 spin_unlock(&cache->lock); 642 643 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 644 mutex_lock(&caching_ctl->mutex); 645 ret = load_free_space_cache(fs_info, cache); 646 647 spin_lock(&cache->lock); 648 if (ret == 1) { 649 cache->caching_ctl = NULL; 650 cache->cached = BTRFS_CACHE_FINISHED; 651 cache->last_byte_to_unpin = (u64)-1; 652 caching_ctl->progress = (u64)-1; 653 } else { 654 if (load_cache_only) { 655 cache->caching_ctl = NULL; 656 cache->cached = BTRFS_CACHE_NO; 657 } else { 658 cache->cached = BTRFS_CACHE_STARTED; 659 cache->has_caching_ctl = 1; 660 } 661 } 662 spin_unlock(&cache->lock); 663 #ifdef CONFIG_BTRFS_DEBUG 664 if (ret == 1 && 665 btrfs_should_fragment_free_space(cache)) { 666 u64 bytes_used; 667 668 spin_lock(&cache->space_info->lock); 669 spin_lock(&cache->lock); 670 bytes_used = cache->key.offset - 671 btrfs_block_group_used(&cache->item); 672 cache->space_info->bytes_used += bytes_used >> 1; 673 spin_unlock(&cache->lock); 674 spin_unlock(&cache->space_info->lock); 675 fragment_free_space(cache); 676 } 677 #endif 678 mutex_unlock(&caching_ctl->mutex); 679 680 wake_up(&caching_ctl->wait); 681 if (ret == 1) { 682 put_caching_control(caching_ctl); 683 free_excluded_extents(fs_info, cache); 684 return 0; 685 } 686 } else { 687 /* 688 * We're either using the free space tree or no caching at all. 689 * Set cached to the appropriate value and wakeup any waiters. 
690 */ 691 spin_lock(&cache->lock); 692 if (load_cache_only) { 693 cache->caching_ctl = NULL; 694 cache->cached = BTRFS_CACHE_NO; 695 } else { 696 cache->cached = BTRFS_CACHE_STARTED; 697 cache->has_caching_ctl = 1; 698 } 699 spin_unlock(&cache->lock); 700 wake_up(&caching_ctl->wait); 701 } 702 703 if (load_cache_only) { 704 put_caching_control(caching_ctl); 705 return 0; 706 } 707 708 down_write(&fs_info->commit_root_sem); 709 atomic_inc(&caching_ctl->count); 710 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 711 up_write(&fs_info->commit_root_sem); 712 713 btrfs_get_block_group(cache); 714 715 btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); 716 717 return ret; 718 } 719 720 /* 721 * return the block group that starts at or after bytenr 722 */ 723 static struct btrfs_block_group_cache * 724 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) 725 { 726 return block_group_cache_tree_search(info, bytenr, 0); 727 } 728 729 /* 730 * return the block group that contains the given bytenr 731 */ 732 struct btrfs_block_group_cache *btrfs_lookup_block_group( 733 struct btrfs_fs_info *info, 734 u64 bytenr) 735 { 736 return block_group_cache_tree_search(info, bytenr, 1); 737 } 738 739 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, 740 u64 flags) 741 { 742 struct list_head *head = &info->space_info; 743 struct btrfs_space_info *found; 744 745 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; 746 747 rcu_read_lock(); 748 list_for_each_entry_rcu(found, head, list) { 749 if (found->flags & flags) { 750 rcu_read_unlock(); 751 return found; 752 } 753 } 754 rcu_read_unlock(); 755 return NULL; 756 } 757 758 /* 759 * after adding space to the filesystem, we need to clear the full flags 760 * on all the space infos. 761 */ 762 void btrfs_clear_space_info_full(struct btrfs_fs_info *info) 763 { 764 struct list_head *head = &info->space_info; 765 struct btrfs_space_info *found; 766 767 rcu_read_lock(); 768 list_for_each_entry_rcu(found, head, list) 769 found->full = 0; 770 rcu_read_unlock(); 771 } 772 773 /* simple helper to search for an existing data extent at a given offset */ 774 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) 775 { 776 int ret; 777 struct btrfs_key key; 778 struct btrfs_path *path; 779 780 path = btrfs_alloc_path(); 781 if (!path) 782 return -ENOMEM; 783 784 key.objectid = start; 785 key.offset = len; 786 key.type = BTRFS_EXTENT_ITEM_KEY; 787 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 788 btrfs_free_path(path); 789 return ret; 790 } 791 792 /* 793 * helper function to lookup reference count and flags of a tree block. 794 * 795 * the head node for delayed ref is used to store the sum of all the 796 * reference count modifications queued up in the rbtree. the head 797 * node may also store the extent flags to set. This way you can check 798 * to see what the reference count and extent flags would be if all of 799 * the delayed refs are not processed. 
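 *
 * For example (illustrative numbers only): if the extent item on disk
 * records 2 references and a delayed ref head with ref_mod +1 is still
 * queued, the value returned in *refs is 3, even though the extent tree
 * itself has not been updated yet.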
800 */ 801 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 802 struct btrfs_fs_info *fs_info, u64 bytenr, 803 u64 offset, int metadata, u64 *refs, u64 *flags) 804 { 805 struct btrfs_delayed_ref_head *head; 806 struct btrfs_delayed_ref_root *delayed_refs; 807 struct btrfs_path *path; 808 struct btrfs_extent_item *ei; 809 struct extent_buffer *leaf; 810 struct btrfs_key key; 811 u32 item_size; 812 u64 num_refs; 813 u64 extent_flags; 814 int ret; 815 816 /* 817 * If we don't have skinny metadata, don't bother doing anything 818 * different 819 */ 820 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 821 offset = fs_info->nodesize; 822 metadata = 0; 823 } 824 825 path = btrfs_alloc_path(); 826 if (!path) 827 return -ENOMEM; 828 829 if (!trans) { 830 path->skip_locking = 1; 831 path->search_commit_root = 1; 832 } 833 834 search_again: 835 key.objectid = bytenr; 836 key.offset = offset; 837 if (metadata) 838 key.type = BTRFS_METADATA_ITEM_KEY; 839 else 840 key.type = BTRFS_EXTENT_ITEM_KEY; 841 842 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); 843 if (ret < 0) 844 goto out_free; 845 846 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 847 if (path->slots[0]) { 848 path->slots[0]--; 849 btrfs_item_key_to_cpu(path->nodes[0], &key, 850 path->slots[0]); 851 if (key.objectid == bytenr && 852 key.type == BTRFS_EXTENT_ITEM_KEY && 853 key.offset == fs_info->nodesize) 854 ret = 0; 855 } 856 } 857 858 if (ret == 0) { 859 leaf = path->nodes[0]; 860 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 861 if (item_size >= sizeof(*ei)) { 862 ei = btrfs_item_ptr(leaf, path->slots[0], 863 struct btrfs_extent_item); 864 num_refs = btrfs_extent_refs(leaf, ei); 865 extent_flags = btrfs_extent_flags(leaf, ei); 866 } else { 867 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 868 struct btrfs_extent_item_v0 *ei0; 869 BUG_ON(item_size != sizeof(*ei0)); 870 ei0 = btrfs_item_ptr(leaf, path->slots[0], 871 struct btrfs_extent_item_v0); 872 num_refs = btrfs_extent_refs_v0(leaf, ei0); 873 /* FIXME: this isn't correct for data */ 874 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 875 #else 876 BUG(); 877 #endif 878 } 879 BUG_ON(num_refs == 0); 880 } else { 881 num_refs = 0; 882 extent_flags = 0; 883 ret = 0; 884 } 885 886 if (!trans) 887 goto out; 888 889 delayed_refs = &trans->transaction->delayed_refs; 890 spin_lock(&delayed_refs->lock); 891 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 892 if (head) { 893 if (!mutex_trylock(&head->mutex)) { 894 atomic_inc(&head->node.refs); 895 spin_unlock(&delayed_refs->lock); 896 897 btrfs_release_path(path); 898 899 /* 900 * Mutex was contended, block until it's released and try 901 * again 902 */ 903 mutex_lock(&head->mutex); 904 mutex_unlock(&head->mutex); 905 btrfs_put_delayed_ref(&head->node); 906 goto search_again; 907 } 908 spin_lock(&head->lock); 909 if (head->extent_op && head->extent_op->update_flags) 910 extent_flags |= head->extent_op->flags_to_set; 911 else 912 BUG_ON(num_refs == 0); 913 914 num_refs += head->node.ref_mod; 915 spin_unlock(&head->lock); 916 mutex_unlock(&head->mutex); 917 } 918 spin_unlock(&delayed_refs->lock); 919 out: 920 WARN_ON(num_refs == 0); 921 if (refs) 922 *refs = num_refs; 923 if (flags) 924 *flags = extent_flags; 925 out_free: 926 btrfs_free_path(path); 927 return ret; 928 } 929 930 /* 931 * Back reference rules. 
 * Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * a b-tree search.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Full back refs are actually generic and can
 * be used in all cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead: every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for the
 * pointers in it.  This means most tree-related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for the
 * pointers in the block.  Remove these full back refs and add implicit
 * back refs for every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * the pointers in the block.  Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts.  The
 * original implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer
 * in the new block and increase the lower level extents' reference counts.
 *
 * Back reference key composition:
 *
 * The key objectid corresponds to the first byte in the extent, and the
 * key type is used to differentiate between types of back refs.  There
 * are different meanings of the key offset for different types of back
 * refs.
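 *
 * For example (illustrative values only): a data extent at bytenr B that
 * is referenced by root 5, inode 257, file offset 0 gets an implicit back
 * ref keyed (B, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0)), while the
 * same extent referenced through a shared tree block at bytenr P gets
 * (B, BTRFS_SHARED_DATA_REF_KEY, P).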
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in:
 *
 * (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 * (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return
ret; 1094 BUG_ON(ret); /* Corruption */ 1095 1096 btrfs_extend_item(fs_info, path, new_size); 1097 1098 leaf = path->nodes[0]; 1099 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1100 btrfs_set_extent_refs(leaf, item, refs); 1101 /* FIXME: get real generation */ 1102 btrfs_set_extent_generation(leaf, item, 0); 1103 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1104 btrfs_set_extent_flags(leaf, item, 1105 BTRFS_EXTENT_FLAG_TREE_BLOCK | 1106 BTRFS_BLOCK_FLAG_FULL_BACKREF); 1107 bi = (struct btrfs_tree_block_info *)(item + 1); 1108 /* FIXME: get first key of the block */ 1109 memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi)); 1110 btrfs_set_tree_block_level(leaf, bi, (int)owner); 1111 } else { 1112 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); 1113 } 1114 btrfs_mark_buffer_dirty(leaf); 1115 return 0; 1116 } 1117 #endif 1118 1119 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 1120 { 1121 u32 high_crc = ~(u32)0; 1122 u32 low_crc = ~(u32)0; 1123 __le64 lenum; 1124 1125 lenum = cpu_to_le64(root_objectid); 1126 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 1127 lenum = cpu_to_le64(owner); 1128 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1129 lenum = cpu_to_le64(offset); 1130 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 1131 1132 return ((u64)high_crc << 31) ^ (u64)low_crc; 1133 } 1134 1135 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 1136 struct btrfs_extent_data_ref *ref) 1137 { 1138 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 1139 btrfs_extent_data_ref_objectid(leaf, ref), 1140 btrfs_extent_data_ref_offset(leaf, ref)); 1141 } 1142 1143 static int match_extent_data_ref(struct extent_buffer *leaf, 1144 struct btrfs_extent_data_ref *ref, 1145 u64 root_objectid, u64 owner, u64 offset) 1146 { 1147 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 1148 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 1149 btrfs_extent_data_ref_offset(leaf, ref) != offset) 1150 return 0; 1151 return 1; 1152 } 1153 1154 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 1155 struct btrfs_fs_info *fs_info, 1156 struct btrfs_path *path, 1157 u64 bytenr, u64 parent, 1158 u64 root_objectid, 1159 u64 owner, u64 offset) 1160 { 1161 struct btrfs_root *root = fs_info->extent_root; 1162 struct btrfs_key key; 1163 struct btrfs_extent_data_ref *ref; 1164 struct extent_buffer *leaf; 1165 u32 nritems; 1166 int ret; 1167 int recow; 1168 int err = -ENOENT; 1169 1170 key.objectid = bytenr; 1171 if (parent) { 1172 key.type = BTRFS_SHARED_DATA_REF_KEY; 1173 key.offset = parent; 1174 } else { 1175 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1176 key.offset = hash_extent_data_ref(root_objectid, 1177 owner, offset); 1178 } 1179 again: 1180 recow = 0; 1181 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1182 if (ret < 0) { 1183 err = ret; 1184 goto fail; 1185 } 1186 1187 if (parent) { 1188 if (!ret) 1189 return 0; 1190 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1191 key.type = BTRFS_EXTENT_REF_V0_KEY; 1192 btrfs_release_path(path); 1193 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1194 if (ret < 0) { 1195 err = ret; 1196 goto fail; 1197 } 1198 if (!ret) 1199 return 0; 1200 #endif 1201 goto fail; 1202 } 1203 1204 leaf = path->nodes[0]; 1205 nritems = btrfs_header_nritems(leaf); 1206 while (1) { 1207 if (path->slots[0] >= nritems) { 1208 ret = btrfs_next_leaf(root, path); 1209 if (ret < 0) 1210 err = ret; 1211 if (ret) 1212 goto fail; 1213 1214 leaf 
= path->nodes[0]; 1215 nritems = btrfs_header_nritems(leaf); 1216 recow = 1; 1217 } 1218 1219 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1220 if (key.objectid != bytenr || 1221 key.type != BTRFS_EXTENT_DATA_REF_KEY) 1222 goto fail; 1223 1224 ref = btrfs_item_ptr(leaf, path->slots[0], 1225 struct btrfs_extent_data_ref); 1226 1227 if (match_extent_data_ref(leaf, ref, root_objectid, 1228 owner, offset)) { 1229 if (recow) { 1230 btrfs_release_path(path); 1231 goto again; 1232 } 1233 err = 0; 1234 break; 1235 } 1236 path->slots[0]++; 1237 } 1238 fail: 1239 return err; 1240 } 1241 1242 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 1243 struct btrfs_fs_info *fs_info, 1244 struct btrfs_path *path, 1245 u64 bytenr, u64 parent, 1246 u64 root_objectid, u64 owner, 1247 u64 offset, int refs_to_add) 1248 { 1249 struct btrfs_root *root = fs_info->extent_root; 1250 struct btrfs_key key; 1251 struct extent_buffer *leaf; 1252 u32 size; 1253 u32 num_refs; 1254 int ret; 1255 1256 key.objectid = bytenr; 1257 if (parent) { 1258 key.type = BTRFS_SHARED_DATA_REF_KEY; 1259 key.offset = parent; 1260 size = sizeof(struct btrfs_shared_data_ref); 1261 } else { 1262 key.type = BTRFS_EXTENT_DATA_REF_KEY; 1263 key.offset = hash_extent_data_ref(root_objectid, 1264 owner, offset); 1265 size = sizeof(struct btrfs_extent_data_ref); 1266 } 1267 1268 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 1269 if (ret && ret != -EEXIST) 1270 goto fail; 1271 1272 leaf = path->nodes[0]; 1273 if (parent) { 1274 struct btrfs_shared_data_ref *ref; 1275 ref = btrfs_item_ptr(leaf, path->slots[0], 1276 struct btrfs_shared_data_ref); 1277 if (ret == 0) { 1278 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 1279 } else { 1280 num_refs = btrfs_shared_data_ref_count(leaf, ref); 1281 num_refs += refs_to_add; 1282 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 1283 } 1284 } else { 1285 struct btrfs_extent_data_ref *ref; 1286 while (ret == -EEXIST) { 1287 ref = btrfs_item_ptr(leaf, path->slots[0], 1288 struct btrfs_extent_data_ref); 1289 if (match_extent_data_ref(leaf, ref, root_objectid, 1290 owner, offset)) 1291 break; 1292 btrfs_release_path(path); 1293 key.offset++; 1294 ret = btrfs_insert_empty_item(trans, root, path, &key, 1295 size); 1296 if (ret && ret != -EEXIST) 1297 goto fail; 1298 1299 leaf = path->nodes[0]; 1300 } 1301 ref = btrfs_item_ptr(leaf, path->slots[0], 1302 struct btrfs_extent_data_ref); 1303 if (ret == 0) { 1304 btrfs_set_extent_data_ref_root(leaf, ref, 1305 root_objectid); 1306 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 1307 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 1308 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 1309 } else { 1310 num_refs = btrfs_extent_data_ref_count(leaf, ref); 1311 num_refs += refs_to_add; 1312 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 1313 } 1314 } 1315 btrfs_mark_buffer_dirty(leaf); 1316 ret = 0; 1317 fail: 1318 btrfs_release_path(path); 1319 return ret; 1320 } 1321 1322 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 1323 struct btrfs_fs_info *fs_info, 1324 struct btrfs_path *path, 1325 int refs_to_drop, int *last_ref) 1326 { 1327 struct btrfs_key key; 1328 struct btrfs_extent_data_ref *ref1 = NULL; 1329 struct btrfs_shared_data_ref *ref2 = NULL; 1330 struct extent_buffer *leaf; 1331 u32 num_refs = 0; 1332 int ret = 0; 1333 1334 leaf = path->nodes[0]; 1335 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1336 1337 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 
{ 1338 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1339 struct btrfs_extent_data_ref); 1340 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1341 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1342 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1343 struct btrfs_shared_data_ref); 1344 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1345 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1346 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1347 struct btrfs_extent_ref_v0 *ref0; 1348 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1349 struct btrfs_extent_ref_v0); 1350 num_refs = btrfs_ref_count_v0(leaf, ref0); 1351 #endif 1352 } else { 1353 BUG(); 1354 } 1355 1356 BUG_ON(num_refs < refs_to_drop); 1357 num_refs -= refs_to_drop; 1358 1359 if (num_refs == 0) { 1360 ret = btrfs_del_item(trans, fs_info->extent_root, path); 1361 *last_ref = 1; 1362 } else { 1363 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 1364 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 1365 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 1366 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 1367 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1368 else { 1369 struct btrfs_extent_ref_v0 *ref0; 1370 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1371 struct btrfs_extent_ref_v0); 1372 btrfs_set_ref_count_v0(leaf, ref0, num_refs); 1373 } 1374 #endif 1375 btrfs_mark_buffer_dirty(leaf); 1376 } 1377 return ret; 1378 } 1379 1380 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 1381 struct btrfs_extent_inline_ref *iref) 1382 { 1383 struct btrfs_key key; 1384 struct extent_buffer *leaf; 1385 struct btrfs_extent_data_ref *ref1; 1386 struct btrfs_shared_data_ref *ref2; 1387 u32 num_refs = 0; 1388 1389 leaf = path->nodes[0]; 1390 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1391 if (iref) { 1392 if (btrfs_extent_inline_ref_type(leaf, iref) == 1393 BTRFS_EXTENT_DATA_REF_KEY) { 1394 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 1395 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1396 } else { 1397 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 1398 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1399 } 1400 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1401 ref1 = btrfs_item_ptr(leaf, path->slots[0], 1402 struct btrfs_extent_data_ref); 1403 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 1404 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1405 ref2 = btrfs_item_ptr(leaf, path->slots[0], 1406 struct btrfs_shared_data_ref); 1407 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 1408 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1409 } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { 1410 struct btrfs_extent_ref_v0 *ref0; 1411 ref0 = btrfs_item_ptr(leaf, path->slots[0], 1412 struct btrfs_extent_ref_v0); 1413 num_refs = btrfs_ref_count_v0(leaf, ref0); 1414 #endif 1415 } else { 1416 WARN_ON(1); 1417 } 1418 return num_refs; 1419 } 1420 1421 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 1422 struct btrfs_fs_info *fs_info, 1423 struct btrfs_path *path, 1424 u64 bytenr, u64 parent, 1425 u64 root_objectid) 1426 { 1427 struct btrfs_root *root = fs_info->extent_root; 1428 struct btrfs_key key; 1429 int ret; 1430 1431 key.objectid = bytenr; 1432 if (parent) { 1433 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1434 key.offset = parent; 1435 } else { 1436 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1437 key.offset = root_objectid; 1438 } 1439 1440 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1441 if (ret > 0) 1442 ret = -ENOENT; 1443 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1444 if (ret == 
-ENOENT && parent) { 1445 btrfs_release_path(path); 1446 key.type = BTRFS_EXTENT_REF_V0_KEY; 1447 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1448 if (ret > 0) 1449 ret = -ENOENT; 1450 } 1451 #endif 1452 return ret; 1453 } 1454 1455 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 1456 struct btrfs_fs_info *fs_info, 1457 struct btrfs_path *path, 1458 u64 bytenr, u64 parent, 1459 u64 root_objectid) 1460 { 1461 struct btrfs_key key; 1462 int ret; 1463 1464 key.objectid = bytenr; 1465 if (parent) { 1466 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 1467 key.offset = parent; 1468 } else { 1469 key.type = BTRFS_TREE_BLOCK_REF_KEY; 1470 key.offset = root_objectid; 1471 } 1472 1473 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, 1474 path, &key, 0); 1475 btrfs_release_path(path); 1476 return ret; 1477 } 1478 1479 static inline int extent_ref_type(u64 parent, u64 owner) 1480 { 1481 int type; 1482 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1483 if (parent > 0) 1484 type = BTRFS_SHARED_BLOCK_REF_KEY; 1485 else 1486 type = BTRFS_TREE_BLOCK_REF_KEY; 1487 } else { 1488 if (parent > 0) 1489 type = BTRFS_SHARED_DATA_REF_KEY; 1490 else 1491 type = BTRFS_EXTENT_DATA_REF_KEY; 1492 } 1493 return type; 1494 } 1495 1496 static int find_next_key(struct btrfs_path *path, int level, 1497 struct btrfs_key *key) 1498 1499 { 1500 for (; level < BTRFS_MAX_LEVEL; level++) { 1501 if (!path->nodes[level]) 1502 break; 1503 if (path->slots[level] + 1 >= 1504 btrfs_header_nritems(path->nodes[level])) 1505 continue; 1506 if (level == 0) 1507 btrfs_item_key_to_cpu(path->nodes[level], key, 1508 path->slots[level] + 1); 1509 else 1510 btrfs_node_key_to_cpu(path->nodes[level], key, 1511 path->slots[level] + 1); 1512 return 0; 1513 } 1514 return 1; 1515 } 1516 1517 /* 1518 * look for inline back ref. if back ref is found, *ref_ret is set 1519 * to the address of inline back ref, and 0 is returned. 1520 * 1521 * if back ref isn't found, *ref_ret is set to the address where it 1522 * should be inserted, and -ENOENT is returned. 1523 * 1524 * if insert is true and there are too many inline back refs, the path 1525 * points to the extent item, and -EAGAIN is returned. 1526 * 1527 * NOTE: inline back refs are ordered in the same way that back ref 1528 * items in the tree are ordered. 1529 */ 1530 static noinline_for_stack 1531 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 1532 struct btrfs_fs_info *fs_info, 1533 struct btrfs_path *path, 1534 struct btrfs_extent_inline_ref **ref_ret, 1535 u64 bytenr, u64 num_bytes, 1536 u64 parent, u64 root_objectid, 1537 u64 owner, u64 offset, int insert) 1538 { 1539 struct btrfs_root *root = fs_info->extent_root; 1540 struct btrfs_key key; 1541 struct extent_buffer *leaf; 1542 struct btrfs_extent_item *ei; 1543 struct btrfs_extent_inline_ref *iref; 1544 u64 flags; 1545 u64 item_size; 1546 unsigned long ptr; 1547 unsigned long end; 1548 int extra_size; 1549 int type; 1550 int want; 1551 int ret; 1552 int err = 0; 1553 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 1554 1555 key.objectid = bytenr; 1556 key.type = BTRFS_EXTENT_ITEM_KEY; 1557 key.offset = num_bytes; 1558 1559 want = extent_ref_type(parent, owner); 1560 if (insert) { 1561 extra_size = btrfs_extent_inline_ref_size(want); 1562 path->keep_locks = 1; 1563 } else 1564 extra_size = -1; 1565 1566 /* 1567 * Owner is our parent level, so we can just add one to get the level 1568 * for the block we are interested in. 
1569 */ 1570 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 1571 key.type = BTRFS_METADATA_ITEM_KEY; 1572 key.offset = owner; 1573 } 1574 1575 again: 1576 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 1577 if (ret < 0) { 1578 err = ret; 1579 goto out; 1580 } 1581 1582 /* 1583 * We may be a newly converted file system which still has the old fat 1584 * extent entries for metadata, so try and see if we have one of those. 1585 */ 1586 if (ret > 0 && skinny_metadata) { 1587 skinny_metadata = false; 1588 if (path->slots[0]) { 1589 path->slots[0]--; 1590 btrfs_item_key_to_cpu(path->nodes[0], &key, 1591 path->slots[0]); 1592 if (key.objectid == bytenr && 1593 key.type == BTRFS_EXTENT_ITEM_KEY && 1594 key.offset == num_bytes) 1595 ret = 0; 1596 } 1597 if (ret) { 1598 key.objectid = bytenr; 1599 key.type = BTRFS_EXTENT_ITEM_KEY; 1600 key.offset = num_bytes; 1601 btrfs_release_path(path); 1602 goto again; 1603 } 1604 } 1605 1606 if (ret && !insert) { 1607 err = -ENOENT; 1608 goto out; 1609 } else if (WARN_ON(ret)) { 1610 err = -EIO; 1611 goto out; 1612 } 1613 1614 leaf = path->nodes[0]; 1615 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1616 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1617 if (item_size < sizeof(*ei)) { 1618 if (!insert) { 1619 err = -ENOENT; 1620 goto out; 1621 } 1622 ret = convert_extent_item_v0(trans, fs_info, path, owner, 1623 extra_size); 1624 if (ret < 0) { 1625 err = ret; 1626 goto out; 1627 } 1628 leaf = path->nodes[0]; 1629 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1630 } 1631 #endif 1632 BUG_ON(item_size < sizeof(*ei)); 1633 1634 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1635 flags = btrfs_extent_flags(leaf, ei); 1636 1637 ptr = (unsigned long)(ei + 1); 1638 end = (unsigned long)ei + item_size; 1639 1640 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 1641 ptr += sizeof(struct btrfs_tree_block_info); 1642 BUG_ON(ptr > end); 1643 } 1644 1645 err = -ENOENT; 1646 while (1) { 1647 if (ptr >= end) { 1648 WARN_ON(ptr > end); 1649 break; 1650 } 1651 iref = (struct btrfs_extent_inline_ref *)ptr; 1652 type = btrfs_extent_inline_ref_type(leaf, iref); 1653 if (want < type) 1654 break; 1655 if (want > type) { 1656 ptr += btrfs_extent_inline_ref_size(type); 1657 continue; 1658 } 1659 1660 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1661 struct btrfs_extent_data_ref *dref; 1662 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1663 if (match_extent_data_ref(leaf, dref, root_objectid, 1664 owner, offset)) { 1665 err = 0; 1666 break; 1667 } 1668 if (hash_extent_data_ref_item(leaf, dref) < 1669 hash_extent_data_ref(root_objectid, owner, offset)) 1670 break; 1671 } else { 1672 u64 ref_offset; 1673 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 1674 if (parent > 0) { 1675 if (parent == ref_offset) { 1676 err = 0; 1677 break; 1678 } 1679 if (ref_offset < parent) 1680 break; 1681 } else { 1682 if (root_objectid == ref_offset) { 1683 err = 0; 1684 break; 1685 } 1686 if (ref_offset < root_objectid) 1687 break; 1688 } 1689 } 1690 ptr += btrfs_extent_inline_ref_size(type); 1691 } 1692 if (err == -ENOENT && insert) { 1693 if (item_size + extra_size >= 1694 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 1695 err = -EAGAIN; 1696 goto out; 1697 } 1698 /* 1699 * To add new inline back ref, we have to make sure 1700 * there is no corresponding back ref item. 
1701 * For simplicity, we just do not add new inline back 1702 * ref if there is any kind of item for this block 1703 */ 1704 if (find_next_key(path, 0, &key) == 0 && 1705 key.objectid == bytenr && 1706 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 1707 err = -EAGAIN; 1708 goto out; 1709 } 1710 } 1711 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 1712 out: 1713 if (insert) { 1714 path->keep_locks = 0; 1715 btrfs_unlock_up_safe(path, 1); 1716 } 1717 return err; 1718 } 1719 1720 /* 1721 * helper to add new inline back ref 1722 */ 1723 static noinline_for_stack 1724 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 1725 struct btrfs_path *path, 1726 struct btrfs_extent_inline_ref *iref, 1727 u64 parent, u64 root_objectid, 1728 u64 owner, u64 offset, int refs_to_add, 1729 struct btrfs_delayed_extent_op *extent_op) 1730 { 1731 struct extent_buffer *leaf; 1732 struct btrfs_extent_item *ei; 1733 unsigned long ptr; 1734 unsigned long end; 1735 unsigned long item_offset; 1736 u64 refs; 1737 int size; 1738 int type; 1739 1740 leaf = path->nodes[0]; 1741 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1742 item_offset = (unsigned long)iref - (unsigned long)ei; 1743 1744 type = extent_ref_type(parent, owner); 1745 size = btrfs_extent_inline_ref_size(type); 1746 1747 btrfs_extend_item(fs_info, path, size); 1748 1749 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1750 refs = btrfs_extent_refs(leaf, ei); 1751 refs += refs_to_add; 1752 btrfs_set_extent_refs(leaf, ei, refs); 1753 if (extent_op) 1754 __run_delayed_extent_op(extent_op, leaf, ei); 1755 1756 ptr = (unsigned long)ei + item_offset; 1757 end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); 1758 if (ptr < end - size) 1759 memmove_extent_buffer(leaf, ptr + size, ptr, 1760 end - size - ptr); 1761 1762 iref = (struct btrfs_extent_inline_ref *)ptr; 1763 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1764 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1765 struct btrfs_extent_data_ref *dref; 1766 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1767 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1768 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1769 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1770 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1771 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1772 struct btrfs_shared_data_ref *sref; 1773 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1774 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1775 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1776 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1777 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1778 } else { 1779 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1780 } 1781 btrfs_mark_buffer_dirty(leaf); 1782 } 1783 1784 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1785 struct btrfs_fs_info *fs_info, 1786 struct btrfs_path *path, 1787 struct btrfs_extent_inline_ref **ref_ret, 1788 u64 bytenr, u64 num_bytes, u64 parent, 1789 u64 root_objectid, u64 owner, u64 offset) 1790 { 1791 int ret; 1792 1793 ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret, 1794 bytenr, num_bytes, parent, 1795 root_objectid, owner, offset, 0); 1796 if (ret != -ENOENT) 1797 return ret; 1798 1799 btrfs_release_path(path); 1800 *ref_ret = NULL; 1801 1802 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1803 ret = lookup_tree_block_ref(trans, fs_info, path, bytenr, 1804 parent, root_objectid); 
1805 } else { 1806 ret = lookup_extent_data_ref(trans, fs_info, path, bytenr, 1807 parent, root_objectid, owner, 1808 offset); 1809 } 1810 return ret; 1811 } 1812 1813 /* 1814 * helper to update/remove inline back ref 1815 */ 1816 static noinline_for_stack 1817 void update_inline_extent_backref(struct btrfs_fs_info *fs_info, 1818 struct btrfs_path *path, 1819 struct btrfs_extent_inline_ref *iref, 1820 int refs_to_mod, 1821 struct btrfs_delayed_extent_op *extent_op, 1822 int *last_ref) 1823 { 1824 struct extent_buffer *leaf; 1825 struct btrfs_extent_item *ei; 1826 struct btrfs_extent_data_ref *dref = NULL; 1827 struct btrfs_shared_data_ref *sref = NULL; 1828 unsigned long ptr; 1829 unsigned long end; 1830 u32 item_size; 1831 int size; 1832 int type; 1833 u64 refs; 1834 1835 leaf = path->nodes[0]; 1836 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1837 refs = btrfs_extent_refs(leaf, ei); 1838 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1839 refs += refs_to_mod; 1840 btrfs_set_extent_refs(leaf, ei, refs); 1841 if (extent_op) 1842 __run_delayed_extent_op(extent_op, leaf, ei); 1843 1844 type = btrfs_extent_inline_ref_type(leaf, iref); 1845 1846 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1847 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1848 refs = btrfs_extent_data_ref_count(leaf, dref); 1849 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1850 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1851 refs = btrfs_shared_data_ref_count(leaf, sref); 1852 } else { 1853 refs = 1; 1854 BUG_ON(refs_to_mod != -1); 1855 } 1856 1857 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1858 refs += refs_to_mod; 1859 1860 if (refs > 0) { 1861 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1862 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1863 else 1864 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1865 } else { 1866 *last_ref = 1; 1867 size = btrfs_extent_inline_ref_size(type); 1868 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1869 ptr = (unsigned long)iref; 1870 end = (unsigned long)ei + item_size; 1871 if (ptr + size < end) 1872 memmove_extent_buffer(leaf, ptr, ptr + size, 1873 end - ptr - size); 1874 item_size -= size; 1875 btrfs_truncate_item(fs_info, path, item_size, 1); 1876 } 1877 btrfs_mark_buffer_dirty(leaf); 1878 } 1879 1880 static noinline_for_stack 1881 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1882 struct btrfs_fs_info *fs_info, 1883 struct btrfs_path *path, 1884 u64 bytenr, u64 num_bytes, u64 parent, 1885 u64 root_objectid, u64 owner, 1886 u64 offset, int refs_to_add, 1887 struct btrfs_delayed_extent_op *extent_op) 1888 { 1889 struct btrfs_extent_inline_ref *iref; 1890 int ret; 1891 1892 ret = lookup_inline_extent_backref(trans, fs_info, path, &iref, 1893 bytenr, num_bytes, parent, 1894 root_objectid, owner, offset, 1); 1895 if (ret == 0) { 1896 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); 1897 update_inline_extent_backref(fs_info, path, iref, 1898 refs_to_add, extent_op, NULL); 1899 } else if (ret == -ENOENT) { 1900 setup_inline_extent_backref(fs_info, path, iref, parent, 1901 root_objectid, owner, offset, 1902 refs_to_add, extent_op); 1903 ret = 0; 1904 } 1905 return ret; 1906 } 1907 1908 static int insert_extent_backref(struct btrfs_trans_handle *trans, 1909 struct btrfs_fs_info *fs_info, 1910 struct btrfs_path *path, 1911 u64 bytenr, u64 parent, u64 root_objectid, 1912 u64 owner, u64 offset, int refs_to_add) 1913 { 1914 int ret; 1915 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1916 BUG_ON(refs_to_add != 1); 1917 ret = 
insert_tree_block_ref(trans, fs_info, path, bytenr, 1918 parent, root_objectid); 1919 } else { 1920 ret = insert_extent_data_ref(trans, fs_info, path, bytenr, 1921 parent, root_objectid, 1922 owner, offset, refs_to_add); 1923 } 1924 return ret; 1925 } 1926 1927 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1928 struct btrfs_fs_info *fs_info, 1929 struct btrfs_path *path, 1930 struct btrfs_extent_inline_ref *iref, 1931 int refs_to_drop, int is_data, int *last_ref) 1932 { 1933 int ret = 0; 1934 1935 BUG_ON(!is_data && refs_to_drop != 1); 1936 if (iref) { 1937 update_inline_extent_backref(fs_info, path, iref, 1938 -refs_to_drop, NULL, last_ref); 1939 } else if (is_data) { 1940 ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop, 1941 last_ref); 1942 } else { 1943 *last_ref = 1; 1944 ret = btrfs_del_item(trans, fs_info->extent_root, path); 1945 } 1946 return ret; 1947 } 1948 1949 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1950 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1951 u64 *discarded_bytes) 1952 { 1953 int j, ret = 0; 1954 u64 bytes_left, end; 1955 u64 aligned_start = ALIGN(start, 1 << 9); 1956 1957 if (WARN_ON(start != aligned_start)) { 1958 len -= aligned_start - start; 1959 len = round_down(len, 1 << 9); 1960 start = aligned_start; 1961 } 1962 1963 *discarded_bytes = 0; 1964 1965 if (!len) 1966 return 0; 1967 1968 end = start + len; 1969 bytes_left = len; 1970 1971 /* Skip any superblocks on this device. */ 1972 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1973 u64 sb_start = btrfs_sb_offset(j); 1974 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1975 u64 size = sb_start - start; 1976 1977 if (!in_range(sb_start, start, bytes_left) && 1978 !in_range(sb_end, start, bytes_left) && 1979 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1980 continue; 1981 1982 /* 1983 * Superblock spans beginning of range. Adjust start and 1984 * try again. 1985 */ 1986 if (sb_start <= start) { 1987 start += sb_end - start; 1988 if (start > end) { 1989 bytes_left = 0; 1990 break; 1991 } 1992 bytes_left = end - start; 1993 continue; 1994 } 1995 1996 if (size) { 1997 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 1998 GFP_NOFS, 0); 1999 if (!ret) 2000 *discarded_bytes += size; 2001 else if (ret != -EOPNOTSUPP) 2002 return ret; 2003 } 2004 2005 start = sb_end; 2006 if (start > end) { 2007 bytes_left = 0; 2008 break; 2009 } 2010 bytes_left = end - start; 2011 } 2012 2013 if (bytes_left) { 2014 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 2015 GFP_NOFS, 0); 2016 if (!ret) 2017 *discarded_bytes += bytes_left; 2018 } 2019 return ret; 2020 } 2021 2022 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 2023 u64 num_bytes, u64 *actual_bytes) 2024 { 2025 int ret; 2026 u64 discarded_bytes = 0; 2027 struct btrfs_bio *bbio = NULL; 2028 2029 2030 /* 2031 * Avoid races with device replace and make sure our bbio has devices 2032 * associated to its stripes that don't go away while we are discarding. 
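 *
 * Roughly what happens below: the logical range is mapped to its physical
 * stripes and a discard is issued per stripe, so for mirrored profiles
 * (e.g. RAID1/DUP) *actual_bytes can legitimately end up larger than
 * num_bytes. Stripes on devices without discard support are skipped, and a
 * stray -EOPNOTSUPP from the block layer is simply ignored.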
2033 */ 2034 btrfs_bio_counter_inc_blocked(fs_info); 2035 /* Tell the block device(s) that the sectors can be discarded */ 2036 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 2037 &bbio, 0); 2038 /* Error condition is -ENOMEM */ 2039 if (!ret) { 2040 struct btrfs_bio_stripe *stripe = bbio->stripes; 2041 int i; 2042 2043 2044 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2045 u64 bytes; 2046 if (!stripe->dev->can_discard) 2047 continue; 2048 2049 ret = btrfs_issue_discard(stripe->dev->bdev, 2050 stripe->physical, 2051 stripe->length, 2052 &bytes); 2053 if (!ret) 2054 discarded_bytes += bytes; 2055 else if (ret != -EOPNOTSUPP) 2056 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 2057 2058 /* 2059 * Just in case we get back EOPNOTSUPP for some reason, 2060 * just ignore the return value so we don't screw up 2061 * people calling discard_extent. 2062 */ 2063 ret = 0; 2064 } 2065 btrfs_put_bbio(bbio); 2066 } 2067 btrfs_bio_counter_dec(fs_info); 2068 2069 if (actual_bytes) 2070 *actual_bytes = discarded_bytes; 2071 2072 2073 if (ret == -EOPNOTSUPP) 2074 ret = 0; 2075 return ret; 2076 } 2077 2078 /* Can return -ENOMEM */ 2079 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2080 struct btrfs_fs_info *fs_info, 2081 u64 bytenr, u64 num_bytes, u64 parent, 2082 u64 root_objectid, u64 owner, u64 offset) 2083 { 2084 int ret; 2085 2086 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2087 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2088 2089 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2090 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2091 num_bytes, 2092 parent, root_objectid, (int)owner, 2093 BTRFS_ADD_DELAYED_REF, NULL); 2094 } else { 2095 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 2096 num_bytes, parent, root_objectid, 2097 owner, offset, 0, 2098 BTRFS_ADD_DELAYED_REF); 2099 } 2100 return ret; 2101 } 2102 2103 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2104 struct btrfs_fs_info *fs_info, 2105 struct btrfs_delayed_ref_node *node, 2106 u64 parent, u64 root_objectid, 2107 u64 owner, u64 offset, int refs_to_add, 2108 struct btrfs_delayed_extent_op *extent_op) 2109 { 2110 struct btrfs_path *path; 2111 struct extent_buffer *leaf; 2112 struct btrfs_extent_item *item; 2113 struct btrfs_key key; 2114 u64 bytenr = node->bytenr; 2115 u64 num_bytes = node->num_bytes; 2116 u64 refs; 2117 int ret; 2118 2119 path = btrfs_alloc_path(); 2120 if (!path) 2121 return -ENOMEM; 2122 2123 path->reada = READA_FORWARD; 2124 path->leave_spinning = 1; 2125 /* this will set up the path even if it fails to insert the back ref */ 2126 ret = insert_inline_extent_backref(trans, fs_info, path, bytenr, 2127 num_bytes, parent, root_objectid, 2128 owner, offset, 2129 refs_to_add, extent_op); 2130 if ((ret < 0 && ret != -EAGAIN) || !ret) 2131 goto out; 2132 2133 /* 2134 * Ok we had -EAGAIN which means we didn't have space to insert an 2135 * inline extent ref, so just update the reference count and add a 2136 * normal backref.
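 *
 * In other words, the extent item's refcount is bumped in place here and a
 * separate keyed backref item is then inserted via insert_extent_backref();
 * for a data extent that is roughly an item keyed
 * (bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash of root/objectid/offset)
 * instead of an inline ref squeezed into the extent item itself.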
2137 */ 2138 leaf = path->nodes[0]; 2139 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2140 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2141 refs = btrfs_extent_refs(leaf, item); 2142 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2143 if (extent_op) 2144 __run_delayed_extent_op(extent_op, leaf, item); 2145 2146 btrfs_mark_buffer_dirty(leaf); 2147 btrfs_release_path(path); 2148 2149 path->reada = READA_FORWARD; 2150 path->leave_spinning = 1; 2151 /* now insert the actual backref */ 2152 ret = insert_extent_backref(trans, fs_info, path, bytenr, parent, 2153 root_objectid, owner, offset, refs_to_add); 2154 if (ret) 2155 btrfs_abort_transaction(trans, ret); 2156 out: 2157 btrfs_free_path(path); 2158 return ret; 2159 } 2160 2161 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2162 struct btrfs_fs_info *fs_info, 2163 struct btrfs_delayed_ref_node *node, 2164 struct btrfs_delayed_extent_op *extent_op, 2165 int insert_reserved) 2166 { 2167 int ret = 0; 2168 struct btrfs_delayed_data_ref *ref; 2169 struct btrfs_key ins; 2170 u64 parent = 0; 2171 u64 ref_root = 0; 2172 u64 flags = 0; 2173 2174 ins.objectid = node->bytenr; 2175 ins.offset = node->num_bytes; 2176 ins.type = BTRFS_EXTENT_ITEM_KEY; 2177 2178 ref = btrfs_delayed_node_to_data_ref(node); 2179 trace_run_delayed_data_ref(fs_info, node, ref, node->action); 2180 2181 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2182 parent = ref->parent; 2183 ref_root = ref->root; 2184 2185 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2186 if (extent_op) 2187 flags |= extent_op->flags_to_set; 2188 ret = alloc_reserved_file_extent(trans, fs_info, 2189 parent, ref_root, flags, 2190 ref->objectid, ref->offset, 2191 &ins, node->ref_mod); 2192 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2193 ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, 2194 ref_root, ref->objectid, 2195 ref->offset, node->ref_mod, 2196 extent_op); 2197 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2198 ret = __btrfs_free_extent(trans, fs_info, node, parent, 2199 ref_root, ref->objectid, 2200 ref->offset, node->ref_mod, 2201 extent_op); 2202 } else { 2203 BUG(); 2204 } 2205 return ret; 2206 } 2207 2208 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2209 struct extent_buffer *leaf, 2210 struct btrfs_extent_item *ei) 2211 { 2212 u64 flags = btrfs_extent_flags(leaf, ei); 2213 if (extent_op->update_flags) { 2214 flags |= extent_op->flags_to_set; 2215 btrfs_set_extent_flags(leaf, ei, flags); 2216 } 2217 2218 if (extent_op->update_key) { 2219 struct btrfs_tree_block_info *bi; 2220 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2221 bi = (struct btrfs_tree_block_info *)(ei + 1); 2222 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2223 } 2224 } 2225 2226 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2227 struct btrfs_fs_info *fs_info, 2228 struct btrfs_delayed_ref_node *node, 2229 struct btrfs_delayed_extent_op *extent_op) 2230 { 2231 struct btrfs_key key; 2232 struct btrfs_path *path; 2233 struct btrfs_extent_item *ei; 2234 struct extent_buffer *leaf; 2235 u32 item_size; 2236 int ret; 2237 int err = 0; 2238 int metadata = !extent_op->is_data; 2239 2240 if (trans->aborted) 2241 return 0; 2242 2243 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2244 metadata = 0; 2245 2246 path = btrfs_alloc_path(); 2247 if (!path) 2248 return -ENOMEM; 2249 2250 key.objectid = node->bytenr; 2251 2252 if (metadata) { 2253 key.type = 
BTRFS_METADATA_ITEM_KEY; 2254 key.offset = extent_op->level; 2255 } else { 2256 key.type = BTRFS_EXTENT_ITEM_KEY; 2257 key.offset = node->num_bytes; 2258 } 2259 2260 again: 2261 path->reada = READA_FORWARD; 2262 path->leave_spinning = 1; 2263 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2264 if (ret < 0) { 2265 err = ret; 2266 goto out; 2267 } 2268 if (ret > 0) { 2269 if (metadata) { 2270 if (path->slots[0] > 0) { 2271 path->slots[0]--; 2272 btrfs_item_key_to_cpu(path->nodes[0], &key, 2273 path->slots[0]); 2274 if (key.objectid == node->bytenr && 2275 key.type == BTRFS_EXTENT_ITEM_KEY && 2276 key.offset == node->num_bytes) 2277 ret = 0; 2278 } 2279 if (ret > 0) { 2280 btrfs_release_path(path); 2281 metadata = 0; 2282 2283 key.objectid = node->bytenr; 2284 key.offset = node->num_bytes; 2285 key.type = BTRFS_EXTENT_ITEM_KEY; 2286 goto again; 2287 } 2288 } else { 2289 err = -EIO; 2290 goto out; 2291 } 2292 } 2293 2294 leaf = path->nodes[0]; 2295 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2296 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2297 if (item_size < sizeof(*ei)) { 2298 ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0); 2299 if (ret < 0) { 2300 err = ret; 2301 goto out; 2302 } 2303 leaf = path->nodes[0]; 2304 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2305 } 2306 #endif 2307 BUG_ON(item_size < sizeof(*ei)); 2308 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2309 __run_delayed_extent_op(extent_op, leaf, ei); 2310 2311 btrfs_mark_buffer_dirty(leaf); 2312 out: 2313 btrfs_free_path(path); 2314 return err; 2315 } 2316 2317 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2318 struct btrfs_fs_info *fs_info, 2319 struct btrfs_delayed_ref_node *node, 2320 struct btrfs_delayed_extent_op *extent_op, 2321 int insert_reserved) 2322 { 2323 int ret = 0; 2324 struct btrfs_delayed_tree_ref *ref; 2325 struct btrfs_key ins; 2326 u64 parent = 0; 2327 u64 ref_root = 0; 2328 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 2329 2330 ref = btrfs_delayed_node_to_tree_ref(node); 2331 trace_run_delayed_tree_ref(fs_info, node, ref, node->action); 2332 2333 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2334 parent = ref->parent; 2335 ref_root = ref->root; 2336 2337 ins.objectid = node->bytenr; 2338 if (skinny_metadata) { 2339 ins.offset = ref->level; 2340 ins.type = BTRFS_METADATA_ITEM_KEY; 2341 } else { 2342 ins.offset = node->num_bytes; 2343 ins.type = BTRFS_EXTENT_ITEM_KEY; 2344 } 2345 2346 if (node->ref_mod != 1) { 2347 btrfs_err(fs_info, 2348 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", 2349 node->bytenr, node->ref_mod, node->action, ref_root, 2350 parent); 2351 return -EIO; 2352 } 2353 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2354 BUG_ON(!extent_op || !extent_op->update_flags); 2355 ret = alloc_reserved_tree_block(trans, fs_info, 2356 parent, ref_root, 2357 extent_op->flags_to_set, 2358 &extent_op->key, 2359 ref->level, &ins); 2360 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2361 ret = __btrfs_inc_extent_ref(trans, fs_info, node, 2362 parent, ref_root, 2363 ref->level, 0, 1, 2364 extent_op); 2365 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2366 ret = __btrfs_free_extent(trans, fs_info, node, 2367 parent, ref_root, 2368 ref->level, 0, 1, extent_op); 2369 } else { 2370 BUG(); 2371 } 2372 return ret; 2373 } 2374 2375 /* helper function to actually process a single delayed ref entry */ 2376 static int 
run_one_delayed_ref(struct btrfs_trans_handle *trans, 2377 struct btrfs_fs_info *fs_info, 2378 struct btrfs_delayed_ref_node *node, 2379 struct btrfs_delayed_extent_op *extent_op, 2380 int insert_reserved) 2381 { 2382 int ret = 0; 2383 2384 if (trans->aborted) { 2385 if (insert_reserved) 2386 btrfs_pin_extent(fs_info, node->bytenr, 2387 node->num_bytes, 1); 2388 return 0; 2389 } 2390 2391 if (btrfs_delayed_ref_is_head(node)) { 2392 struct btrfs_delayed_ref_head *head; 2393 /* 2394 * we've hit the end of the chain and we were supposed 2395 * to insert this extent into the tree. But, it got 2396 * deleted before we ever needed to insert it, so all 2397 * we have to do is clean up the accounting 2398 */ 2399 BUG_ON(extent_op); 2400 head = btrfs_delayed_node_to_head(node); 2401 trace_run_delayed_ref_head(fs_info, node, head, node->action); 2402 2403 if (insert_reserved) { 2404 btrfs_pin_extent(fs_info, node->bytenr, 2405 node->num_bytes, 1); 2406 if (head->is_data) { 2407 ret = btrfs_del_csums(trans, fs_info, 2408 node->bytenr, 2409 node->num_bytes); 2410 } 2411 } 2412 2413 /* Also free its reserved qgroup space */ 2414 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2415 head->qgroup_reserved); 2416 return ret; 2417 } 2418 2419 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2420 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2421 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, 2422 insert_reserved); 2423 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2424 node->type == BTRFS_SHARED_DATA_REF_KEY) 2425 ret = run_delayed_data_ref(trans, fs_info, node, extent_op, 2426 insert_reserved); 2427 else 2428 BUG(); 2429 return ret; 2430 } 2431 2432 static inline struct btrfs_delayed_ref_node * 2433 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2434 { 2435 struct btrfs_delayed_ref_node *ref; 2436 2437 if (list_empty(&head->ref_list)) 2438 return NULL; 2439 2440 /* 2441 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2442 * This is to prevent a ref count from going down to zero, which deletes 2443 * the extent item from the extent tree, when there still are references 2444 * to add, which would fail because they would not find the extent item. 2445 */ 2446 if (!list_empty(&head->ref_add_list)) 2447 return list_first_entry(&head->ref_add_list, 2448 struct btrfs_delayed_ref_node, add_list); 2449 2450 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 2451 list); 2452 ASSERT(list_empty(&ref->add_list)); 2453 return ref; 2454 } 2455 2456 /* 2457 * Returns 0 on success or if called with an already aborted transaction. 2458 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
2459 */ 2460 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2461 struct btrfs_fs_info *fs_info, 2462 unsigned long nr) 2463 { 2464 struct btrfs_delayed_ref_root *delayed_refs; 2465 struct btrfs_delayed_ref_node *ref; 2466 struct btrfs_delayed_ref_head *locked_ref = NULL; 2467 struct btrfs_delayed_extent_op *extent_op; 2468 ktime_t start = ktime_get(); 2469 int ret; 2470 unsigned long count = 0; 2471 unsigned long actual_count = 0; 2472 int must_insert_reserved = 0; 2473 2474 delayed_refs = &trans->transaction->delayed_refs; 2475 while (1) { 2476 if (!locked_ref) { 2477 if (count >= nr) 2478 break; 2479 2480 spin_lock(&delayed_refs->lock); 2481 locked_ref = btrfs_select_ref_head(trans); 2482 if (!locked_ref) { 2483 spin_unlock(&delayed_refs->lock); 2484 break; 2485 } 2486 2487 /* grab the lock that says we are going to process 2488 * all the refs for this head */ 2489 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2490 spin_unlock(&delayed_refs->lock); 2491 /* 2492 * we may have dropped the spin lock to get the head 2493 * mutex lock, and that might have given someone else 2494 * time to free the head. If that's true, it has been 2495 * removed from our list and we can move on. 2496 */ 2497 if (ret == -EAGAIN) { 2498 locked_ref = NULL; 2499 count++; 2500 continue; 2501 } 2502 } 2503 2504 /* 2505 * We need to try and merge add/drops of the same ref since we 2506 * can run into issues with relocate dropping the implicit ref 2507 * and then it being added back again before the drop can 2508 * finish. If we merged anything we need to re-loop so we can 2509 * get a good ref. 2510 * Or we can get node references of the same type that weren't 2511 * merged when created due to bumps in the tree mod seq, and 2512 * we need to merge them to prevent adding an inline extent 2513 * backref before dropping it (triggering a BUG_ON at 2514 * insert_inline_extent_backref()). 2515 */ 2516 spin_lock(&locked_ref->lock); 2517 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2518 locked_ref); 2519 2520 /* 2521 * locked_ref is the head node, so we have to go one 2522 * node back for any delayed ref updates 2523 */ 2524 ref = select_delayed_ref(locked_ref); 2525 2526 if (ref && ref->seq && 2527 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2528 spin_unlock(&locked_ref->lock); 2529 spin_lock(&delayed_refs->lock); 2530 locked_ref->processing = 0; 2531 delayed_refs->num_heads_ready++; 2532 spin_unlock(&delayed_refs->lock); 2533 btrfs_delayed_ref_unlock(locked_ref); 2534 locked_ref = NULL; 2535 cond_resched(); 2536 count++; 2537 continue; 2538 } 2539 2540 /* 2541 * record the must insert reserved flag before we 2542 * drop the spin lock. 
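 *
 * (Roughly, must_insert_reserved means this extent was just allocated, so
 * either an extent item still has to be inserted for it or, if the
 * allocation ended up unused, the space gets pinned so it can be freed at
 * commit; run_one_delayed_ref() handles both cases.)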
2543 */ 2544 must_insert_reserved = locked_ref->must_insert_reserved; 2545 locked_ref->must_insert_reserved = 0; 2546 2547 extent_op = locked_ref->extent_op; 2548 locked_ref->extent_op = NULL; 2549 2550 if (!ref) { 2551 2552 2553 /* All delayed refs have been processed, Go ahead 2554 * and send the head node to run_one_delayed_ref, 2555 * so that any accounting fixes can happen 2556 */ 2557 ref = &locked_ref->node; 2558 2559 if (extent_op && must_insert_reserved) { 2560 btrfs_free_delayed_extent_op(extent_op); 2561 extent_op = NULL; 2562 } 2563 2564 if (extent_op) { 2565 spin_unlock(&locked_ref->lock); 2566 ret = run_delayed_extent_op(trans, fs_info, 2567 ref, extent_op); 2568 btrfs_free_delayed_extent_op(extent_op); 2569 2570 if (ret) { 2571 /* 2572 * Need to reset must_insert_reserved if 2573 * there was an error so the abort stuff 2574 * can cleanup the reserved space 2575 * properly. 2576 */ 2577 if (must_insert_reserved) 2578 locked_ref->must_insert_reserved = 1; 2579 spin_lock(&delayed_refs->lock); 2580 locked_ref->processing = 0; 2581 delayed_refs->num_heads_ready++; 2582 spin_unlock(&delayed_refs->lock); 2583 btrfs_debug(fs_info, 2584 "run_delayed_extent_op returned %d", 2585 ret); 2586 btrfs_delayed_ref_unlock(locked_ref); 2587 return ret; 2588 } 2589 continue; 2590 } 2591 2592 /* 2593 * Need to drop our head ref lock and re-acquire the 2594 * delayed ref lock and then re-check to make sure 2595 * nobody got added. 2596 */ 2597 spin_unlock(&locked_ref->lock); 2598 spin_lock(&delayed_refs->lock); 2599 spin_lock(&locked_ref->lock); 2600 if (!list_empty(&locked_ref->ref_list) || 2601 locked_ref->extent_op) { 2602 spin_unlock(&locked_ref->lock); 2603 spin_unlock(&delayed_refs->lock); 2604 continue; 2605 } 2606 ref->in_tree = 0; 2607 delayed_refs->num_heads--; 2608 rb_erase(&locked_ref->href_node, 2609 &delayed_refs->href_root); 2610 spin_unlock(&delayed_refs->lock); 2611 } else { 2612 actual_count++; 2613 ref->in_tree = 0; 2614 list_del(&ref->list); 2615 if (!list_empty(&ref->add_list)) 2616 list_del(&ref->add_list); 2617 } 2618 atomic_dec(&delayed_refs->num_entries); 2619 2620 if (!btrfs_delayed_ref_is_head(ref)) { 2621 /* 2622 * when we play the delayed ref, also correct the 2623 * ref_mod on head 2624 */ 2625 switch (ref->action) { 2626 case BTRFS_ADD_DELAYED_REF: 2627 case BTRFS_ADD_DELAYED_EXTENT: 2628 locked_ref->node.ref_mod -= ref->ref_mod; 2629 break; 2630 case BTRFS_DROP_DELAYED_REF: 2631 locked_ref->node.ref_mod += ref->ref_mod; 2632 break; 2633 default: 2634 WARN_ON(1); 2635 } 2636 } 2637 spin_unlock(&locked_ref->lock); 2638 2639 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, 2640 must_insert_reserved); 2641 2642 btrfs_free_delayed_extent_op(extent_op); 2643 if (ret) { 2644 spin_lock(&delayed_refs->lock); 2645 locked_ref->processing = 0; 2646 delayed_refs->num_heads_ready++; 2647 spin_unlock(&delayed_refs->lock); 2648 btrfs_delayed_ref_unlock(locked_ref); 2649 btrfs_put_delayed_ref(ref); 2650 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2651 ret); 2652 return ret; 2653 } 2654 2655 /* 2656 * If this node is a head, that means all the refs in this head 2657 * have been dealt with, and we will pick the next head to deal 2658 * with, so we must unlock the head and drop it from the cluster 2659 * list before we release it. 
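 *
 * For data heads that ended up with a negative total_ref_mod the extent is
 * going away, so the bytes it contributed to pending_csums are dropped
 * below as well.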
2660 */ 2661 if (btrfs_delayed_ref_is_head(ref)) { 2662 if (locked_ref->is_data && 2663 locked_ref->total_ref_mod < 0) { 2664 spin_lock(&delayed_refs->lock); 2665 delayed_refs->pending_csums -= ref->num_bytes; 2666 spin_unlock(&delayed_refs->lock); 2667 } 2668 btrfs_delayed_ref_unlock(locked_ref); 2669 locked_ref = NULL; 2670 } 2671 btrfs_put_delayed_ref(ref); 2672 count++; 2673 cond_resched(); 2674 } 2675 2676 /* 2677 * We don't want to include ref heads since we can have empty ref heads 2678 * and those will drastically skew our runtime down since we just do 2679 * accounting, no actual extent tree updates. 2680 */ 2681 if (actual_count > 0) { 2682 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2683 u64 avg; 2684 2685 /* 2686 * We weigh the current average higher than our current runtime 2687 * to avoid large swings in the average. 2688 */ 2689 spin_lock(&delayed_refs->lock); 2690 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2691 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2692 spin_unlock(&delayed_refs->lock); 2693 } 2694 return 0; 2695 } 2696 2697 #ifdef SCRAMBLE_DELAYED_REFS 2698 /* 2699 * Normally delayed refs get processed in ascending bytenr order. This 2700 * correlates in most cases to the order added. To expose dependencies on this 2701 * order, we start to process the tree in the middle instead of the beginning 2702 */ 2703 static u64 find_middle(struct rb_root *root) 2704 { 2705 struct rb_node *n = root->rb_node; 2706 struct btrfs_delayed_ref_node *entry; 2707 int alt = 1; 2708 u64 middle; 2709 u64 first = 0, last = 0; 2710 2711 n = rb_first(root); 2712 if (n) { 2713 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2714 first = entry->bytenr; 2715 } 2716 n = rb_last(root); 2717 if (n) { 2718 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2719 last = entry->bytenr; 2720 } 2721 n = root->rb_node; 2722 2723 while (n) { 2724 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2725 WARN_ON(!entry->in_tree); 2726 2727 middle = entry->bytenr; 2728 2729 if (alt) 2730 n = n->rb_left; 2731 else 2732 n = n->rb_right; 2733 2734 alt = 1 - alt; 2735 } 2736 return middle; 2737 } 2738 #endif 2739 2740 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2741 { 2742 u64 num_bytes; 2743 2744 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2745 sizeof(struct btrfs_extent_inline_ref)); 2746 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2747 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2748 2749 /* 2750 * We don't ever fill up leaves all the way so multiply by 2 just to be 2751 * closer to what we're really going to want to use. 2752 */ 2753 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2754 } 2755 2756 /* 2757 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2758 * would require to store the csums for that many bytes. 
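 *
 * As a rough example (assuming the defaults of a 4KiB sectorsize, a 16KiB
 * nodesize and 4-byte crc32c checksums): one leaf holds on the order of
 * 4000 csum entries, so checksumming 1GiB of data (~262144 blocks) needs
 * roughly 65 leaves.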
2759 */ 2760 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2761 { 2762 u64 csum_size; 2763 u64 num_csums_per_leaf; 2764 u64 num_csums; 2765 2766 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2767 num_csums_per_leaf = div64_u64(csum_size, 2768 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2769 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2770 num_csums += num_csums_per_leaf - 1; 2771 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2772 return num_csums; 2773 } 2774 2775 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2776 struct btrfs_fs_info *fs_info) 2777 { 2778 struct btrfs_block_rsv *global_rsv; 2779 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2780 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; 2781 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; 2782 u64 num_bytes, num_dirty_bgs_bytes; 2783 int ret = 0; 2784 2785 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 2786 num_heads = heads_to_leaves(fs_info, num_heads); 2787 if (num_heads > 1) 2788 num_bytes += (num_heads - 1) * fs_info->nodesize; 2789 num_bytes <<= 1; 2790 num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * 2791 fs_info->nodesize; 2792 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, 2793 num_dirty_bgs); 2794 global_rsv = &fs_info->global_block_rsv; 2795 2796 /* 2797 * If we can't allocate any more chunks lets make sure we have _lots_ of 2798 * wiggle room since running delayed refs can create more delayed refs. 2799 */ 2800 if (global_rsv->space_info->full) { 2801 num_dirty_bgs_bytes <<= 1; 2802 num_bytes <<= 1; 2803 } 2804 2805 spin_lock(&global_rsv->lock); 2806 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) 2807 ret = 1; 2808 spin_unlock(&global_rsv->lock); 2809 return ret; 2810 } 2811 2812 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2813 struct btrfs_fs_info *fs_info) 2814 { 2815 u64 num_entries = 2816 atomic_read(&trans->transaction->delayed_refs.num_entries); 2817 u64 avg_runtime; 2818 u64 val; 2819 2820 smp_mb(); 2821 avg_runtime = fs_info->avg_delayed_ref_runtime; 2822 val = num_entries * avg_runtime; 2823 if (val >= NSEC_PER_SEC) 2824 return 1; 2825 if (val >= NSEC_PER_SEC / 2) 2826 return 2; 2827 2828 return btrfs_check_space_for_delayed_refs(trans, fs_info); 2829 } 2830 2831 struct async_delayed_refs { 2832 struct btrfs_root *root; 2833 u64 transid; 2834 int count; 2835 int error; 2836 int sync; 2837 struct completion wait; 2838 struct btrfs_work work; 2839 }; 2840 2841 static inline struct async_delayed_refs * 2842 to_async_delayed_refs(struct btrfs_work *work) 2843 { 2844 return container_of(work, struct async_delayed_refs, work); 2845 } 2846 2847 static void delayed_ref_async_start(struct btrfs_work *work) 2848 { 2849 struct async_delayed_refs *async = to_async_delayed_refs(work); 2850 struct btrfs_trans_handle *trans; 2851 struct btrfs_fs_info *fs_info = async->root->fs_info; 2852 int ret; 2853 2854 /* if the commit is already started, we don't need to wait here */ 2855 if (btrfs_transaction_blocked(fs_info)) 2856 goto done; 2857 2858 trans = btrfs_join_transaction(async->root); 2859 if (IS_ERR(trans)) { 2860 async->error = PTR_ERR(trans); 2861 goto done; 2862 } 2863 2864 /* 2865 * trans->sync means that when we call end_transaction, we won't 2866 * wait on delayed refs 2867 */ 2868 trans->sync = true; 2869 2870 /* Don't bother flushing if we got into a different transaction */ 2871 if (trans->transid > 
async->transid) 2872 goto end; 2873 2874 ret = btrfs_run_delayed_refs(trans, fs_info, async->count); 2875 if (ret) 2876 async->error = ret; 2877 end: 2878 ret = btrfs_end_transaction(trans); 2879 if (ret && !async->error) 2880 async->error = ret; 2881 done: 2882 if (async->sync) 2883 complete(&async->wait); 2884 else 2885 kfree(async); 2886 } 2887 2888 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, 2889 unsigned long count, u64 transid, int wait) 2890 { 2891 struct async_delayed_refs *async; 2892 int ret; 2893 2894 async = kmalloc(sizeof(*async), GFP_NOFS); 2895 if (!async) 2896 return -ENOMEM; 2897 2898 async->root = fs_info->tree_root; 2899 async->count = count; 2900 async->error = 0; 2901 async->transid = transid; 2902 if (wait) 2903 async->sync = 1; 2904 else 2905 async->sync = 0; 2906 init_completion(&async->wait); 2907 2908 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2909 delayed_ref_async_start, NULL, NULL); 2910 2911 btrfs_queue_work(fs_info->extent_workers, &async->work); 2912 2913 if (wait) { 2914 wait_for_completion(&async->wait); 2915 ret = async->error; 2916 kfree(async); 2917 return ret; 2918 } 2919 return 0; 2920 } 2921 2922 /* 2923 * this starts processing the delayed reference count updates and 2924 * extent insertions we have queued up so far. count can be 2925 * 0, which means to process everything in the tree at the start 2926 * of the run (but not newly added entries), or it can be some target 2927 * number you'd like to process. 2928 * 2929 * Returns 0 on success or if called with an aborted transaction 2930 * Returns <0 on error and aborts the transaction 2931 */ 2932 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2933 struct btrfs_fs_info *fs_info, unsigned long count) 2934 { 2935 struct rb_node *node; 2936 struct btrfs_delayed_ref_root *delayed_refs; 2937 struct btrfs_delayed_ref_head *head; 2938 int ret; 2939 int run_all = count == (unsigned long)-1; 2940 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 2941 2942 /* We'll clean this up in btrfs_cleanup_transaction */ 2943 if (trans->aborted) 2944 return 0; 2945 2946 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2947 return 0; 2948 2949 delayed_refs = &trans->transaction->delayed_refs; 2950 if (count == 0) 2951 count = atomic_read(&delayed_refs->num_entries) * 2; 2952 2953 again: 2954 #ifdef SCRAMBLE_DELAYED_REFS 2955 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2956 #endif 2957 trans->can_flush_pending_bgs = false; 2958 ret = __btrfs_run_delayed_refs(trans, fs_info, count); 2959 if (ret < 0) { 2960 btrfs_abort_transaction(trans, ret); 2961 return ret; 2962 } 2963 2964 if (run_all) { 2965 if (!list_empty(&trans->new_bgs)) 2966 btrfs_create_pending_block_groups(trans, fs_info); 2967 2968 spin_lock(&delayed_refs->lock); 2969 node = rb_first(&delayed_refs->href_root); 2970 if (!node) { 2971 spin_unlock(&delayed_refs->lock); 2972 goto out; 2973 } 2974 2975 while (node) { 2976 head = rb_entry(node, struct btrfs_delayed_ref_head, 2977 href_node); 2978 if (btrfs_delayed_ref_is_head(&head->node)) { 2979 struct btrfs_delayed_ref_node *ref; 2980 2981 ref = &head->node; 2982 atomic_inc(&ref->refs); 2983 2984 spin_unlock(&delayed_refs->lock); 2985 /* 2986 * Mutex was contended, block until it's 2987 * released and try again 2988 */ 2989 mutex_lock(&head->mutex); 2990 mutex_unlock(&head->mutex); 2991 2992 btrfs_put_delayed_ref(ref); 2993 cond_resched(); 2994 goto again; 2995 } else { 2996 WARN_ON(1); 2997 } 2998 node = rb_next(node); 
2999 } 3000 spin_unlock(&delayed_refs->lock); 3001 cond_resched(); 3002 goto again; 3003 } 3004 out: 3005 assert_qgroups_uptodate(trans); 3006 trans->can_flush_pending_bgs = can_flush_pending_bgs; 3007 return 0; 3008 } 3009 3010 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3011 struct btrfs_fs_info *fs_info, 3012 u64 bytenr, u64 num_bytes, u64 flags, 3013 int level, int is_data) 3014 { 3015 struct btrfs_delayed_extent_op *extent_op; 3016 int ret; 3017 3018 extent_op = btrfs_alloc_delayed_extent_op(); 3019 if (!extent_op) 3020 return -ENOMEM; 3021 3022 extent_op->flags_to_set = flags; 3023 extent_op->update_flags = true; 3024 extent_op->update_key = false; 3025 extent_op->is_data = is_data ? true : false; 3026 extent_op->level = level; 3027 3028 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, 3029 num_bytes, extent_op); 3030 if (ret) 3031 btrfs_free_delayed_extent_op(extent_op); 3032 return ret; 3033 } 3034 3035 static noinline int check_delayed_ref(struct btrfs_root *root, 3036 struct btrfs_path *path, 3037 u64 objectid, u64 offset, u64 bytenr) 3038 { 3039 struct btrfs_delayed_ref_head *head; 3040 struct btrfs_delayed_ref_node *ref; 3041 struct btrfs_delayed_data_ref *data_ref; 3042 struct btrfs_delayed_ref_root *delayed_refs; 3043 struct btrfs_transaction *cur_trans; 3044 int ret = 0; 3045 3046 cur_trans = root->fs_info->running_transaction; 3047 if (!cur_trans) 3048 return 0; 3049 3050 delayed_refs = &cur_trans->delayed_refs; 3051 spin_lock(&delayed_refs->lock); 3052 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3053 if (!head) { 3054 spin_unlock(&delayed_refs->lock); 3055 return 0; 3056 } 3057 3058 if (!mutex_trylock(&head->mutex)) { 3059 atomic_inc(&head->node.refs); 3060 spin_unlock(&delayed_refs->lock); 3061 3062 btrfs_release_path(path); 3063 3064 /* 3065 * Mutex was contended, block until it's released and let 3066 * caller try again 3067 */ 3068 mutex_lock(&head->mutex); 3069 mutex_unlock(&head->mutex); 3070 btrfs_put_delayed_ref(&head->node); 3071 return -EAGAIN; 3072 } 3073 spin_unlock(&delayed_refs->lock); 3074 3075 spin_lock(&head->lock); 3076 list_for_each_entry(ref, &head->ref_list, list) { 3077 /* If it's a shared ref we know a cross reference exists */ 3078 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3079 ret = 1; 3080 break; 3081 } 3082 3083 data_ref = btrfs_delayed_node_to_data_ref(ref); 3084 3085 /* 3086 * If our ref doesn't match the one we're currently looking at 3087 * then we have a cross reference. 
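 *
 * A mismatch here typically means another root (e.g. a snapshot) or another
 * inode/offset holds a reference to the same bytes, so callers such as the
 * nocow path must assume the extent is shared and cannot be overwritten in
 * place.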
3088 */ 3089 if (data_ref->root != root->root_key.objectid || 3090 data_ref->objectid != objectid || 3091 data_ref->offset != offset) { 3092 ret = 1; 3093 break; 3094 } 3095 } 3096 spin_unlock(&head->lock); 3097 mutex_unlock(&head->mutex); 3098 return ret; 3099 } 3100 3101 static noinline int check_committed_ref(struct btrfs_root *root, 3102 struct btrfs_path *path, 3103 u64 objectid, u64 offset, u64 bytenr) 3104 { 3105 struct btrfs_fs_info *fs_info = root->fs_info; 3106 struct btrfs_root *extent_root = fs_info->extent_root; 3107 struct extent_buffer *leaf; 3108 struct btrfs_extent_data_ref *ref; 3109 struct btrfs_extent_inline_ref *iref; 3110 struct btrfs_extent_item *ei; 3111 struct btrfs_key key; 3112 u32 item_size; 3113 int ret; 3114 3115 key.objectid = bytenr; 3116 key.offset = (u64)-1; 3117 key.type = BTRFS_EXTENT_ITEM_KEY; 3118 3119 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3120 if (ret < 0) 3121 goto out; 3122 BUG_ON(ret == 0); /* Corruption */ 3123 3124 ret = -ENOENT; 3125 if (path->slots[0] == 0) 3126 goto out; 3127 3128 path->slots[0]--; 3129 leaf = path->nodes[0]; 3130 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3131 3132 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3133 goto out; 3134 3135 ret = 1; 3136 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3137 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3138 if (item_size < sizeof(*ei)) { 3139 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3140 goto out; 3141 } 3142 #endif 3143 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3144 3145 if (item_size != sizeof(*ei) + 3146 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3147 goto out; 3148 3149 if (btrfs_extent_generation(leaf, ei) <= 3150 btrfs_root_last_snapshot(&root->root_item)) 3151 goto out; 3152 3153 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3154 if (btrfs_extent_inline_ref_type(leaf, iref) != 3155 BTRFS_EXTENT_DATA_REF_KEY) 3156 goto out; 3157 3158 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3159 if (btrfs_extent_refs(leaf, ei) != 3160 btrfs_extent_data_ref_count(leaf, ref) || 3161 btrfs_extent_data_ref_root(leaf, ref) != 3162 root->root_key.objectid || 3163 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3164 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3165 goto out; 3166 3167 ret = 0; 3168 out: 3169 return ret; 3170 } 3171 3172 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 3173 u64 bytenr) 3174 { 3175 struct btrfs_path *path; 3176 int ret; 3177 int ret2; 3178 3179 path = btrfs_alloc_path(); 3180 if (!path) 3181 return -ENOENT; 3182 3183 do { 3184 ret = check_committed_ref(root, path, objectid, 3185 offset, bytenr); 3186 if (ret && ret != -ENOENT) 3187 goto out; 3188 3189 ret2 = check_delayed_ref(root, path, objectid, 3190 offset, bytenr); 3191 } while (ret2 == -EAGAIN); 3192 3193 if (ret2 && ret2 != -ENOENT) { 3194 ret = ret2; 3195 goto out; 3196 } 3197 3198 if (ret != -ENOENT || ret2 != -ENOENT) 3199 ret = 0; 3200 out: 3201 btrfs_free_path(path); 3202 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3203 WARN_ON(ret > 0); 3204 return ret; 3205 } 3206 3207 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3208 struct btrfs_root *root, 3209 struct extent_buffer *buf, 3210 int full_backref, int inc) 3211 { 3212 struct btrfs_fs_info *fs_info = root->fs_info; 3213 u64 bytenr; 3214 u64 num_bytes; 3215 u64 parent; 3216 u64 ref_root; 3217 u32 nritems; 3218 struct btrfs_key key; 3219 struct 
btrfs_file_extent_item *fi; 3220 int i; 3221 int level; 3222 int ret = 0; 3223 int (*process_func)(struct btrfs_trans_handle *, 3224 struct btrfs_fs_info *, 3225 u64, u64, u64, u64, u64, u64); 3226 3227 3228 if (btrfs_is_testing(fs_info)) 3229 return 0; 3230 3231 ref_root = btrfs_header_owner(buf); 3232 nritems = btrfs_header_nritems(buf); 3233 level = btrfs_header_level(buf); 3234 3235 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3236 return 0; 3237 3238 if (inc) 3239 process_func = btrfs_inc_extent_ref; 3240 else 3241 process_func = btrfs_free_extent; 3242 3243 if (full_backref) 3244 parent = buf->start; 3245 else 3246 parent = 0; 3247 3248 for (i = 0; i < nritems; i++) { 3249 if (level == 0) { 3250 btrfs_item_key_to_cpu(buf, &key, i); 3251 if (key.type != BTRFS_EXTENT_DATA_KEY) 3252 continue; 3253 fi = btrfs_item_ptr(buf, i, 3254 struct btrfs_file_extent_item); 3255 if (btrfs_file_extent_type(buf, fi) == 3256 BTRFS_FILE_EXTENT_INLINE) 3257 continue; 3258 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3259 if (bytenr == 0) 3260 continue; 3261 3262 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3263 key.offset -= btrfs_file_extent_offset(buf, fi); 3264 ret = process_func(trans, fs_info, bytenr, num_bytes, 3265 parent, ref_root, key.objectid, 3266 key.offset); 3267 if (ret) 3268 goto fail; 3269 } else { 3270 bytenr = btrfs_node_blockptr(buf, i); 3271 num_bytes = fs_info->nodesize; 3272 ret = process_func(trans, fs_info, bytenr, num_bytes, 3273 parent, ref_root, level - 1, 0); 3274 if (ret) 3275 goto fail; 3276 } 3277 } 3278 return 0; 3279 fail: 3280 return ret; 3281 } 3282 3283 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3284 struct extent_buffer *buf, int full_backref) 3285 { 3286 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3287 } 3288 3289 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3290 struct extent_buffer *buf, int full_backref) 3291 { 3292 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3293 } 3294 3295 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3296 struct btrfs_fs_info *fs_info, 3297 struct btrfs_path *path, 3298 struct btrfs_block_group_cache *cache) 3299 { 3300 int ret; 3301 struct btrfs_root *extent_root = fs_info->extent_root; 3302 unsigned long bi; 3303 struct extent_buffer *leaf; 3304 3305 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3306 if (ret) { 3307 if (ret > 0) 3308 ret = -ENOENT; 3309 goto fail; 3310 } 3311 3312 leaf = path->nodes[0]; 3313 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3314 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3315 btrfs_mark_buffer_dirty(leaf); 3316 fail: 3317 btrfs_release_path(path); 3318 return ret; 3319 3320 } 3321 3322 static struct btrfs_block_group_cache * 3323 next_block_group(struct btrfs_fs_info *fs_info, 3324 struct btrfs_block_group_cache *cache) 3325 { 3326 struct rb_node *node; 3327 3328 spin_lock(&fs_info->block_group_cache_lock); 3329 3330 /* If our block group was removed, we need a full search. 
*/ 3331 if (RB_EMPTY_NODE(&cache->cache_node)) { 3332 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3333 3334 spin_unlock(&fs_info->block_group_cache_lock); 3335 btrfs_put_block_group(cache); 3336 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3337 } 3338 node = rb_next(&cache->cache_node); 3339 btrfs_put_block_group(cache); 3340 if (node) { 3341 cache = rb_entry(node, struct btrfs_block_group_cache, 3342 cache_node); 3343 btrfs_get_block_group(cache); 3344 } else 3345 cache = NULL; 3346 spin_unlock(&fs_info->block_group_cache_lock); 3347 return cache; 3348 } 3349 3350 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3351 struct btrfs_trans_handle *trans, 3352 struct btrfs_path *path) 3353 { 3354 struct btrfs_fs_info *fs_info = block_group->fs_info; 3355 struct btrfs_root *root = fs_info->tree_root; 3356 struct inode *inode = NULL; 3357 u64 alloc_hint = 0; 3358 int dcs = BTRFS_DC_ERROR; 3359 u64 num_pages = 0; 3360 int retries = 0; 3361 int ret = 0; 3362 3363 /* 3364 * If this block group is smaller than 100 megs don't bother caching the 3365 * block group. 3366 */ 3367 if (block_group->key.offset < (100 * SZ_1M)) { 3368 spin_lock(&block_group->lock); 3369 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3370 spin_unlock(&block_group->lock); 3371 return 0; 3372 } 3373 3374 if (trans->aborted) 3375 return 0; 3376 again: 3377 inode = lookup_free_space_inode(fs_info, block_group, path); 3378 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3379 ret = PTR_ERR(inode); 3380 btrfs_release_path(path); 3381 goto out; 3382 } 3383 3384 if (IS_ERR(inode)) { 3385 BUG_ON(retries); 3386 retries++; 3387 3388 if (block_group->ro) 3389 goto out_free; 3390 3391 ret = create_free_space_inode(fs_info, trans, block_group, 3392 path); 3393 if (ret) 3394 goto out_free; 3395 goto again; 3396 } 3397 3398 /* We've already setup this transaction, go ahead and exit */ 3399 if (block_group->cache_generation == trans->transid && 3400 i_size_read(inode)) { 3401 dcs = BTRFS_DC_SETUP; 3402 goto out_put; 3403 } 3404 3405 /* 3406 * We want to set the generation to 0, that way if anything goes wrong 3407 * from here on out we know not to trust this cache when we load up next 3408 * time. 3409 */ 3410 BTRFS_I(inode)->generation = 0; 3411 ret = btrfs_update_inode(trans, root, inode); 3412 if (ret) { 3413 /* 3414 * So theoretically we could recover from this, simply set the 3415 * super cache generation to 0 so we know to invalidate the 3416 * cache, but then we'd have to keep track of the block groups 3417 * that fail this way so we know we _have_ to reset this cache 3418 * before the next commit or risk reading stale cache. So to 3419 * limit our exposure to horrible edge cases lets just abort the 3420 * transaction, this only happens in really bad situations 3421 * anyway. 3422 */ 3423 btrfs_abort_transaction(trans, ret); 3424 goto out_put; 3425 } 3426 WARN_ON(ret); 3427 3428 if (i_size_read(inode) > 0) { 3429 ret = btrfs_check_trunc_cache_free_space(fs_info, 3430 &fs_info->global_block_rsv); 3431 if (ret) 3432 goto out_put; 3433 3434 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3435 if (ret) 3436 goto out_put; 3437 } 3438 3439 spin_lock(&block_group->lock); 3440 if (block_group->cached != BTRFS_CACHE_FINISHED || 3441 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3442 /* 3443 * don't bother trying to write stuff out _if_ 3444 * a) we're not cached, 3445 * b) we're with nospace_cache mount option. 
3446 */ 3447 dcs = BTRFS_DC_WRITTEN; 3448 spin_unlock(&block_group->lock); 3449 goto out_put; 3450 } 3451 spin_unlock(&block_group->lock); 3452 3453 /* 3454 * We hit an ENOSPC when setting up the cache in this transaction, just 3455 * skip doing the setup, we've already cleared the cache so we're safe. 3456 */ 3457 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3458 ret = -ENOSPC; 3459 goto out_put; 3460 } 3461 3462 /* 3463 * Try to preallocate enough space based on how big the block group is. 3464 * Keep in mind this has to include any pinned space which could end up 3465 * taking up quite a bit since it's not folded into the other space 3466 * cache. 3467 */ 3468 num_pages = div_u64(block_group->key.offset, SZ_256M); 3469 if (!num_pages) 3470 num_pages = 1; 3471 3472 num_pages *= 16; 3473 num_pages *= PAGE_SIZE; 3474 3475 ret = btrfs_check_data_free_space(inode, 0, num_pages); 3476 if (ret) 3477 goto out_put; 3478 3479 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3480 num_pages, num_pages, 3481 &alloc_hint); 3482 /* 3483 * Our cache requires contiguous chunks so that we don't modify a bunch 3484 * of metadata or split extents when writing the cache out, which means 3485 * we can enospc if we are heavily fragmented in addition to just normal 3486 * out of space conditions. So if we hit this just skip setting up any 3487 * other block groups for this transaction, maybe we'll unpin enough 3488 * space the next time around. 3489 */ 3490 if (!ret) 3491 dcs = BTRFS_DC_SETUP; 3492 else if (ret == -ENOSPC) 3493 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3494 3495 out_put: 3496 iput(inode); 3497 out_free: 3498 btrfs_release_path(path); 3499 out: 3500 spin_lock(&block_group->lock); 3501 if (!ret && dcs == BTRFS_DC_SETUP) 3502 block_group->cache_generation = trans->transid; 3503 block_group->disk_cache_state = dcs; 3504 spin_unlock(&block_group->lock); 3505 3506 return ret; 3507 } 3508 3509 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3510 struct btrfs_fs_info *fs_info) 3511 { 3512 struct btrfs_block_group_cache *cache, *tmp; 3513 struct btrfs_transaction *cur_trans = trans->transaction; 3514 struct btrfs_path *path; 3515 3516 if (list_empty(&cur_trans->dirty_bgs) || 3517 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3518 return 0; 3519 3520 path = btrfs_alloc_path(); 3521 if (!path) 3522 return -ENOMEM; 3523 3524 /* Could add new block groups, use _safe just in case */ 3525 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3526 dirty_list) { 3527 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3528 cache_save_setup(cache, trans, path); 3529 } 3530 3531 btrfs_free_path(path); 3532 return 0; 3533 } 3534 3535 /* 3536 * transaction commit does final block group cache writeback during a 3537 * critical section where nothing is allowed to change the FS. This is 3538 * required in order for the cache to actually match the block group, 3539 * but can introduce a lot of latency into the commit. 3540 * 3541 * So, btrfs_start_dirty_block_groups is here to kick off block group 3542 * cache IO. There's a chance we'll have to redo some of it if the 3543 * block group changes again during the commit, but it greatly reduces 3544 * the commit latency by getting rid of the easy block groups while 3545 * we're still allowing others to join the commit. 
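 *
 * Roughly: this function (called outside the critical section) writes out
 * whatever is on ->dirty_bgs and queues the started cache IO on ->io_bgs;
 * anything that gets re-dirtied afterwards is handled again by
 * btrfs_write_dirty_block_groups() from within the commit critical section.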
3546 */ 3547 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3548 struct btrfs_fs_info *fs_info) 3549 { 3550 struct btrfs_block_group_cache *cache; 3551 struct btrfs_transaction *cur_trans = trans->transaction; 3552 int ret = 0; 3553 int should_put; 3554 struct btrfs_path *path = NULL; 3555 LIST_HEAD(dirty); 3556 struct list_head *io = &cur_trans->io_bgs; 3557 int num_started = 0; 3558 int loops = 0; 3559 3560 spin_lock(&cur_trans->dirty_bgs_lock); 3561 if (list_empty(&cur_trans->dirty_bgs)) { 3562 spin_unlock(&cur_trans->dirty_bgs_lock); 3563 return 0; 3564 } 3565 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3566 spin_unlock(&cur_trans->dirty_bgs_lock); 3567 3568 again: 3569 /* 3570 * make sure all the block groups on our dirty list actually 3571 * exist 3572 */ 3573 btrfs_create_pending_block_groups(trans, fs_info); 3574 3575 if (!path) { 3576 path = btrfs_alloc_path(); 3577 if (!path) 3578 return -ENOMEM; 3579 } 3580 3581 /* 3582 * cache_write_mutex is here only to save us from balance or automatic 3583 * removal of empty block groups deleting this block group while we are 3584 * writing out the cache 3585 */ 3586 mutex_lock(&trans->transaction->cache_write_mutex); 3587 while (!list_empty(&dirty)) { 3588 cache = list_first_entry(&dirty, 3589 struct btrfs_block_group_cache, 3590 dirty_list); 3591 /* 3592 * this can happen if something re-dirties a block 3593 * group that is already under IO. Just wait for it to 3594 * finish and then do it all again 3595 */ 3596 if (!list_empty(&cache->io_list)) { 3597 list_del_init(&cache->io_list); 3598 btrfs_wait_cache_io(trans, cache, path); 3599 btrfs_put_block_group(cache); 3600 } 3601 3602 3603 /* 3604 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3605 * if it should update the cache_state. Don't delete 3606 * until after we wait. 3607 * 3608 * Since we're not running in the commit critical section 3609 * we need the dirty_bgs_lock to protect from update_block_group 3610 */ 3611 spin_lock(&cur_trans->dirty_bgs_lock); 3612 list_del_init(&cache->dirty_list); 3613 spin_unlock(&cur_trans->dirty_bgs_lock); 3614 3615 should_put = 1; 3616 3617 cache_save_setup(cache, trans, path); 3618 3619 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3620 cache->io_ctl.inode = NULL; 3621 ret = btrfs_write_out_cache(fs_info, trans, 3622 cache, path); 3623 if (ret == 0 && cache->io_ctl.inode) { 3624 num_started++; 3625 should_put = 0; 3626 3627 /* 3628 * the cache_write_mutex is protecting 3629 * the io_list 3630 */ 3631 list_add_tail(&cache->io_list, io); 3632 } else { 3633 /* 3634 * if we failed to write the cache, the 3635 * generation will be bad and life goes on 3636 */ 3637 ret = 0; 3638 } 3639 } 3640 if (!ret) { 3641 ret = write_one_cache_group(trans, fs_info, 3642 path, cache); 3643 /* 3644 * Our block group might still be attached to the list 3645 * of new block groups in the transaction handle of some 3646 * other task (struct btrfs_trans_handle->new_bgs). This 3647 * means its block group item isn't yet in the extent 3648 * tree. If this happens ignore the error, as we will 3649 * try again later in the critical section of the 3650 * transaction commit. 
3651 */ 3652 if (ret == -ENOENT) { 3653 ret = 0; 3654 spin_lock(&cur_trans->dirty_bgs_lock); 3655 if (list_empty(&cache->dirty_list)) { 3656 list_add_tail(&cache->dirty_list, 3657 &cur_trans->dirty_bgs); 3658 btrfs_get_block_group(cache); 3659 } 3660 spin_unlock(&cur_trans->dirty_bgs_lock); 3661 } else if (ret) { 3662 btrfs_abort_transaction(trans, ret); 3663 } 3664 } 3665 3666 /* if its not on the io list, we need to put the block group */ 3667 if (should_put) 3668 btrfs_put_block_group(cache); 3669 3670 if (ret) 3671 break; 3672 3673 /* 3674 * Avoid blocking other tasks for too long. It might even save 3675 * us from writing caches for block groups that are going to be 3676 * removed. 3677 */ 3678 mutex_unlock(&trans->transaction->cache_write_mutex); 3679 mutex_lock(&trans->transaction->cache_write_mutex); 3680 } 3681 mutex_unlock(&trans->transaction->cache_write_mutex); 3682 3683 /* 3684 * go through delayed refs for all the stuff we've just kicked off 3685 * and then loop back (just once) 3686 */ 3687 ret = btrfs_run_delayed_refs(trans, fs_info, 0); 3688 if (!ret && loops == 0) { 3689 loops++; 3690 spin_lock(&cur_trans->dirty_bgs_lock); 3691 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3692 /* 3693 * dirty_bgs_lock protects us from concurrent block group 3694 * deletes too (not just cache_write_mutex). 3695 */ 3696 if (!list_empty(&dirty)) { 3697 spin_unlock(&cur_trans->dirty_bgs_lock); 3698 goto again; 3699 } 3700 spin_unlock(&cur_trans->dirty_bgs_lock); 3701 } else if (ret < 0) { 3702 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3703 } 3704 3705 btrfs_free_path(path); 3706 return ret; 3707 } 3708 3709 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3710 struct btrfs_fs_info *fs_info) 3711 { 3712 struct btrfs_block_group_cache *cache; 3713 struct btrfs_transaction *cur_trans = trans->transaction; 3714 int ret = 0; 3715 int should_put; 3716 struct btrfs_path *path; 3717 struct list_head *io = &cur_trans->io_bgs; 3718 int num_started = 0; 3719 3720 path = btrfs_alloc_path(); 3721 if (!path) 3722 return -ENOMEM; 3723 3724 /* 3725 * Even though we are in the critical section of the transaction commit, 3726 * we can still have concurrent tasks adding elements to this 3727 * transaction's list of dirty block groups. These tasks correspond to 3728 * endio free space workers started when writeback finishes for a 3729 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3730 * allocate new block groups as a result of COWing nodes of the root 3731 * tree when updating the free space inode. The writeback for the space 3732 * caches is triggered by an earlier call to 3733 * btrfs_start_dirty_block_groups() and iterations of the following 3734 * loop. 3735 * Also we want to do the cache_save_setup first and then run the 3736 * delayed refs to make sure we have the best chance at doing this all 3737 * in one shot. 3738 */ 3739 spin_lock(&cur_trans->dirty_bgs_lock); 3740 while (!list_empty(&cur_trans->dirty_bgs)) { 3741 cache = list_first_entry(&cur_trans->dirty_bgs, 3742 struct btrfs_block_group_cache, 3743 dirty_list); 3744 3745 /* 3746 * this can happen if cache_save_setup re-dirties a block 3747 * group that is already under IO. 
Just wait for it to 3748 * finish and then do it all again 3749 */ 3750 if (!list_empty(&cache->io_list)) { 3751 spin_unlock(&cur_trans->dirty_bgs_lock); 3752 list_del_init(&cache->io_list); 3753 btrfs_wait_cache_io(trans, cache, path); 3754 btrfs_put_block_group(cache); 3755 spin_lock(&cur_trans->dirty_bgs_lock); 3756 } 3757 3758 /* 3759 * don't remove from the dirty list until after we've waited 3760 * on any pending IO 3761 */ 3762 list_del_init(&cache->dirty_list); 3763 spin_unlock(&cur_trans->dirty_bgs_lock); 3764 should_put = 1; 3765 3766 cache_save_setup(cache, trans, path); 3767 3768 if (!ret) 3769 ret = btrfs_run_delayed_refs(trans, fs_info, 3770 (unsigned long) -1); 3771 3772 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3773 cache->io_ctl.inode = NULL; 3774 ret = btrfs_write_out_cache(fs_info, trans, 3775 cache, path); 3776 if (ret == 0 && cache->io_ctl.inode) { 3777 num_started++; 3778 should_put = 0; 3779 list_add_tail(&cache->io_list, io); 3780 } else { 3781 /* 3782 * if we failed to write the cache, the 3783 * generation will be bad and life goes on 3784 */ 3785 ret = 0; 3786 } 3787 } 3788 if (!ret) { 3789 ret = write_one_cache_group(trans, fs_info, 3790 path, cache); 3791 /* 3792 * One of the free space endio workers might have 3793 * created a new block group while updating a free space 3794 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3795 * and hasn't released its transaction handle yet, in 3796 * which case the new block group is still attached to 3797 * its transaction handle and its creation has not 3798 * finished yet (no block group item in the extent tree 3799 * yet, etc). If this is the case, wait for all free 3800 * space endio workers to finish and retry. This is a 3801 * very rare case so no need for a more efficient and 3802 * complex approach.
3803 */ 3804 if (ret == -ENOENT) { 3805 wait_event(cur_trans->writer_wait, 3806 atomic_read(&cur_trans->num_writers) == 1); 3807 ret = write_one_cache_group(trans, fs_info, 3808 path, cache); 3809 } 3810 if (ret) 3811 btrfs_abort_transaction(trans, ret); 3812 } 3813 3814 /* if its not on the io list, we need to put the block group */ 3815 if (should_put) 3816 btrfs_put_block_group(cache); 3817 spin_lock(&cur_trans->dirty_bgs_lock); 3818 } 3819 spin_unlock(&cur_trans->dirty_bgs_lock); 3820 3821 while (!list_empty(io)) { 3822 cache = list_first_entry(io, struct btrfs_block_group_cache, 3823 io_list); 3824 list_del_init(&cache->io_list); 3825 btrfs_wait_cache_io(trans, cache, path); 3826 btrfs_put_block_group(cache); 3827 } 3828 3829 btrfs_free_path(path); 3830 return ret; 3831 } 3832 3833 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3834 { 3835 struct btrfs_block_group_cache *block_group; 3836 int readonly = 0; 3837 3838 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3839 if (!block_group || block_group->ro) 3840 readonly = 1; 3841 if (block_group) 3842 btrfs_put_block_group(block_group); 3843 return readonly; 3844 } 3845 3846 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3847 { 3848 struct btrfs_block_group_cache *bg; 3849 bool ret = true; 3850 3851 bg = btrfs_lookup_block_group(fs_info, bytenr); 3852 if (!bg) 3853 return false; 3854 3855 spin_lock(&bg->lock); 3856 if (bg->ro) 3857 ret = false; 3858 else 3859 atomic_inc(&bg->nocow_writers); 3860 spin_unlock(&bg->lock); 3861 3862 /* no put on block group, done by btrfs_dec_nocow_writers */ 3863 if (!ret) 3864 btrfs_put_block_group(bg); 3865 3866 return ret; 3867 3868 } 3869 3870 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3871 { 3872 struct btrfs_block_group_cache *bg; 3873 3874 bg = btrfs_lookup_block_group(fs_info, bytenr); 3875 ASSERT(bg); 3876 if (atomic_dec_and_test(&bg->nocow_writers)) 3877 wake_up_atomic_t(&bg->nocow_writers); 3878 /* 3879 * Once for our lookup and once for the lookup done by a previous call 3880 * to btrfs_inc_nocow_writers() 3881 */ 3882 btrfs_put_block_group(bg); 3883 btrfs_put_block_group(bg); 3884 } 3885 3886 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) 3887 { 3888 schedule(); 3889 return 0; 3890 } 3891 3892 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3893 { 3894 wait_on_atomic_t(&bg->nocow_writers, 3895 btrfs_wait_nocow_writers_atomic_t, 3896 TASK_UNINTERRUPTIBLE); 3897 } 3898 3899 static const char *alloc_name(u64 flags) 3900 { 3901 switch (flags) { 3902 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3903 return "mixed"; 3904 case BTRFS_BLOCK_GROUP_METADATA: 3905 return "metadata"; 3906 case BTRFS_BLOCK_GROUP_DATA: 3907 return "data"; 3908 case BTRFS_BLOCK_GROUP_SYSTEM: 3909 return "system"; 3910 default: 3911 WARN_ON(1); 3912 return "invalid-combination"; 3913 }; 3914 } 3915 3916 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3917 u64 total_bytes, u64 bytes_used, 3918 u64 bytes_readonly, 3919 struct btrfs_space_info **space_info) 3920 { 3921 struct btrfs_space_info *found; 3922 int i; 3923 int factor; 3924 int ret; 3925 3926 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3927 BTRFS_BLOCK_GROUP_RAID10)) 3928 factor = 2; 3929 else 3930 factor = 1; 3931 3932 found = __find_space_info(info, flags); 3933 if (found) { 3934 spin_lock(&found->lock); 3935 found->total_bytes += total_bytes; 3936 found->disk_total += total_bytes * factor; 3937 
found->bytes_used += bytes_used; 3938 found->disk_used += bytes_used * factor; 3939 found->bytes_readonly += bytes_readonly; 3940 if (total_bytes > 0) 3941 found->full = 0; 3942 space_info_add_new_bytes(info, found, total_bytes - 3943 bytes_used - bytes_readonly); 3944 spin_unlock(&found->lock); 3945 *space_info = found; 3946 return 0; 3947 } 3948 found = kzalloc(sizeof(*found), GFP_NOFS); 3949 if (!found) 3950 return -ENOMEM; 3951 3952 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3953 if (ret) { 3954 kfree(found); 3955 return ret; 3956 } 3957 3958 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3959 INIT_LIST_HEAD(&found->block_groups[i]); 3960 init_rwsem(&found->groups_sem); 3961 spin_lock_init(&found->lock); 3962 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3963 found->total_bytes = total_bytes; 3964 found->disk_total = total_bytes * factor; 3965 found->bytes_used = bytes_used; 3966 found->disk_used = bytes_used * factor; 3967 found->bytes_pinned = 0; 3968 found->bytes_reserved = 0; 3969 found->bytes_readonly = bytes_readonly; 3970 found->bytes_may_use = 0; 3971 found->full = 0; 3972 found->max_extent_size = 0; 3973 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3974 found->chunk_alloc = 0; 3975 found->flush = 0; 3976 init_waitqueue_head(&found->wait); 3977 INIT_LIST_HEAD(&found->ro_bgs); 3978 INIT_LIST_HEAD(&found->tickets); 3979 INIT_LIST_HEAD(&found->priority_tickets); 3980 3981 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3982 info->space_info_kobj, "%s", 3983 alloc_name(found->flags)); 3984 if (ret) { 3985 kfree(found); 3986 return ret; 3987 } 3988 3989 *space_info = found; 3990 list_add_rcu(&found->list, &info->space_info); 3991 if (flags & BTRFS_BLOCK_GROUP_DATA) 3992 info->data_sinfo = found; 3993 3994 return ret; 3995 } 3996 3997 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3998 { 3999 u64 extra_flags = chunk_to_extended(flags) & 4000 BTRFS_EXTENDED_PROFILE_MASK; 4001 4002 write_seqlock(&fs_info->profiles_lock); 4003 if (flags & BTRFS_BLOCK_GROUP_DATA) 4004 fs_info->avail_data_alloc_bits |= extra_flags; 4005 if (flags & BTRFS_BLOCK_GROUP_METADATA) 4006 fs_info->avail_metadata_alloc_bits |= extra_flags; 4007 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4008 fs_info->avail_system_alloc_bits |= extra_flags; 4009 write_sequnlock(&fs_info->profiles_lock); 4010 } 4011 4012 /* 4013 * returns target flags in extended format or 0 if restripe for this 4014 * chunk_type is not in progress 4015 * 4016 * should be called with either volume_mutex or balance_lock held 4017 */ 4018 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4019 { 4020 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4021 u64 target = 0; 4022 4023 if (!bctl) 4024 return 0; 4025 4026 if (flags & BTRFS_BLOCK_GROUP_DATA && 4027 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4028 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4029 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4030 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4031 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4032 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4033 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4034 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4035 } 4036 4037 return target; 4038 } 4039 4040 /* 4041 * @flags: available profiles in extended format (see ctree.h) 4042 * 4043 * Returns reduced profile in chunk format. 
If profile changing is in 4044 * progress (either running or paused) picks the target profile (if it's 4045 * already available), otherwise falls back to plain reducing. 4046 */ 4047 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 4048 { 4049 u64 num_devices = fs_info->fs_devices->rw_devices; 4050 u64 target; 4051 u64 raid_type; 4052 u64 allowed = 0; 4053 4054 /* 4055 * see if restripe for this chunk_type is in progress, if so 4056 * try to reduce to the target profile 4057 */ 4058 spin_lock(&fs_info->balance_lock); 4059 target = get_restripe_target(fs_info, flags); 4060 if (target) { 4061 /* pick target profile only if it's already available */ 4062 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4063 spin_unlock(&fs_info->balance_lock); 4064 return extended_to_chunk(target); 4065 } 4066 } 4067 spin_unlock(&fs_info->balance_lock); 4068 4069 /* First, mask out the RAID levels which aren't possible */ 4070 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4071 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4072 allowed |= btrfs_raid_group[raid_type]; 4073 } 4074 allowed &= flags; 4075 4076 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4077 allowed = BTRFS_BLOCK_GROUP_RAID6; 4078 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4079 allowed = BTRFS_BLOCK_GROUP_RAID5; 4080 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4081 allowed = BTRFS_BLOCK_GROUP_RAID10; 4082 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4083 allowed = BTRFS_BLOCK_GROUP_RAID1; 4084 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4085 allowed = BTRFS_BLOCK_GROUP_RAID0; 4086 4087 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4088 4089 return extended_to_chunk(flags | allowed); 4090 } 4091 4092 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 4093 { 4094 unsigned seq; 4095 u64 flags; 4096 4097 do { 4098 flags = orig_flags; 4099 seq = read_seqbegin(&fs_info->profiles_lock); 4100 4101 if (flags & BTRFS_BLOCK_GROUP_DATA) 4102 flags |= fs_info->avail_data_alloc_bits; 4103 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4104 flags |= fs_info->avail_system_alloc_bits; 4105 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4106 flags |= fs_info->avail_metadata_alloc_bits; 4107 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4108 4109 return btrfs_reduce_alloc_profile(fs_info, flags); 4110 } 4111 4112 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 4113 { 4114 struct btrfs_fs_info *fs_info = root->fs_info; 4115 u64 flags; 4116 u64 ret; 4117 4118 if (data) 4119 flags = BTRFS_BLOCK_GROUP_DATA; 4120 else if (root == fs_info->chunk_root) 4121 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4122 else 4123 flags = BTRFS_BLOCK_GROUP_METADATA; 4124 4125 ret = get_alloc_profile(fs_info, flags); 4126 return ret; 4127 } 4128 4129 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4130 bool may_use_included) 4131 { 4132 ASSERT(s_info); 4133 return s_info->bytes_used + s_info->bytes_reserved + 4134 s_info->bytes_pinned + s_info->bytes_readonly + 4135 (may_use_included ? 
s_info->bytes_may_use : 0); 4136 } 4137 4138 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) 4139 { 4140 struct btrfs_space_info *data_sinfo; 4141 struct btrfs_root *root = BTRFS_I(inode)->root; 4142 struct btrfs_fs_info *fs_info = root->fs_info; 4143 u64 used; 4144 int ret = 0; 4145 int need_commit = 2; 4146 int have_pinned_space; 4147 4148 /* make sure bytes are sectorsize aligned */ 4149 bytes = ALIGN(bytes, fs_info->sectorsize); 4150 4151 if (btrfs_is_free_space_inode(inode)) { 4152 need_commit = 0; 4153 ASSERT(current->journal_info); 4154 } 4155 4156 data_sinfo = fs_info->data_sinfo; 4157 if (!data_sinfo) 4158 goto alloc; 4159 4160 again: 4161 /* make sure we have enough space to handle the data first */ 4162 spin_lock(&data_sinfo->lock); 4163 used = btrfs_space_info_used(data_sinfo, true); 4164 4165 if (used + bytes > data_sinfo->total_bytes) { 4166 struct btrfs_trans_handle *trans; 4167 4168 /* 4169 * if we don't have enough free bytes in this space then we need 4170 * to alloc a new chunk. 4171 */ 4172 if (!data_sinfo->full) { 4173 u64 alloc_target; 4174 4175 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4176 spin_unlock(&data_sinfo->lock); 4177 alloc: 4178 alloc_target = btrfs_get_alloc_profile(root, 1); 4179 /* 4180 * It is ugly that we don't call nolock join 4181 * transaction for the free space inode case here. 4182 * But it is safe because we only do the data space 4183 * reservation for the free space cache in the 4184 * transaction context, the common join transaction 4185 * just increase the counter of the current transaction 4186 * handler, doesn't try to acquire the trans_lock of 4187 * the fs. 4188 */ 4189 trans = btrfs_join_transaction(root); 4190 if (IS_ERR(trans)) 4191 return PTR_ERR(trans); 4192 4193 ret = do_chunk_alloc(trans, fs_info, alloc_target, 4194 CHUNK_ALLOC_NO_FORCE); 4195 btrfs_end_transaction(trans); 4196 if (ret < 0) { 4197 if (ret != -ENOSPC) 4198 return ret; 4199 else { 4200 have_pinned_space = 1; 4201 goto commit_trans; 4202 } 4203 } 4204 4205 if (!data_sinfo) 4206 data_sinfo = fs_info->data_sinfo; 4207 4208 goto again; 4209 } 4210 4211 /* 4212 * If we don't have enough pinned space to deal with this 4213 * allocation, and no removed chunk in current transaction, 4214 * don't bother committing the transaction. 4215 */ 4216 have_pinned_space = percpu_counter_compare( 4217 &data_sinfo->total_bytes_pinned, 4218 used + bytes - data_sinfo->total_bytes); 4219 spin_unlock(&data_sinfo->lock); 4220 4221 /* commit the current transaction and try again */ 4222 commit_trans: 4223 if (need_commit && 4224 !atomic_read(&fs_info->open_ioctl_trans)) { 4225 need_commit--; 4226 4227 if (need_commit > 0) { 4228 btrfs_start_delalloc_roots(fs_info, 0, -1); 4229 btrfs_wait_ordered_roots(fs_info, -1, 0, 4230 (u64)-1); 4231 } 4232 4233 trans = btrfs_join_transaction(root); 4234 if (IS_ERR(trans)) 4235 return PTR_ERR(trans); 4236 if (have_pinned_space >= 0 || 4237 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4238 &trans->transaction->flags) || 4239 need_commit > 0) { 4240 ret = btrfs_commit_transaction(trans); 4241 if (ret) 4242 return ret; 4243 /* 4244 * The cleaner kthread might still be doing iput 4245 * operations. Wait for it to finish so that 4246 * more space is released. 
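 * The lock/unlock pair on cleaner_delayed_iput_mutex below is used purely
 * as a barrier: it blocks until the cleaner has dropped the mutex, no
 * shared state is accessed while holding it.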
4247 */ 4248 mutex_lock(&fs_info->cleaner_delayed_iput_mutex); 4249 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); 4250 goto again; 4251 } else { 4252 btrfs_end_transaction(trans); 4253 } 4254 } 4255 4256 trace_btrfs_space_reservation(fs_info, 4257 "space_info:enospc", 4258 data_sinfo->flags, bytes, 1); 4259 return -ENOSPC; 4260 } 4261 data_sinfo->bytes_may_use += bytes; 4262 trace_btrfs_space_reservation(fs_info, "space_info", 4263 data_sinfo->flags, bytes, 1); 4264 spin_unlock(&data_sinfo->lock); 4265 4266 return ret; 4267 } 4268 4269 /* 4270 * New check_data_free_space() with the ability for precise data reservation. 4271 * Will replace the old btrfs_check_data_free_space(), but for patch split, 4272 * add a new function first and then replace it. 4273 */ 4274 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) 4275 { 4276 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4277 int ret; 4278 4279 /* align the range */ 4280 len = round_up(start + len, fs_info->sectorsize) - 4281 round_down(start, fs_info->sectorsize); 4282 start = round_down(start, fs_info->sectorsize); 4283 4284 ret = btrfs_alloc_data_chunk_ondemand(inode, len); 4285 if (ret < 0) 4286 return ret; 4287 4288 /* Use new btrfs_qgroup_reserve_data to reserve precise data space. */ 4289 ret = btrfs_qgroup_reserve_data(inode, start, len); 4290 if (ret) 4291 btrfs_free_reserved_data_space_noquota(inode, start, len); 4292 return ret; 4293 } 4294 4295 /* 4296 * Called if we need to clear a data reservation for this inode, 4297 * normally in an error case. 4298 * 4299 * This one will *NOT* use the accurate qgroup reserved space API, it is only 4300 * for cases where we can't sleep and are sure it won't affect the qgroup 4301 * reserved space, like clear_bit_hook(). 4302 */ 4303 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4304 u64 len) 4305 { 4306 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4307 struct btrfs_space_info *data_sinfo; 4308 4309 /* Make sure the range is aligned to sectorsize */ 4310 len = round_up(start + len, fs_info->sectorsize) - 4311 round_down(start, fs_info->sectorsize); 4312 start = round_down(start, fs_info->sectorsize); 4313 4314 data_sinfo = fs_info->data_sinfo; 4315 spin_lock(&data_sinfo->lock); 4316 if (WARN_ON(data_sinfo->bytes_may_use < len)) 4317 data_sinfo->bytes_may_use = 0; 4318 else 4319 data_sinfo->bytes_may_use -= len; 4320 trace_btrfs_space_reservation(fs_info, "space_info", 4321 data_sinfo->flags, len, 0); 4322 spin_unlock(&data_sinfo->lock); 4323 } 4324 4325 /* 4326 * Called if we need to clear a data reservation for this inode, 4327 * normally in an error case. 4328 * 4329 * This one will handle the per-inode data rsv map of the accurate reserved 4330 * space framework.
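 * That is, it releases the space_info accounting via
 * btrfs_free_reserved_data_space_noquota() and also frees the qgroup data
 * reservation for the range.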
4331 */ 4332 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) 4333 { 4334 struct btrfs_root *root = BTRFS_I(inode)->root; 4335 4336 /* Make sure the range is aligned to sectorsize */ 4337 len = round_up(start + len, root->fs_info->sectorsize) - 4338 round_down(start, root->fs_info->sectorsize); 4339 start = round_down(start, root->fs_info->sectorsize); 4340 4341 btrfs_free_reserved_data_space_noquota(inode, start, len); 4342 btrfs_qgroup_free_data(inode, start, len); 4343 } 4344 4345 static void force_metadata_allocation(struct btrfs_fs_info *info) 4346 { 4347 struct list_head *head = &info->space_info; 4348 struct btrfs_space_info *found; 4349 4350 rcu_read_lock(); 4351 list_for_each_entry_rcu(found, head, list) { 4352 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4353 found->force_alloc = CHUNK_ALLOC_FORCE; 4354 } 4355 rcu_read_unlock(); 4356 } 4357 4358 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4359 { 4360 return (global->size << 1); 4361 } 4362 4363 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4364 struct btrfs_space_info *sinfo, int force) 4365 { 4366 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4367 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 4368 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 4369 u64 thresh; 4370 4371 if (force == CHUNK_ALLOC_FORCE) 4372 return 1; 4373 4374 /* 4375 * We need to take into account the global rsv because for all intents 4376 * and purposes it's used space. Don't worry about locking the 4377 * global_rsv, it doesn't change except when the transaction commits. 4378 */ 4379 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 4380 num_allocated += calc_global_rsv_need_space(global_rsv); 4381 4382 /* 4383 * in limited mode, we want to have some free space up to 4384 * about 1% of the FS size. 4385 */ 4386 if (force == CHUNK_ALLOC_LIMITED) { 4387 thresh = btrfs_super_total_bytes(fs_info->super_copy); 4388 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4389 4390 if (num_bytes - num_allocated < thresh) 4391 return 1; 4392 } 4393 4394 if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) 4395 return 0; 4396 return 1; 4397 } 4398 4399 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4400 { 4401 u64 num_dev; 4402 4403 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4404 BTRFS_BLOCK_GROUP_RAID0 | 4405 BTRFS_BLOCK_GROUP_RAID5 | 4406 BTRFS_BLOCK_GROUP_RAID6)) 4407 num_dev = fs_info->fs_devices->rw_devices; 4408 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4409 num_dev = 2; 4410 else 4411 num_dev = 1; /* DUP or single */ 4412 4413 return num_dev; 4414 } 4415 4416 /* 4417 * Reserve space in the system space_info for allocating or removing a chunk 4418 * of the given @type: the reservation must cover the device item updates and 4419 * the chunk item insertion or removal that this implies. 4420 */ 4421 void check_system_chunk(struct btrfs_trans_handle *trans, 4422 struct btrfs_fs_info *fs_info, u64 type) 4423 { 4424 struct btrfs_space_info *info; 4425 u64 left; 4426 u64 thresh; 4427 int ret = 0; 4428 u64 num_devs; 4429 4430 /* 4431 * Needed because we can end up allocating a system chunk, and we need 4432 * an atomic and race-free space reservation in the chunk block reserve.
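 * The caller must already hold fs_info->chunk_mutex, which the ASSERT
 * below verifies.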
4433 */ 4434 ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); 4435 4436 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4437 spin_lock(&info->lock); 4438 left = info->total_bytes - btrfs_space_info_used(info, true); 4439 spin_unlock(&info->lock); 4440 4441 num_devs = get_profile_num_devs(fs_info, type); 4442 4443 /* num_devs device items to update and 1 chunk item to add or remove */ 4444 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 4445 btrfs_calc_trans_metadata_size(fs_info, 1); 4446 4447 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4448 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4449 left, thresh, type); 4450 dump_space_info(fs_info, info, 0, 0); 4451 } 4452 4453 if (left < thresh) { 4454 u64 flags; 4455 4456 flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4457 /* 4458 * Ignore failure to create system chunk. We might end up not 4459 * needing it, as we might not need to COW all nodes/leafs from 4460 * the paths we visit in the chunk tree (they were already COWed 4461 * or created in the current transaction for example). 4462 */ 4463 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4464 } 4465 4466 if (!ret) { 4467 ret = btrfs_block_rsv_add(fs_info->chunk_root, 4468 &fs_info->chunk_block_rsv, 4469 thresh, BTRFS_RESERVE_NO_FLUSH); 4470 if (!ret) 4471 trans->chunk_bytes_reserved += thresh; 4472 } 4473 } 4474 4475 /* 4476 * If force is CHUNK_ALLOC_FORCE: 4477 * - return 1 if it successfully allocates a chunk, 4478 * - return errors including -ENOSPC otherwise. 4479 * If force is NOT CHUNK_ALLOC_FORCE: 4480 * - return 0 if it doesn't need to allocate a new chunk, 4481 * - return 1 if it successfully allocates a chunk, 4482 * - return errors including -ENOSPC otherwise. 4483 */ 4484 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 4485 struct btrfs_fs_info *fs_info, u64 flags, int force) 4486 { 4487 struct btrfs_space_info *space_info; 4488 int wait_for_alloc = 0; 4489 int ret = 0; 4490 4491 /* Don't re-enter if we're already allocating a chunk */ 4492 if (trans->allocating_chunk) 4493 return -ENOSPC; 4494 4495 space_info = __find_space_info(fs_info, flags); 4496 if (!space_info) { 4497 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 4498 BUG_ON(ret); /* -ENOMEM */ 4499 } 4500 BUG_ON(!space_info); /* Logic error */ 4501 4502 again: 4503 spin_lock(&space_info->lock); 4504 if (force < space_info->force_alloc) 4505 force = space_info->force_alloc; 4506 if (space_info->full) { 4507 if (should_alloc_chunk(fs_info, space_info, force)) 4508 ret = -ENOSPC; 4509 else 4510 ret = 0; 4511 spin_unlock(&space_info->lock); 4512 return ret; 4513 } 4514 4515 if (!should_alloc_chunk(fs_info, space_info, force)) { 4516 spin_unlock(&space_info->lock); 4517 return 0; 4518 } else if (space_info->chunk_alloc) { 4519 wait_for_alloc = 1; 4520 } else { 4521 space_info->chunk_alloc = 1; 4522 } 4523 4524 spin_unlock(&space_info->lock); 4525 4526 mutex_lock(&fs_info->chunk_mutex); 4527 4528 /* 4529 * The chunk_mutex is held throughout the entirety of a chunk 4530 * allocation, so once we've acquired the chunk_mutex we know that the 4531 * other guy is done and we need to recheck and see if we should 4532 * allocate. 4533 */ 4534 if (wait_for_alloc) { 4535 mutex_unlock(&fs_info->chunk_mutex); 4536 wait_for_alloc = 0; 4537 goto again; 4538 } 4539 4540 trans->allocating_chunk = true; 4541 4542 /* 4543 * If we have mixed data/metadata chunks we want to make sure we keep 4544 * allocating mixed chunks instead of individual chunks. 
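 * This is done by OR-ing both the DATA and METADATA type bits into the
 * flags passed down to btrfs_alloc_chunk() below.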
4545 */ 4546 if (btrfs_mixed_space_info(space_info)) 4547 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4548 4549 /* 4550 * if we're doing a data chunk, go ahead and make sure that 4551 * we keep a reasonable number of metadata chunks allocated in the 4552 * FS as well. 4553 */ 4554 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4555 fs_info->data_chunk_allocations++; 4556 if (!(fs_info->data_chunk_allocations % 4557 fs_info->metadata_ratio)) 4558 force_metadata_allocation(fs_info); 4559 } 4560 4561 /* 4562 * Check if we have enough space in SYSTEM chunk because we may need 4563 * to update devices. 4564 */ 4565 check_system_chunk(trans, fs_info, flags); 4566 4567 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4568 trans->allocating_chunk = false; 4569 4570 spin_lock(&space_info->lock); 4571 if (ret < 0 && ret != -ENOSPC) 4572 goto out; 4573 if (ret) 4574 space_info->full = 1; 4575 else 4576 ret = 1; 4577 4578 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4579 out: 4580 space_info->chunk_alloc = 0; 4581 spin_unlock(&space_info->lock); 4582 mutex_unlock(&fs_info->chunk_mutex); 4583 /* 4584 * When we allocate a new chunk we reserve space in the chunk block 4585 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4586 * add new nodes/leafs to it if we end up needing to do it when 4587 * inserting the chunk item and updating device items as part of the 4588 * second phase of chunk allocation, performed by 4589 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4590 * large number of new block groups to create in our transaction 4591 * handle's new_bgs list to avoid exhausting the chunk block reserve 4592 * in extreme cases - like having a single transaction create many new 4593 * block groups when starting to write out the free space caches of all 4594 * the block groups that were made dirty during the lifetime of the 4595 * transaction. 4596 */ 4597 if (trans->can_flush_pending_bgs && 4598 trans->chunk_bytes_reserved >= (u64)SZ_2M) { 4599 btrfs_create_pending_block_groups(trans, fs_info); 4600 btrfs_trans_release_chunk_metadata(trans); 4601 } 4602 return ret; 4603 } 4604 4605 static int can_overcommit(struct btrfs_root *root, 4606 struct btrfs_space_info *space_info, u64 bytes, 4607 enum btrfs_reserve_flush_enum flush) 4608 { 4609 struct btrfs_fs_info *fs_info = root->fs_info; 4610 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4611 u64 profile; 4612 u64 space_size; 4613 u64 avail; 4614 u64 used; 4615 4616 /* Don't overcommit when in mixed mode. */ 4617 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4618 return 0; 4619 4620 profile = btrfs_get_alloc_profile(root, 0); 4621 used = btrfs_space_info_used(space_info, false); 4622 4623 /* 4624 * We only want to allow over committing if we have lots of actual space 4625 * free, but if we don't have enough space to handle the global reserve 4626 * space then we could end up having a real enospc problem when trying 4627 * to allocate a chunk or some other such important allocation. 
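 * calc_global_rsv_need_space() returns twice the global reserve size, and
 * we refuse to overcommit once used space plus that amount would reach
 * total_bytes.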
4628 */ 4629 spin_lock(&global_rsv->lock); 4630 space_size = calc_global_rsv_need_space(global_rsv); 4631 spin_unlock(&global_rsv->lock); 4632 if (used + space_size >= space_info->total_bytes) 4633 return 0; 4634 4635 used += space_info->bytes_may_use; 4636 4637 spin_lock(&fs_info->free_chunk_lock); 4638 avail = fs_info->free_chunk_space; 4639 spin_unlock(&fs_info->free_chunk_lock); 4640 4641 /* 4642 * If we have dup, raid1 or raid10 then only half of the free 4643 * space is actually usable. For raid56, the space info used 4644 * doesn't include the parity drive, so we don't have to 4645 * change the math 4646 */ 4647 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4648 BTRFS_BLOCK_GROUP_RAID1 | 4649 BTRFS_BLOCK_GROUP_RAID10)) 4650 avail >>= 1; 4651 4652 /* 4653 * If we aren't flushing all things, let us overcommit up to 4654 * 1/2 of the space. If we can flush, don't let us overcommit 4655 * too much, let it overcommit up to 1/8 of the space. 4656 */ 4657 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4658 avail >>= 3; 4659 else 4660 avail >>= 1; 4661 4662 if (used + bytes < space_info->total_bytes + avail) 4663 return 1; 4664 return 0; 4665 } 4666 4667 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4668 unsigned long nr_pages, int nr_items) 4669 { 4670 struct super_block *sb = fs_info->sb; 4671 4672 if (down_read_trylock(&sb->s_umount)) { 4673 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4674 up_read(&sb->s_umount); 4675 } else { 4676 /* 4677 * We needn't worry about the filesystem going from r/w to r/o even 4678 * though we don't acquire the ->s_umount mutex, because the 4679 * filesystem should guarantee that the delalloc inodes list is 4680 * empty after the filesystem becomes read-only (all dirty pages 4681 * have been written to disk). 4682 */ 4683 btrfs_start_delalloc_roots(fs_info, 0, nr_items); 4684 if (!current->journal_info) 4685 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4686 } 4687 } 4688 4689 static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4690 u64 to_reclaim) 4691 { 4692 u64 bytes; 4693 int nr; 4694 4695 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4696 nr = (int)div64_u64(to_reclaim, bytes); 4697 if (!nr) 4698 nr = 1; 4699 return nr; 4700 } 4701 4702 #define EXTENT_SIZE_PER_ITEM SZ_256K 4703 4704 /* 4705 * shrink metadata reservation for delalloc 4706 */ 4707 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4708 bool wait_ordered) 4709 { 4710 struct btrfs_fs_info *fs_info = root->fs_info; 4711 struct btrfs_block_rsv *block_rsv; 4712 struct btrfs_space_info *space_info; 4713 struct btrfs_trans_handle *trans; 4714 u64 delalloc_bytes; 4715 u64 max_reclaim; 4716 long time_left; 4717 unsigned long nr_pages; 4718 int loops; 4719 int items; 4720 enum btrfs_reserve_flush_enum flush; 4721 4722 /* Calc the number of pages we need to flush for space reservation */ 4723 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4724 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; 4725 4726 trans = (struct btrfs_trans_handle *)current->journal_info; 4727 block_rsv = &fs_info->delalloc_block_rsv; 4728 space_info = block_rsv->space_info; 4729 4730 delalloc_bytes = percpu_counter_sum_positive( 4731 &fs_info->delalloc_bytes); 4732 if (delalloc_bytes == 0) { 4733 if (trans) 4734 return; 4735 if (wait_ordered) 4736 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4737 return; 4738 } 4739 4740 loops = 0; 4741 while (delalloc_bytes && loops < 3) { 4742 max_reclaim = min(delalloc_bytes, to_reclaim); 4743 nr_pages =
max_reclaim >> PAGE_SHIFT; 4744 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4745 /* 4746 * We need to wait for the async pages to actually start before 4747 * we do anything. 4748 */ 4749 max_reclaim = atomic_read(&fs_info->async_delalloc_pages); 4750 if (!max_reclaim) 4751 goto skip_async; 4752 4753 if (max_reclaim <= nr_pages) 4754 max_reclaim = 0; 4755 else 4756 max_reclaim -= nr_pages; 4757 4758 wait_event(fs_info->async_submit_wait, 4759 atomic_read(&fs_info->async_delalloc_pages) <= 4760 (int)max_reclaim); 4761 skip_async: 4762 if (!trans) 4763 flush = BTRFS_RESERVE_FLUSH_ALL; 4764 else 4765 flush = BTRFS_RESERVE_NO_FLUSH; 4766 spin_lock(&space_info->lock); 4767 if (can_overcommit(root, space_info, orig, flush)) { 4768 spin_unlock(&space_info->lock); 4769 break; 4770 } 4771 if (list_empty(&space_info->tickets) && 4772 list_empty(&space_info->priority_tickets)) { 4773 spin_unlock(&space_info->lock); 4774 break; 4775 } 4776 spin_unlock(&space_info->lock); 4777 4778 loops++; 4779 if (wait_ordered && !trans) { 4780 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4781 } else { 4782 time_left = schedule_timeout_killable(1); 4783 if (time_left) 4784 break; 4785 } 4786 delalloc_bytes = percpu_counter_sum_positive( 4787 &fs_info->delalloc_bytes); 4788 } 4789 } 4790 4791 /** 4792 * may_commit_transaction - possibly commit the transaction if it's ok to 4793 * @fs_info - the filesystem we're allocating in 4794 * @space_info - the space info we want to reserve from 4795 * @bytes - the number of bytes we want to reserve 4796 * @force - force the commit 4797 * This will check to make sure that committing the transaction will actually 4798 * get us somewhere and then commit the transaction if it does. Otherwise it 4799 * will return -ENOSPC. 4800 */ 4801 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4802 struct btrfs_space_info *space_info, 4803 u64 bytes, int force) 4804 { 4805 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4806 struct btrfs_trans_handle *trans; 4807 4808 trans = (struct btrfs_trans_handle *)current->journal_info; 4809 if (trans) 4810 return -EAGAIN; 4811 4812 if (force) 4813 goto commit; 4814 4815 /* See if there is enough pinned space to make this reservation */ 4816 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4817 bytes) >= 0) 4818 goto commit; 4819 4820 /* 4821 * See if there is some space in the delayed insertion reservation for 4822 * this reservation.
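 * Space held in the delayed_block_rsv should come back to this space_info
 * once the transaction commits, so subtract its size from the bytes we
 * still need before comparing against the pinned bytes counter.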
4823 */ 4824 if (space_info != delayed_rsv->space_info) 4825 return -ENOSPC; 4826 4827 spin_lock(&delayed_rsv->lock); 4828 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4829 bytes - delayed_rsv->size) < 0) { 4830 spin_unlock(&delayed_rsv->lock); 4831 return -ENOSPC; 4832 } 4833 spin_unlock(&delayed_rsv->lock); 4834 4835 commit: 4836 trans = btrfs_join_transaction(fs_info->fs_root); 4837 if (IS_ERR(trans)) 4838 return -ENOSPC; 4839 4840 return btrfs_commit_transaction(trans); 4841 } 4842 4843 struct reserve_ticket { 4844 u64 bytes; 4845 int error; 4846 struct list_head list; 4847 wait_queue_head_t wait; 4848 }; 4849 4850 static int flush_space(struct btrfs_fs_info *fs_info, 4851 struct btrfs_space_info *space_info, u64 num_bytes, 4852 u64 orig_bytes, int state) 4853 { 4854 struct btrfs_root *root = fs_info->fs_root; 4855 struct btrfs_trans_handle *trans; 4856 int nr; 4857 int ret = 0; 4858 4859 switch (state) { 4860 case FLUSH_DELAYED_ITEMS_NR: 4861 case FLUSH_DELAYED_ITEMS: 4862 if (state == FLUSH_DELAYED_ITEMS_NR) 4863 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 4864 else 4865 nr = -1; 4866 4867 trans = btrfs_join_transaction(root); 4868 if (IS_ERR(trans)) { 4869 ret = PTR_ERR(trans); 4870 break; 4871 } 4872 ret = btrfs_run_delayed_items_nr(trans, fs_info, nr); 4873 btrfs_end_transaction(trans); 4874 break; 4875 case FLUSH_DELALLOC: 4876 case FLUSH_DELALLOC_WAIT: 4877 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4878 state == FLUSH_DELALLOC_WAIT); 4879 break; 4880 case ALLOC_CHUNK: 4881 trans = btrfs_join_transaction(root); 4882 if (IS_ERR(trans)) { 4883 ret = PTR_ERR(trans); 4884 break; 4885 } 4886 ret = do_chunk_alloc(trans, fs_info, 4887 btrfs_get_alloc_profile(root, 0), 4888 CHUNK_ALLOC_NO_FORCE); 4889 btrfs_end_transaction(trans); 4890 if (ret > 0 || ret == -ENOSPC) 4891 ret = 0; 4892 break; 4893 case COMMIT_TRANS: 4894 ret = may_commit_transaction(fs_info, space_info, 4895 orig_bytes, 0); 4896 break; 4897 default: 4898 ret = -ENOSPC; 4899 break; 4900 } 4901 4902 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, 4903 orig_bytes, state, ret); 4904 return ret; 4905 } 4906 4907 static inline u64 4908 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4909 struct btrfs_space_info *space_info) 4910 { 4911 struct reserve_ticket *ticket; 4912 u64 used; 4913 u64 expected; 4914 u64 to_reclaim = 0; 4915 4916 list_for_each_entry(ticket, &space_info->tickets, list) 4917 to_reclaim += ticket->bytes; 4918 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4919 to_reclaim += ticket->bytes; 4920 if (to_reclaim) 4921 return to_reclaim; 4922 4923 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4924 if (can_overcommit(root, space_info, to_reclaim, 4925 BTRFS_RESERVE_FLUSH_ALL)) 4926 return 0; 4927 4928 used = space_info->bytes_used + space_info->bytes_reserved + 4929 space_info->bytes_pinned + space_info->bytes_readonly + 4930 space_info->bytes_may_use; 4931 if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 4932 expected = div_factor_fine(space_info->total_bytes, 95); 4933 else 4934 expected = div_factor_fine(space_info->total_bytes, 90); 4935 4936 if (used > expected) 4937 to_reclaim = used - expected; 4938 else 4939 to_reclaim = 0; 4940 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4941 space_info->bytes_reserved); 4942 return to_reclaim; 4943 } 4944 4945 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4946 struct btrfs_root *root, u64 used) 4947 { 4948 struct
btrfs_fs_info *fs_info = root->fs_info; 4949 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4950 4951 /* If we're just plain full then async reclaim just slows us down. */ 4952 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4953 return 0; 4954 4955 if (!btrfs_calc_reclaim_metadata_size(root, space_info)) 4956 return 0; 4957 4958 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4959 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4960 } 4961 4962 static void wake_all_tickets(struct list_head *head) 4963 { 4964 struct reserve_ticket *ticket; 4965 4966 while (!list_empty(head)) { 4967 ticket = list_first_entry(head, struct reserve_ticket, list); 4968 list_del_init(&ticket->list); 4969 ticket->error = -ENOSPC; 4970 wake_up(&ticket->wait); 4971 } 4972 } 4973 4974 /* 4975 * This is for normal flushers, we can wait all goddamned day if we want to. We 4976 * will loop and continuously try to flush as long as we are making progress. 4977 * We count progress as clearing off tickets each time we have to loop. 4978 */ 4979 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4980 { 4981 struct btrfs_fs_info *fs_info; 4982 struct btrfs_space_info *space_info; 4983 u64 to_reclaim; 4984 int flush_state; 4985 int commit_cycles = 0; 4986 u64 last_tickets_id; 4987 4988 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4989 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4990 4991 spin_lock(&space_info->lock); 4992 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4993 space_info); 4994 if (!to_reclaim) { 4995 space_info->flush = 0; 4996 spin_unlock(&space_info->lock); 4997 return; 4998 } 4999 last_tickets_id = space_info->tickets_id; 5000 spin_unlock(&space_info->lock); 5001 5002 flush_state = FLUSH_DELAYED_ITEMS_NR; 5003 do { 5004 struct reserve_ticket *ticket; 5005 int ret; 5006 5007 ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5008 flush_state); 5009 spin_lock(&space_info->lock); 5010 if (list_empty(&space_info->tickets)) { 5011 space_info->flush = 0; 5012 spin_unlock(&space_info->lock); 5013 return; 5014 } 5015 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5016 space_info); 5017 ticket = list_first_entry(&space_info->tickets, 5018 struct reserve_ticket, list); 5019 if (last_tickets_id == space_info->tickets_id) { 5020 flush_state++; 5021 } else { 5022 last_tickets_id = space_info->tickets_id; 5023 flush_state = FLUSH_DELAYED_ITEMS_NR; 5024 if (commit_cycles) 5025 commit_cycles--; 5026 } 5027 5028 if (flush_state > COMMIT_TRANS) { 5029 commit_cycles++; 5030 if (commit_cycles > 2) { 5031 wake_all_tickets(&space_info->tickets); 5032 space_info->flush = 0; 5033 } else { 5034 flush_state = FLUSH_DELAYED_ITEMS_NR; 5035 } 5036 } 5037 spin_unlock(&space_info->lock); 5038 } while (flush_state <= COMMIT_TRANS); 5039 } 5040 5041 void btrfs_init_async_reclaim_work(struct work_struct *work) 5042 { 5043 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5044 } 5045 5046 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5047 struct btrfs_space_info *space_info, 5048 struct reserve_ticket *ticket) 5049 { 5050 u64 to_reclaim; 5051 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5052 5053 spin_lock(&space_info->lock); 5054 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5055 space_info); 5056 if (!to_reclaim) { 5057 spin_unlock(&space_info->lock); 5058 return; 5059 } 5060 spin_unlock(&space_info->lock); 5061 5062 do { 5063 
flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5064 flush_state); 5065 flush_state++; 5066 spin_lock(&space_info->lock); 5067 if (ticket->bytes == 0) { 5068 spin_unlock(&space_info->lock); 5069 return; 5070 } 5071 spin_unlock(&space_info->lock); 5072 5073 /* 5074 * Priority flushers can't wait on delalloc without 5075 * deadlocking. 5076 */ 5077 if (flush_state == FLUSH_DELALLOC || 5078 flush_state == FLUSH_DELALLOC_WAIT) 5079 flush_state = ALLOC_CHUNK; 5080 } while (flush_state < COMMIT_TRANS); 5081 } 5082 5083 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5084 struct btrfs_space_info *space_info, 5085 struct reserve_ticket *ticket, u64 orig_bytes) 5086 5087 { 5088 DEFINE_WAIT(wait); 5089 int ret = 0; 5090 5091 spin_lock(&space_info->lock); 5092 while (ticket->bytes > 0 && ticket->error == 0) { 5093 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5094 if (ret) { 5095 ret = -EINTR; 5096 break; 5097 } 5098 spin_unlock(&space_info->lock); 5099 5100 schedule(); 5101 5102 finish_wait(&ticket->wait, &wait); 5103 spin_lock(&space_info->lock); 5104 } 5105 if (!ret) 5106 ret = ticket->error; 5107 if (!list_empty(&ticket->list)) 5108 list_del_init(&ticket->list); 5109 if (ticket->bytes && ticket->bytes < orig_bytes) { 5110 u64 num_bytes = orig_bytes - ticket->bytes; 5111 space_info->bytes_may_use -= num_bytes; 5112 trace_btrfs_space_reservation(fs_info, "space_info", 5113 space_info->flags, num_bytes, 0); 5114 } 5115 spin_unlock(&space_info->lock); 5116 5117 return ret; 5118 } 5119 5120 /** 5121 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5122 * @root - the root we're allocating for 5123 * @space_info - the space info we want to allocate from 5124 * @orig_bytes - the number of bytes we want 5125 * @flush - whether or not we can flush to make our reservation 5126 * 5127 * This will reserve orig_bytes number of bytes from the space info associated 5128 * with the block_rsv. If there is not enough space it will make an attempt to 5129 * flush out space to make room. It will do this by flushing delalloc if 5130 * possible or committing the transaction. If flush is 0 then no attempts to 5131 * regain reservations will be made and this will fail if there is not enough 5132 * space already. 5133 */ 5134 static int __reserve_metadata_bytes(struct btrfs_root *root, 5135 struct btrfs_space_info *space_info, 5136 u64 orig_bytes, 5137 enum btrfs_reserve_flush_enum flush) 5138 { 5139 struct btrfs_fs_info *fs_info = root->fs_info; 5140 struct reserve_ticket ticket; 5141 u64 used; 5142 int ret = 0; 5143 5144 ASSERT(orig_bytes); 5145 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5146 5147 spin_lock(&space_info->lock); 5148 ret = -ENOSPC; 5149 used = btrfs_space_info_used(space_info, true); 5150 5151 /* 5152 * If we have enough space then hooray, make our reservation and carry 5153 * on. If not see if we can overcommit, and if we can, hooray carry on. 5154 * If not things get more complicated. 
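 * "More complicated" means queueing a reserve_ticket below and either
 * waiting on it (BTRFS_RESERVE_FLUSH_ALL) or doing our own priority
 * flushing.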
5155 */ 5156 if (used + orig_bytes <= space_info->total_bytes) { 5157 space_info->bytes_may_use += orig_bytes; 5158 trace_btrfs_space_reservation(fs_info, "space_info", 5159 space_info->flags, orig_bytes, 1); 5160 ret = 0; 5161 } else if (can_overcommit(root, space_info, orig_bytes, flush)) { 5162 space_info->bytes_may_use += orig_bytes; 5163 trace_btrfs_space_reservation(fs_info, "space_info", 5164 space_info->flags, orig_bytes, 1); 5165 ret = 0; 5166 } 5167 5168 /* 5169 * If we couldn't make a reservation then set up our reservation ticket 5170 * and kick the async worker if it's not already running. 5171 * 5172 * If we are a priority flusher then we just need to add our ticket to 5173 * the list and we will do our own flushing further down. 5174 */ 5175 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5176 ticket.bytes = orig_bytes; 5177 ticket.error = 0; 5178 init_waitqueue_head(&ticket.wait); 5179 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5180 list_add_tail(&ticket.list, &space_info->tickets); 5181 if (!space_info->flush) { 5182 space_info->flush = 1; 5183 trace_btrfs_trigger_flush(fs_info, 5184 space_info->flags, 5185 orig_bytes, flush, 5186 "enospc"); 5187 queue_work(system_unbound_wq, 5188 &root->fs_info->async_reclaim_work); 5189 } 5190 } else { 5191 list_add_tail(&ticket.list, 5192 &space_info->priority_tickets); 5193 } 5194 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5195 used += orig_bytes; 5196 /* 5197 * We will do the space reservation dance during log replay, 5198 * which means we won't have fs_info->fs_root set, so don't do 5199 * the async reclaim as we will panic. 5200 */ 5201 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5202 need_do_async_reclaim(space_info, root, used) && 5203 !work_busy(&fs_info->async_reclaim_work)) { 5204 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5205 orig_bytes, flush, "preempt"); 5206 queue_work(system_unbound_wq, 5207 &fs_info->async_reclaim_work); 5208 } 5209 } 5210 spin_unlock(&space_info->lock); 5211 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5212 return ret; 5213 5214 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5215 return wait_reserve_ticket(fs_info, space_info, &ticket, 5216 orig_bytes); 5217 5218 ret = 0; 5219 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5220 spin_lock(&space_info->lock); 5221 if (ticket.bytes) { 5222 if (ticket.bytes < orig_bytes) { 5223 u64 num_bytes = orig_bytes - ticket.bytes; 5224 space_info->bytes_may_use -= num_bytes; 5225 trace_btrfs_space_reservation(fs_info, "space_info", 5226 space_info->flags, 5227 num_bytes, 0); 5228 5229 } 5230 list_del_init(&ticket.list); 5231 ret = -ENOSPC; 5232 } 5233 spin_unlock(&space_info->lock); 5234 ASSERT(list_empty(&ticket.list)); 5235 return ret; 5236 } 5237 5238 /** 5239 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5240 * @root - the root we're allocating for 5241 * @block_rsv - the block_rsv we're allocating for 5242 * @orig_bytes - the number of bytes we want 5243 * @flush - whether or not we can flush to make our reservation 5244 * 5245 * This will reserve orig_bytes number of bytes from the space info associated 5246 * with the block_rsv. If there is not enough space it will make an attempt to 5247 * flush out space to make room. It will do this by flushing delalloc if 5248 * possible or committing the transaction. If flush is 0 then no attempts to 5249 * regain reservations will be made and this will fail if there is not enough 5250 * space already.
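 * Typical caller pattern (this is what btrfs_block_rsv_add() below does):
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 *	if (!ret)
 *		block_rsv_add_bytes(block_rsv, num_bytes, 1);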
5251 */ 5252 static int reserve_metadata_bytes(struct btrfs_root *root, 5253 struct btrfs_block_rsv *block_rsv, 5254 u64 orig_bytes, 5255 enum btrfs_reserve_flush_enum flush) 5256 { 5257 struct btrfs_fs_info *fs_info = root->fs_info; 5258 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5259 int ret; 5260 5261 ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, 5262 flush); 5263 if (ret == -ENOSPC && 5264 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5265 if (block_rsv != global_rsv && 5266 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5267 ret = 0; 5268 } 5269 if (ret == -ENOSPC) 5270 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 5271 block_rsv->space_info->flags, 5272 orig_bytes, 1); 5273 return ret; 5274 } 5275 5276 static struct btrfs_block_rsv *get_block_rsv( 5277 const struct btrfs_trans_handle *trans, 5278 const struct btrfs_root *root) 5279 { 5280 struct btrfs_fs_info *fs_info = root->fs_info; 5281 struct btrfs_block_rsv *block_rsv = NULL; 5282 5283 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5284 (root == fs_info->csum_root && trans->adding_csums) || 5285 (root == fs_info->uuid_root)) 5286 block_rsv = trans->block_rsv; 5287 5288 if (!block_rsv) 5289 block_rsv = root->block_rsv; 5290 5291 if (!block_rsv) 5292 block_rsv = &fs_info->empty_block_rsv; 5293 5294 return block_rsv; 5295 } 5296 5297 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5298 u64 num_bytes) 5299 { 5300 int ret = -ENOSPC; 5301 spin_lock(&block_rsv->lock); 5302 if (block_rsv->reserved >= num_bytes) { 5303 block_rsv->reserved -= num_bytes; 5304 if (block_rsv->reserved < block_rsv->size) 5305 block_rsv->full = 0; 5306 ret = 0; 5307 } 5308 spin_unlock(&block_rsv->lock); 5309 return ret; 5310 } 5311 5312 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5313 u64 num_bytes, int update_size) 5314 { 5315 spin_lock(&block_rsv->lock); 5316 block_rsv->reserved += num_bytes; 5317 if (update_size) 5318 block_rsv->size += num_bytes; 5319 else if (block_rsv->reserved >= block_rsv->size) 5320 block_rsv->full = 1; 5321 spin_unlock(&block_rsv->lock); 5322 } 5323 5324 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5325 struct btrfs_block_rsv *dest, u64 num_bytes, 5326 int min_factor) 5327 { 5328 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5329 u64 min_bytes; 5330 5331 if (global_rsv->space_info != dest->space_info) 5332 return -ENOSPC; 5333 5334 spin_lock(&global_rsv->lock); 5335 min_bytes = div_factor(global_rsv->size, min_factor); 5336 if (global_rsv->reserved < min_bytes + num_bytes) { 5337 spin_unlock(&global_rsv->lock); 5338 return -ENOSPC; 5339 } 5340 global_rsv->reserved -= num_bytes; 5341 if (global_rsv->reserved < global_rsv->size) 5342 global_rsv->full = 0; 5343 spin_unlock(&global_rsv->lock); 5344 5345 block_rsv_add_bytes(dest, num_bytes, 1); 5346 return 0; 5347 } 5348 5349 /* 5350 * This is for space we already have accounted in space_info->bytes_may_use, so 5351 * basically when we're returning space from block_rsv's. 
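 * The bytes are first handed out to any waiting reserve tickets (priority
 * tickets first) and only the remainder is subtracted from bytes_may_use.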
5352 */ 5353 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5354 struct btrfs_space_info *space_info, 5355 u64 num_bytes) 5356 { 5357 struct reserve_ticket *ticket; 5358 struct list_head *head; 5359 u64 used; 5360 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5361 bool check_overcommit = false; 5362 5363 spin_lock(&space_info->lock); 5364 head = &space_info->priority_tickets; 5365 5366 /* 5367 * If we are over our limit then we need to check and see if we can 5368 * overcommit, and if we can't then we just need to free up our space 5369 * and not satisfy any requests. 5370 */ 5371 used = space_info->bytes_used + space_info->bytes_reserved + 5372 space_info->bytes_pinned + space_info->bytes_readonly + 5373 space_info->bytes_may_use; 5374 if (used - num_bytes >= space_info->total_bytes) 5375 check_overcommit = true; 5376 again: 5377 while (!list_empty(head) && num_bytes) { 5378 ticket = list_first_entry(head, struct reserve_ticket, 5379 list); 5380 /* 5381 * We use 0 bytes because this space is already reserved, so 5382 * adding the ticket space would be a double count. 5383 */ 5384 if (check_overcommit && 5385 !can_overcommit(fs_info->extent_root, space_info, 0, 5386 flush)) 5387 break; 5388 if (num_bytes >= ticket->bytes) { 5389 list_del_init(&ticket->list); 5390 num_bytes -= ticket->bytes; 5391 ticket->bytes = 0; 5392 space_info->tickets_id++; 5393 wake_up(&ticket->wait); 5394 } else { 5395 ticket->bytes -= num_bytes; 5396 num_bytes = 0; 5397 } 5398 } 5399 5400 if (num_bytes && head == &space_info->priority_tickets) { 5401 head = &space_info->tickets; 5402 flush = BTRFS_RESERVE_FLUSH_ALL; 5403 goto again; 5404 } 5405 space_info->bytes_may_use -= num_bytes; 5406 trace_btrfs_space_reservation(fs_info, "space_info", 5407 space_info->flags, num_bytes, 0); 5408 spin_unlock(&space_info->lock); 5409 } 5410 5411 /* 5412 * This is for newly allocated space that isn't accounted in 5413 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5414 * we use this helper. 
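 * Unlike space_info_add_old_bytes(), bytes granted to a ticket here are
 * added to bytes_may_use, since they were not accounted there before.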
5415 */ 5416 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5417 struct btrfs_space_info *space_info, 5418 u64 num_bytes) 5419 { 5420 struct reserve_ticket *ticket; 5421 struct list_head *head = &space_info->priority_tickets; 5422 5423 again: 5424 while (!list_empty(head) && num_bytes) { 5425 ticket = list_first_entry(head, struct reserve_ticket, 5426 list); 5427 if (num_bytes >= ticket->bytes) { 5428 trace_btrfs_space_reservation(fs_info, "space_info", 5429 space_info->flags, 5430 ticket->bytes, 1); 5431 list_del_init(&ticket->list); 5432 num_bytes -= ticket->bytes; 5433 space_info->bytes_may_use += ticket->bytes; 5434 ticket->bytes = 0; 5435 space_info->tickets_id++; 5436 wake_up(&ticket->wait); 5437 } else { 5438 trace_btrfs_space_reservation(fs_info, "space_info", 5439 space_info->flags, 5440 num_bytes, 1); 5441 space_info->bytes_may_use += num_bytes; 5442 ticket->bytes -= num_bytes; 5443 num_bytes = 0; 5444 } 5445 } 5446 5447 if (num_bytes && head == &space_info->priority_tickets) { 5448 head = &space_info->tickets; 5449 goto again; 5450 } 5451 } 5452 5453 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5454 struct btrfs_block_rsv *block_rsv, 5455 struct btrfs_block_rsv *dest, u64 num_bytes) 5456 { 5457 struct btrfs_space_info *space_info = block_rsv->space_info; 5458 5459 spin_lock(&block_rsv->lock); 5460 if (num_bytes == (u64)-1) 5461 num_bytes = block_rsv->size; 5462 block_rsv->size -= num_bytes; 5463 if (block_rsv->reserved >= block_rsv->size) { 5464 num_bytes = block_rsv->reserved - block_rsv->size; 5465 block_rsv->reserved = block_rsv->size; 5466 block_rsv->full = 1; 5467 } else { 5468 num_bytes = 0; 5469 } 5470 spin_unlock(&block_rsv->lock); 5471 5472 if (num_bytes > 0) { 5473 if (dest) { 5474 spin_lock(&dest->lock); 5475 if (!dest->full) { 5476 u64 bytes_to_add; 5477 5478 bytes_to_add = dest->size - dest->reserved; 5479 bytes_to_add = min(num_bytes, bytes_to_add); 5480 dest->reserved += bytes_to_add; 5481 if (dest->reserved >= dest->size) 5482 dest->full = 1; 5483 num_bytes -= bytes_to_add; 5484 } 5485 spin_unlock(&dest->lock); 5486 } 5487 if (num_bytes) 5488 space_info_add_old_bytes(fs_info, space_info, 5489 num_bytes); 5490 } 5491 } 5492 5493 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5494 struct btrfs_block_rsv *dst, u64 num_bytes, 5495 int update_size) 5496 { 5497 int ret; 5498 5499 ret = block_rsv_use_bytes(src, num_bytes); 5500 if (ret) 5501 return ret; 5502 5503 block_rsv_add_bytes(dst, num_bytes, update_size); 5504 return 0; 5505 } 5506 5507 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5508 { 5509 memset(rsv, 0, sizeof(*rsv)); 5510 spin_lock_init(&rsv->lock); 5511 rsv->type = type; 5512 } 5513 5514 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5515 unsigned short type) 5516 { 5517 struct btrfs_block_rsv *block_rsv; 5518 5519 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5520 if (!block_rsv) 5521 return NULL; 5522 5523 btrfs_init_block_rsv(block_rsv, type); 5524 block_rsv->space_info = __find_space_info(fs_info, 5525 BTRFS_BLOCK_GROUP_METADATA); 5526 return block_rsv; 5527 } 5528 5529 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 5530 struct btrfs_block_rsv *rsv) 5531 { 5532 if (!rsv) 5533 return; 5534 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5535 kfree(rsv); 5536 } 5537 5538 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) 5539 { 5540 kfree(rsv); 5541 } 5542 5543 int btrfs_block_rsv_add(struct btrfs_root *root, 5544 struct 
btrfs_block_rsv *block_rsv, u64 num_bytes, 5545 enum btrfs_reserve_flush_enum flush) 5546 { 5547 int ret; 5548 5549 if (num_bytes == 0) 5550 return 0; 5551 5552 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5553 if (!ret) { 5554 block_rsv_add_bytes(block_rsv, num_bytes, 1); 5555 return 0; 5556 } 5557 5558 return ret; 5559 } 5560 5561 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 5562 { 5563 u64 num_bytes = 0; 5564 int ret = -ENOSPC; 5565 5566 if (!block_rsv) 5567 return 0; 5568 5569 spin_lock(&block_rsv->lock); 5570 num_bytes = div_factor(block_rsv->size, min_factor); 5571 if (block_rsv->reserved >= num_bytes) 5572 ret = 0; 5573 spin_unlock(&block_rsv->lock); 5574 5575 return ret; 5576 } 5577 5578 int btrfs_block_rsv_refill(struct btrfs_root *root, 5579 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5580 enum btrfs_reserve_flush_enum flush) 5581 { 5582 u64 num_bytes = 0; 5583 int ret = -ENOSPC; 5584 5585 if (!block_rsv) 5586 return 0; 5587 5588 spin_lock(&block_rsv->lock); 5589 num_bytes = min_reserved; 5590 if (block_rsv->reserved >= num_bytes) 5591 ret = 0; 5592 else 5593 num_bytes -= block_rsv->reserved; 5594 spin_unlock(&block_rsv->lock); 5595 5596 if (!ret) 5597 return 0; 5598 5599 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5600 if (!ret) { 5601 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5602 return 0; 5603 } 5604 5605 return ret; 5606 } 5607 5608 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5609 struct btrfs_block_rsv *block_rsv, 5610 u64 num_bytes) 5611 { 5612 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5613 5614 if (global_rsv == block_rsv || 5615 block_rsv->space_info != global_rsv->space_info) 5616 global_rsv = NULL; 5617 block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes); 5618 } 5619 5620 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5621 { 5622 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5623 struct btrfs_space_info *sinfo = block_rsv->space_info; 5624 u64 num_bytes; 5625 5626 /* 5627 * The global block rsv is based on the size of the extent tree, the 5628 * checksum tree and the root tree. If the fs is empty we want to set 5629 * it to a minimal amount for safety. 
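 * In practice that means the size is clamped between SZ_16M and SZ_512M
 * below.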
5630 */ 5631 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5632 btrfs_root_used(&fs_info->csum_root->root_item) + 5633 btrfs_root_used(&fs_info->tree_root->root_item); 5634 num_bytes = max_t(u64, num_bytes, SZ_16M); 5635 5636 spin_lock(&sinfo->lock); 5637 spin_lock(&block_rsv->lock); 5638 5639 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5640 5641 if (block_rsv->reserved < block_rsv->size) { 5642 num_bytes = btrfs_space_info_used(sinfo, true); 5643 if (sinfo->total_bytes > num_bytes) { 5644 num_bytes = sinfo->total_bytes - num_bytes; 5645 num_bytes = min(num_bytes, 5646 block_rsv->size - block_rsv->reserved); 5647 block_rsv->reserved += num_bytes; 5648 sinfo->bytes_may_use += num_bytes; 5649 trace_btrfs_space_reservation(fs_info, "space_info", 5650 sinfo->flags, num_bytes, 5651 1); 5652 } 5653 } else if (block_rsv->reserved > block_rsv->size) { 5654 num_bytes = block_rsv->reserved - block_rsv->size; 5655 sinfo->bytes_may_use -= num_bytes; 5656 trace_btrfs_space_reservation(fs_info, "space_info", 5657 sinfo->flags, num_bytes, 0); 5658 block_rsv->reserved = block_rsv->size; 5659 } 5660 5661 if (block_rsv->reserved == block_rsv->size) 5662 block_rsv->full = 1; 5663 else 5664 block_rsv->full = 0; 5665 5666 spin_unlock(&block_rsv->lock); 5667 spin_unlock(&sinfo->lock); 5668 } 5669 5670 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5671 { 5672 struct btrfs_space_info *space_info; 5673 5674 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5675 fs_info->chunk_block_rsv.space_info = space_info; 5676 5677 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5678 fs_info->global_block_rsv.space_info = space_info; 5679 fs_info->delalloc_block_rsv.space_info = space_info; 5680 fs_info->trans_block_rsv.space_info = space_info; 5681 fs_info->empty_block_rsv.space_info = space_info; 5682 fs_info->delayed_block_rsv.space_info = space_info; 5683 5684 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 5685 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 5686 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5687 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5688 if (fs_info->quota_root) 5689 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5690 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5691 5692 update_global_block_rsv(fs_info); 5693 } 5694 5695 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5696 { 5697 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5698 (u64)-1); 5699 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 5700 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 5701 WARN_ON(fs_info->trans_block_rsv.size > 0); 5702 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5703 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5704 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5705 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5706 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5707 } 5708 5709 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5710 struct btrfs_fs_info *fs_info) 5711 { 5712 if (!trans->block_rsv) 5713 return; 5714 5715 if (!trans->bytes_reserved) 5716 return; 5717 5718 trace_btrfs_space_reservation(fs_info, "transaction", 5719 trans->transid, trans->bytes_reserved, 0); 5720 btrfs_block_rsv_release(fs_info, trans->block_rsv, 5721 trans->bytes_reserved); 5722 trans->bytes_reserved = 0; 5723 } 5724 5725 /* 5726 * To be called after all the new block groups attached to the transaction 5727 * 
handle have been created (btrfs_create_pending_block_groups()). 5728 */ 5729 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5730 { 5731 struct btrfs_fs_info *fs_info = trans->fs_info; 5732 5733 if (!trans->chunk_bytes_reserved) 5734 return; 5735 5736 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5737 5738 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5739 trans->chunk_bytes_reserved); 5740 trans->chunk_bytes_reserved = 0; 5741 } 5742 5743 /* Can only return 0 or -ENOSPC */ 5744 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5745 struct inode *inode) 5746 { 5747 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5748 struct btrfs_root *root = BTRFS_I(inode)->root; 5749 /* 5750 * We always use trans->block_rsv here as we will have reserved space 5751 * for our orphan when starting the transaction, using get_block_rsv() 5752 * here will sometimes make us choose the wrong block rsv as we could be 5753 * doing a reloc inode for a non refcounted root. 5754 */ 5755 struct btrfs_block_rsv *src_rsv = trans->block_rsv; 5756 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 5757 5758 /* 5759 * We need to hold space in order to delete our orphan item once we've 5760 * added it, so this takes the reservation so we can release it later 5761 * when we are truly done with the orphan item. 5762 */ 5763 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5764 5765 trace_btrfs_space_reservation(fs_info, "orphan", 5766 btrfs_ino(BTRFS_I(inode)), num_bytes, 1); 5767 return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 5768 } 5769 5770 void btrfs_orphan_release_metadata(struct inode *inode) 5771 { 5772 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5773 struct btrfs_root *root = BTRFS_I(inode)->root; 5774 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5775 5776 trace_btrfs_space_reservation(fs_info, "orphan", 5777 btrfs_ino(BTRFS_I(inode)), num_bytes, 0); 5778 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); 5779 } 5780 5781 /* 5782 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5783 * root: the root of the parent directory 5784 * rsv: block reservation 5785 * items: the number of items that we need to reserve space for 5786 * qgroup_reserved: used to return the reserved size in qgroup 5787 * 5788 * This function is used to reserve the space for snapshot/subvolume 5789 * creation and deletion. Those operations differ from common file/directory 5790 * operations: they change two fs/file trees and the root tree, and the 5791 * number of items that the qgroup reserves differs from the free space 5792 * reservation, so we cannot use 5793 * the space reservation mechanism in start_transaction().
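 * When quotas are enabled this also reserves qgroup metadata up front: one
 * nodesize for the parent inode plus two for the directory entries, as done
 * below.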
5794 */ 5795 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 5796 struct btrfs_block_rsv *rsv, 5797 int items, 5798 u64 *qgroup_reserved, 5799 bool use_global_rsv) 5800 { 5801 u64 num_bytes; 5802 int ret; 5803 struct btrfs_fs_info *fs_info = root->fs_info; 5804 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5805 5806 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5807 /* One for parent inode, two for dir entries */ 5808 num_bytes = 3 * fs_info->nodesize; 5809 ret = btrfs_qgroup_reserve_meta(root, num_bytes, true); 5810 if (ret) 5811 return ret; 5812 } else { 5813 num_bytes = 0; 5814 } 5815 5816 *qgroup_reserved = num_bytes; 5817 5818 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 5819 rsv->space_info = __find_space_info(fs_info, 5820 BTRFS_BLOCK_GROUP_METADATA); 5821 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5822 BTRFS_RESERVE_FLUSH_ALL); 5823 5824 if (ret == -ENOSPC && use_global_rsv) 5825 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); 5826 5827 if (ret && *qgroup_reserved) 5828 btrfs_qgroup_free_meta(root, *qgroup_reserved); 5829 5830 return ret; 5831 } 5832 5833 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 5834 struct btrfs_block_rsv *rsv) 5835 { 5836 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5837 } 5838 5839 /** 5840 * drop_outstanding_extent - drop an outstanding extent 5841 * @inode: the inode we're dropping the extent for 5842 * @num_bytes: the number of bytes we're releasing. 5843 * 5844 * This is called when we are freeing up an outstanding extent, either called 5845 * after an error or after an extent is written. This will return the number of 5846 * reserved extents that need to be freed. This must be called with 5847 * BTRFS_I(inode)->lock held. 5848 */ 5849 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) 5850 { 5851 unsigned drop_inode_space = 0; 5852 unsigned dropped_extents = 0; 5853 unsigned num_extents; 5854 5855 num_extents = count_max_extents(num_bytes); 5856 ASSERT(num_extents); 5857 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); 5858 BTRFS_I(inode)->outstanding_extents -= num_extents; 5859 5860 if (BTRFS_I(inode)->outstanding_extents == 0 && 5861 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5862 &BTRFS_I(inode)->runtime_flags)) 5863 drop_inode_space = 1; 5864 5865 /* 5866 * If we have more or the same amount of outstanding extents than we have 5867 * reserved then we need to leave the reserved extents count alone. 5868 */ 5869 if (BTRFS_I(inode)->outstanding_extents >= 5870 BTRFS_I(inode)->reserved_extents) 5871 return drop_inode_space; 5872 5873 dropped_extents = BTRFS_I(inode)->reserved_extents - 5874 BTRFS_I(inode)->outstanding_extents; 5875 BTRFS_I(inode)->reserved_extents -= dropped_extents; 5876 return dropped_extents + drop_inode_space; 5877 } 5878 5879 /** 5880 * calc_csum_metadata_size - return the amount of metadata space that must be 5881 * reserved/freed for the given bytes. 5882 * @inode: the inode we're manipulating 5883 * @num_bytes: the number of bytes in question 5884 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5885 * 5886 * This adjusts the number of csum_bytes in the inode and then returns the 5887 * correct amount of metadata that must either be reserved or freed. We 5888 * calculate how many checksums we can fit into one leaf and then divide the 5889 * number of bytes that will need to be checksumed by this value to figure out 5890 * how many checksums will be required. 
If we are adding bytes then the number 5891 * may go up and we will return the number of additional bytes that must be 5892 * reserved. If it is going down we will return the number of bytes that must 5893 * be freed. 5894 * 5895 * This must be called with BTRFS_I(inode)->lock held. 5896 */ 5897 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, 5898 int reserve) 5899 { 5900 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5901 u64 old_csums, num_csums; 5902 5903 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5904 BTRFS_I(inode)->csum_bytes == 0) 5905 return 0; 5906 5907 old_csums = btrfs_csum_bytes_to_leaves(fs_info, 5908 BTRFS_I(inode)->csum_bytes); 5909 if (reserve) 5910 BTRFS_I(inode)->csum_bytes += num_bytes; 5911 else 5912 BTRFS_I(inode)->csum_bytes -= num_bytes; 5913 num_csums = btrfs_csum_bytes_to_leaves(fs_info, 5914 BTRFS_I(inode)->csum_bytes); 5915 5916 /* No change, no need to reserve more */ 5917 if (old_csums == num_csums) 5918 return 0; 5919 5920 if (reserve) 5921 return btrfs_calc_trans_metadata_size(fs_info, 5922 num_csums - old_csums); 5923 5924 return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); 5925 } 5926 5927 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 5928 { 5929 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 5930 struct btrfs_root *root = BTRFS_I(inode)->root; 5931 struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; 5932 u64 to_reserve = 0; 5933 u64 csum_bytes; 5934 unsigned nr_extents; 5935 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5936 int ret = 0; 5937 bool delalloc_lock = true; 5938 u64 to_free = 0; 5939 unsigned dropped; 5940 bool release_extra = false; 5941 5942 /* If we are a free space inode we need to not flush since we will be in 5943 * the middle of a transaction commit. We also don't need the delalloc 5944 * mutex since we won't race with anybody. We need this mostly to make 5945 * lockdep shut its filthy mouth. 5946 * 5947 * If we have a transaction open (can happen if we call truncate_block 5948 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 5949 */ 5950 if (btrfs_is_free_space_inode(inode)) { 5951 flush = BTRFS_RESERVE_NO_FLUSH; 5952 delalloc_lock = false; 5953 } else if (current->journal_info) { 5954 flush = BTRFS_RESERVE_FLUSH_LIMIT; 5955 } 5956 5957 if (flush != BTRFS_RESERVE_NO_FLUSH && 5958 btrfs_transaction_in_commit(fs_info)) 5959 schedule_timeout(1); 5960 5961 if (delalloc_lock) 5962 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 5963 5964 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 5965 5966 spin_lock(&BTRFS_I(inode)->lock); 5967 nr_extents = count_max_extents(num_bytes); 5968 BTRFS_I(inode)->outstanding_extents += nr_extents; 5969 5970 nr_extents = 0; 5971 if (BTRFS_I(inode)->outstanding_extents > 5972 BTRFS_I(inode)->reserved_extents) 5973 nr_extents += BTRFS_I(inode)->outstanding_extents - 5974 BTRFS_I(inode)->reserved_extents; 5975 5976 /* We always want to reserve a slot for updating the inode. 
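 * That is why btrfs_calc_trans_metadata_size() below is passed
 * nr_extents + 1: one unit for every outstanding extent we have not yet
 * reserved for, plus one for the inode item itself, with the csum space
 * from calc_csum_metadata_size() added on top.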
*/ 5977 to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); 5978 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5979 csum_bytes = BTRFS_I(inode)->csum_bytes; 5980 spin_unlock(&BTRFS_I(inode)->lock); 5981 5982 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5983 ret = btrfs_qgroup_reserve_meta(root, 5984 nr_extents * fs_info->nodesize, true); 5985 if (ret) 5986 goto out_fail; 5987 } 5988 5989 ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); 5990 if (unlikely(ret)) { 5991 btrfs_qgroup_free_meta(root, 5992 nr_extents * fs_info->nodesize); 5993 goto out_fail; 5994 } 5995 5996 spin_lock(&BTRFS_I(inode)->lock); 5997 if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5998 &BTRFS_I(inode)->runtime_flags)) { 5999 to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); 6000 release_extra = true; 6001 } 6002 BTRFS_I(inode)->reserved_extents += nr_extents; 6003 spin_unlock(&BTRFS_I(inode)->lock); 6004 6005 if (delalloc_lock) 6006 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 6007 6008 if (to_reserve) 6009 trace_btrfs_space_reservation(fs_info, "delalloc", 6010 btrfs_ino(BTRFS_I(inode)), to_reserve, 1); 6011 if (release_extra) 6012 btrfs_block_rsv_release(fs_info, block_rsv, 6013 btrfs_calc_trans_metadata_size(fs_info, 1)); 6014 return 0; 6015 6016 out_fail: 6017 spin_lock(&BTRFS_I(inode)->lock); 6018 dropped = drop_outstanding_extent(inode, num_bytes); 6019 /* 6020 * If the inodes csum_bytes is the same as the original 6021 * csum_bytes then we know we haven't raced with any free()ers 6022 * so we can just reduce our inodes csum bytes and carry on. 6023 */ 6024 if (BTRFS_I(inode)->csum_bytes == csum_bytes) { 6025 calc_csum_metadata_size(inode, num_bytes, 0); 6026 } else { 6027 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; 6028 u64 bytes; 6029 6030 /* 6031 * This is tricky, but first we need to figure out how much we 6032 * freed from any free-ers that occurred during this 6033 * reservation, so we reset ->csum_bytes to the csum_bytes 6034 * before we dropped our lock, and then call the free for the 6035 * number of bytes that were freed while we were trying our 6036 * reservation. 6037 */ 6038 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; 6039 BTRFS_I(inode)->csum_bytes = csum_bytes; 6040 to_free = calc_csum_metadata_size(inode, bytes, 0); 6041 6042 6043 /* 6044 * Now we need to see how much we would have freed had we not 6045 * been making this reservation and our ->csum_bytes were not 6046 * artificially inflated. 6047 */ 6048 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; 6049 bytes = csum_bytes - orig_csum_bytes; 6050 bytes = calc_csum_metadata_size(inode, bytes, 0); 6051 6052 /* 6053 * Now reset ->csum_bytes to what it should be. If bytes is 6054 * more than to_free then we would have freed more space had we 6055 * not had an artificially high ->csum_bytes, so we need to free 6056 * the remainder. If bytes is the same or less then we don't 6057 * need to do anything, the other free-ers did the correct 6058 * thing. 
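	 * Put differently: 'to_free' above is what the racing free()ers
	 * released against our artificially inflated csum_bytes, while
	 * 'bytes' is what they would have released had our num_bytes never
	 * been added; if there is a shortfall we release it below.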
6059 */ 6060 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; 6061 if (bytes > to_free) 6062 to_free = bytes - to_free; 6063 else 6064 to_free = 0; 6065 } 6066 spin_unlock(&BTRFS_I(inode)->lock); 6067 if (dropped) 6068 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); 6069 6070 if (to_free) { 6071 btrfs_block_rsv_release(fs_info, block_rsv, to_free); 6072 trace_btrfs_space_reservation(fs_info, "delalloc", 6073 btrfs_ino(BTRFS_I(inode)), to_free, 0); 6074 } 6075 if (delalloc_lock) 6076 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 6077 return ret; 6078 } 6079 6080 /** 6081 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode 6082 * @inode: the inode to release the reservation for 6083 * @num_bytes: the number of bytes we're releasing 6084 * 6085 * This will release the metadata reservation for an inode. This can be called 6086 * once we complete IO for a given set of bytes to release their metadata 6087 * reservations. 6088 */ 6089 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 6090 { 6091 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 6092 u64 to_free = 0; 6093 unsigned dropped; 6094 6095 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 6096 spin_lock(&BTRFS_I(inode)->lock); 6097 dropped = drop_outstanding_extent(inode, num_bytes); 6098 6099 if (num_bytes) 6100 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 6101 spin_unlock(&BTRFS_I(inode)->lock); 6102 if (dropped > 0) 6103 to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); 6104 6105 if (btrfs_is_testing(fs_info)) 6106 return; 6107 6108 trace_btrfs_space_reservation(fs_info, "delalloc", 6109 btrfs_ino(BTRFS_I(inode)), to_free, 0); 6110 6111 btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); 6112 } 6113 6114 /** 6115 * btrfs_delalloc_reserve_space - reserve data and metadata space for 6116 * delalloc 6117 * @inode: inode we're writing to 6118 * @start: start of the range we are writing to 6119 * @len: length of the range we are writing 6120 * 6121 * This will do the following things 6122 * 6123 * o reserve space in the data space info for num bytes 6124 * and reserve the corresponding qgroup space 6125 * (Done in check_data_free_space) 6126 * 6127 * o reserve space for the metadata, based on the number of outstanding 6128 * extents and how many csums will be needed; 6129 * also reserve metadata space in a per-root over-reserve method. 6130 * o add to the inode's delalloc_bytes 6131 * o add it to the fs_info's delalloc inodes list. 6132 * (Above 3 all done in delalloc_reserve_metadata) 6133 * 6134 * Return 0 for success 6135 * Return <0 for error (-ENOSPC or -EDQUOT) 6136 */ 6137 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) 6138 { 6139 int ret; 6140 6141 ret = btrfs_check_data_free_space(inode, start, len); 6142 if (ret < 0) 6143 return ret; 6144 ret = btrfs_delalloc_reserve_metadata(inode, len); 6145 if (ret < 0) 6146 btrfs_free_reserved_data_space(inode, start, len); 6147 return ret; 6148 } 6149 6150 /** 6151 * btrfs_delalloc_release_space - release data and metadata space for delalloc 6152 * @inode: inode we're releasing space for 6153 * @start: start position of the space already reserved 6154 * @len: length of the space already reserved 6155 * 6156 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 6157 * called in the case that we don't need the metadata AND data reservations 6158 * anymore, for example after an error or when the data is stored as an inline extent.
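 *
 * A hypothetical write-path sketch, only to show how the two calls pair up
 * (the copy step in the middle is made up):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, pos, count);
 *	if (ret)
 *		return ret;
 *	ret = copy_and_dirty_pages(inode, pos, count);	(made-up step)
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, pos, count);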
6159 * 6160 * This function will release the metadata space that was not used and will 6161 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6162 * list if there are no delalloc bytes left. 6163 * Also it will handle the qgroup reserved space. 6164 */ 6165 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) 6166 { 6167 btrfs_delalloc_release_metadata(inode, len); 6168 btrfs_free_reserved_data_space(inode, start, len); 6169 } 6170 6171 static int update_block_group(struct btrfs_trans_handle *trans, 6172 struct btrfs_fs_info *info, u64 bytenr, 6173 u64 num_bytes, int alloc) 6174 { 6175 struct btrfs_block_group_cache *cache = NULL; 6176 u64 total = num_bytes; 6177 u64 old_val; 6178 u64 byte_in_group; 6179 int factor; 6180 6181 /* block accounting for super block */ 6182 spin_lock(&info->delalloc_root_lock); 6183 old_val = btrfs_super_bytes_used(info->super_copy); 6184 if (alloc) 6185 old_val += num_bytes; 6186 else 6187 old_val -= num_bytes; 6188 btrfs_set_super_bytes_used(info->super_copy, old_val); 6189 spin_unlock(&info->delalloc_root_lock); 6190 6191 while (total) { 6192 cache = btrfs_lookup_block_group(info, bytenr); 6193 if (!cache) 6194 return -ENOENT; 6195 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 6196 BTRFS_BLOCK_GROUP_RAID1 | 6197 BTRFS_BLOCK_GROUP_RAID10)) 6198 factor = 2; 6199 else 6200 factor = 1; 6201 /* 6202 * If this block group has free space cache written out, we 6203 * need to make sure to load it if we are removing space. This 6204 * is because we need the unpinning stage to actually add the 6205 * space back to the block group, otherwise we will leak space. 6206 */ 6207 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6208 cache_block_group(cache, 1); 6209 6210 byte_in_group = bytenr - cache->key.objectid; 6211 WARN_ON(byte_in_group > cache->key.offset); 6212 6213 spin_lock(&cache->space_info->lock); 6214 spin_lock(&cache->lock); 6215 6216 if (btrfs_test_opt(info, SPACE_CACHE) && 6217 cache->disk_cache_state < BTRFS_DC_CLEAR) 6218 cache->disk_cache_state = BTRFS_DC_CLEAR; 6219 6220 old_val = btrfs_block_group_used(&cache->item); 6221 num_bytes = min(total, cache->key.offset - byte_in_group); 6222 if (alloc) { 6223 old_val += num_bytes; 6224 btrfs_set_block_group_used(&cache->item, old_val); 6225 cache->reserved -= num_bytes; 6226 cache->space_info->bytes_reserved -= num_bytes; 6227 cache->space_info->bytes_used += num_bytes; 6228 cache->space_info->disk_used += num_bytes * factor; 6229 spin_unlock(&cache->lock); 6230 spin_unlock(&cache->space_info->lock); 6231 } else { 6232 old_val -= num_bytes; 6233 btrfs_set_block_group_used(&cache->item, old_val); 6234 cache->pinned += num_bytes; 6235 cache->space_info->bytes_pinned += num_bytes; 6236 cache->space_info->bytes_used -= num_bytes; 6237 cache->space_info->disk_used -= num_bytes * factor; 6238 spin_unlock(&cache->lock); 6239 spin_unlock(&cache->space_info->lock); 6240 6241 trace_btrfs_space_reservation(info, "pinned", 6242 cache->space_info->flags, 6243 num_bytes, 1); 6244 set_extent_dirty(info->pinned_extents, 6245 bytenr, bytenr + num_bytes - 1, 6246 GFP_NOFS | __GFP_NOFAIL); 6247 } 6248 6249 spin_lock(&trans->transaction->dirty_bgs_lock); 6250 if (list_empty(&cache->dirty_list)) { 6251 list_add_tail(&cache->dirty_list, 6252 &trans->transaction->dirty_bgs); 6253 trans->transaction->num_dirty_bgs++; 6254 btrfs_get_block_group(cache); 6255 } 6256 spin_unlock(&trans->transaction->dirty_bgs_lock); 6257 6258 /* 6259 * No longer have used bytes in this block group, queue it for 
6260 * deletion. We do this after adding the block group to the 6261 * dirty list to avoid races between cleaner kthread and space 6262 * cache writeout. 6263 */ 6264 if (!alloc && old_val == 0) { 6265 spin_lock(&info->unused_bgs_lock); 6266 if (list_empty(&cache->bg_list)) { 6267 btrfs_get_block_group(cache); 6268 list_add_tail(&cache->bg_list, 6269 &info->unused_bgs); 6270 } 6271 spin_unlock(&info->unused_bgs_lock); 6272 } 6273 6274 btrfs_put_block_group(cache); 6275 total -= num_bytes; 6276 bytenr += num_bytes; 6277 } 6278 return 0; 6279 } 6280 6281 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6282 { 6283 struct btrfs_block_group_cache *cache; 6284 u64 bytenr; 6285 6286 spin_lock(&fs_info->block_group_cache_lock); 6287 bytenr = fs_info->first_logical_byte; 6288 spin_unlock(&fs_info->block_group_cache_lock); 6289 6290 if (bytenr < (u64)-1) 6291 return bytenr; 6292 6293 cache = btrfs_lookup_first_block_group(fs_info, search_start); 6294 if (!cache) 6295 return 0; 6296 6297 bytenr = cache->key.objectid; 6298 btrfs_put_block_group(cache); 6299 6300 return bytenr; 6301 } 6302 6303 static int pin_down_extent(struct btrfs_fs_info *fs_info, 6304 struct btrfs_block_group_cache *cache, 6305 u64 bytenr, u64 num_bytes, int reserved) 6306 { 6307 spin_lock(&cache->space_info->lock); 6308 spin_lock(&cache->lock); 6309 cache->pinned += num_bytes; 6310 cache->space_info->bytes_pinned += num_bytes; 6311 if (reserved) { 6312 cache->reserved -= num_bytes; 6313 cache->space_info->bytes_reserved -= num_bytes; 6314 } 6315 spin_unlock(&cache->lock); 6316 spin_unlock(&cache->space_info->lock); 6317 6318 trace_btrfs_space_reservation(fs_info, "pinned", 6319 cache->space_info->flags, num_bytes, 1); 6320 set_extent_dirty(fs_info->pinned_extents, bytenr, 6321 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6322 return 0; 6323 } 6324 6325 /* 6326 * this function must be called within transaction 6327 */ 6328 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 6329 u64 bytenr, u64 num_bytes, int reserved) 6330 { 6331 struct btrfs_block_group_cache *cache; 6332 6333 cache = btrfs_lookup_block_group(fs_info, bytenr); 6334 BUG_ON(!cache); /* Logic error */ 6335 6336 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); 6337 6338 btrfs_put_block_group(cache); 6339 return 0; 6340 } 6341 6342 /* 6343 * this function must be called within transaction 6344 */ 6345 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 6346 u64 bytenr, u64 num_bytes) 6347 { 6348 struct btrfs_block_group_cache *cache; 6349 int ret; 6350 6351 cache = btrfs_lookup_block_group(fs_info, bytenr); 6352 if (!cache) 6353 return -EINVAL; 6354 6355 /* 6356 * pull in the free space cache (if any) so that our pin 6357 * removes the free space from the cache. We have load_only set 6358 * to one because the slow code to read in the free extents does check 6359 * the pinned extents. 
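	 * In other words we only need the fast path here: if the on-disk
	 * free space cache cannot be loaded, the slow caching code will
	 * notice the pinned range on its own.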
6360 */ 6361 cache_block_group(cache, 1); 6362 6363 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); 6364 6365 /* remove us from the free space cache (if we're there at all) */ 6366 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6367 btrfs_put_block_group(cache); 6368 return ret; 6369 } 6370 6371 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 6372 u64 start, u64 num_bytes) 6373 { 6374 int ret; 6375 struct btrfs_block_group_cache *block_group; 6376 struct btrfs_caching_control *caching_ctl; 6377 6378 block_group = btrfs_lookup_block_group(fs_info, start); 6379 if (!block_group) 6380 return -EINVAL; 6381 6382 cache_block_group(block_group, 0); 6383 caching_ctl = get_caching_control(block_group); 6384 6385 if (!caching_ctl) { 6386 /* Logic error */ 6387 BUG_ON(!block_group_cache_done(block_group)); 6388 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6389 } else { 6390 mutex_lock(&caching_ctl->mutex); 6391 6392 if (start >= caching_ctl->progress) { 6393 ret = add_excluded_extent(fs_info, start, num_bytes); 6394 } else if (start + num_bytes <= caching_ctl->progress) { 6395 ret = btrfs_remove_free_space(block_group, 6396 start, num_bytes); 6397 } else { 6398 num_bytes = caching_ctl->progress - start; 6399 ret = btrfs_remove_free_space(block_group, 6400 start, num_bytes); 6401 if (ret) 6402 goto out_lock; 6403 6404 num_bytes = (start + num_bytes) - 6405 caching_ctl->progress; 6406 start = caching_ctl->progress; 6407 ret = add_excluded_extent(fs_info, start, num_bytes); 6408 } 6409 out_lock: 6410 mutex_unlock(&caching_ctl->mutex); 6411 put_caching_control(caching_ctl); 6412 } 6413 btrfs_put_block_group(block_group); 6414 return ret; 6415 } 6416 6417 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, 6418 struct extent_buffer *eb) 6419 { 6420 struct btrfs_file_extent_item *item; 6421 struct btrfs_key key; 6422 int found_type; 6423 int i; 6424 6425 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 6426 return 0; 6427 6428 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6429 btrfs_item_key_to_cpu(eb, &key, i); 6430 if (key.type != BTRFS_EXTENT_DATA_KEY) 6431 continue; 6432 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6433 found_type = btrfs_file_extent_type(eb, item); 6434 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6435 continue; 6436 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6437 continue; 6438 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6439 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6440 __exclude_logged_extent(fs_info, key.objectid, key.offset); 6441 } 6442 6443 return 0; 6444 } 6445 6446 static void 6447 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6448 { 6449 atomic_inc(&bg->reservations); 6450 } 6451 6452 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6453 const u64 start) 6454 { 6455 struct btrfs_block_group_cache *bg; 6456 6457 bg = btrfs_lookup_block_group(fs_info, start); 6458 ASSERT(bg); 6459 if (atomic_dec_and_test(&bg->reservations)) 6460 wake_up_atomic_t(&bg->reservations); 6461 btrfs_put_block_group(bg); 6462 } 6463 6464 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) 6465 { 6466 schedule(); 6467 return 0; 6468 } 6469 6470 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6471 { 6472 struct btrfs_space_info *space_info = bg->space_info; 6473 6474 ASSERT(bg->ro); 6475 6476 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6477 return; 6478 6479 /* 6480 * Our block group is read only but before 
we set it to read only, 6481 * some task might have allocated an extent from it already, but it 6482 * has not yet created a respective ordered extent (and added it to a 6483 * root's list of ordered extents). 6484 * Therefore wait for any task currently allocating extents, since the 6485 * block group's reservations counter is incremented while a read lock 6486 * on the groups' semaphore is held and decremented after releasing 6487 * the read access on that semaphore and creating the ordered extent. 6488 */ 6489 down_write(&space_info->groups_sem); 6490 up_write(&space_info->groups_sem); 6491 6492 wait_on_atomic_t(&bg->reservations, 6493 btrfs_wait_bg_reservations_atomic_t, 6494 TASK_UNINTERRUPTIBLE); 6495 } 6496 6497 /** 6498 * btrfs_add_reserved_bytes - update the block_group and space info counters 6499 * @cache: The cache we are manipulating 6500 * @ram_bytes: The number of bytes of file content, and will be the same as 6501 * @num_bytes except for the compression path. 6502 * @num_bytes: The number of bytes in question 6503 * @delalloc: The blocks are allocated for the delalloc write 6504 * 6505 * This is called by the allocator when it reserves space. If the block group 6506 * has become read only we cannot make the reservation and return -EAGAIN; 6507 * otherwise this function always succeeds. 6508 */ 6509 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, 6510 u64 ram_bytes, u64 num_bytes, int delalloc) 6511 { 6512 struct btrfs_space_info *space_info = cache->space_info; 6513 int ret = 0; 6514 6515 spin_lock(&space_info->lock); 6516 spin_lock(&cache->lock); 6517 if (cache->ro) { 6518 ret = -EAGAIN; 6519 } else { 6520 cache->reserved += num_bytes; 6521 space_info->bytes_reserved += num_bytes; 6522 6523 trace_btrfs_space_reservation(cache->fs_info, 6524 "space_info", space_info->flags, 6525 ram_bytes, 0); 6526 space_info->bytes_may_use -= ram_bytes; 6527 if (delalloc) 6528 cache->delalloc_bytes += num_bytes; 6529 } 6530 spin_unlock(&cache->lock); 6531 spin_unlock(&space_info->lock); 6532 return ret; 6533 } 6534 6535 /** 6536 * btrfs_free_reserved_bytes - update the block_group and space info counters 6537 * @cache: The cache we are manipulating 6538 * @num_bytes: The number of bytes in question 6539 * @delalloc: The blocks are allocated for the delalloc write 6540 * 6541 * This is called by somebody who is freeing space that was never actually used 6542 * on disk. For example if you reserve some space for a new leaf in transaction 6543 * A and before transaction A commits you free that leaf, you call this to 6544 * clear the reservation.
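 *
 * This is the counterpart of btrfs_add_reserved_bytes() above: it removes
 * @num_bytes from both cache->reserved and the space_info's bytes_reserved,
 * and if the block group has gone read only in the meantime the bytes are
 * additionally accounted as readonly.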
6545 */ 6546 6547 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 6548 u64 num_bytes, int delalloc) 6549 { 6550 struct btrfs_space_info *space_info = cache->space_info; 6551 int ret = 0; 6552 6553 spin_lock(&space_info->lock); 6554 spin_lock(&cache->lock); 6555 if (cache->ro) 6556 space_info->bytes_readonly += num_bytes; 6557 cache->reserved -= num_bytes; 6558 space_info->bytes_reserved -= num_bytes; 6559 6560 if (delalloc) 6561 cache->delalloc_bytes -= num_bytes; 6562 spin_unlock(&cache->lock); 6563 spin_unlock(&space_info->lock); 6564 return ret; 6565 } 6566 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 6567 { 6568 struct btrfs_caching_control *next; 6569 struct btrfs_caching_control *caching_ctl; 6570 struct btrfs_block_group_cache *cache; 6571 6572 down_write(&fs_info->commit_root_sem); 6573 6574 list_for_each_entry_safe(caching_ctl, next, 6575 &fs_info->caching_block_groups, list) { 6576 cache = caching_ctl->block_group; 6577 if (block_group_cache_done(cache)) { 6578 cache->last_byte_to_unpin = (u64)-1; 6579 list_del_init(&caching_ctl->list); 6580 put_caching_control(caching_ctl); 6581 } else { 6582 cache->last_byte_to_unpin = caching_ctl->progress; 6583 } 6584 } 6585 6586 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6587 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6588 else 6589 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6590 6591 up_write(&fs_info->commit_root_sem); 6592 6593 update_global_block_rsv(fs_info); 6594 } 6595 6596 /* 6597 * Returns the free cluster for the given space info and sets empty_cluster to 6598 * what it should be based on the mount options. 6599 */ 6600 static struct btrfs_free_cluster * 6601 fetch_cluster_info(struct btrfs_fs_info *fs_info, 6602 struct btrfs_space_info *space_info, u64 *empty_cluster) 6603 { 6604 struct btrfs_free_cluster *ret = NULL; 6605 bool ssd = btrfs_test_opt(fs_info, SSD); 6606 6607 *empty_cluster = 0; 6608 if (btrfs_mixed_space_info(space_info)) 6609 return ret; 6610 6611 if (ssd) 6612 *empty_cluster = SZ_2M; 6613 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6614 ret = &fs_info->meta_alloc_cluster; 6615 if (!ssd) 6616 *empty_cluster = SZ_64K; 6617 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { 6618 ret = &fs_info->data_alloc_cluster; 6619 } 6620 6621 return ret; 6622 } 6623 6624 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 6625 u64 start, u64 end, 6626 const bool return_free_space) 6627 { 6628 struct btrfs_block_group_cache *cache = NULL; 6629 struct btrfs_space_info *space_info; 6630 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6631 struct btrfs_free_cluster *cluster = NULL; 6632 u64 len; 6633 u64 total_unpinned = 0; 6634 u64 empty_cluster = 0; 6635 bool readonly; 6636 6637 while (start <= end) { 6638 readonly = false; 6639 if (!cache || 6640 start >= cache->key.objectid + cache->key.offset) { 6641 if (cache) 6642 btrfs_put_block_group(cache); 6643 total_unpinned = 0; 6644 cache = btrfs_lookup_block_group(fs_info, start); 6645 BUG_ON(!cache); /* Logic error */ 6646 6647 cluster = fetch_cluster_info(fs_info, 6648 cache->space_info, 6649 &empty_cluster); 6650 empty_cluster <<= 1; 6651 } 6652 6653 len = cache->key.objectid + cache->key.offset - start; 6654 len = min(len, end + 1 - start); 6655 6656 if (start < cache->last_byte_to_unpin) { 6657 len = min(len, cache->last_byte_to_unpin - start); 6658 if (return_free_space) 6659 btrfs_add_free_space(cache, start, len); 6660 } 6661 6662 start += 
len; 6663 total_unpinned += len; 6664 space_info = cache->space_info; 6665 6666 /* 6667 * If this space cluster has been marked as fragmented and we've 6668 * unpinned enough in this block group to potentially allow a 6669 * cluster to be created inside of it go ahead and clear the 6670 * fragmented check. 6671 */ 6672 if (cluster && cluster->fragmented && 6673 total_unpinned > empty_cluster) { 6674 spin_lock(&cluster->lock); 6675 cluster->fragmented = 0; 6676 spin_unlock(&cluster->lock); 6677 } 6678 6679 spin_lock(&space_info->lock); 6680 spin_lock(&cache->lock); 6681 cache->pinned -= len; 6682 space_info->bytes_pinned -= len; 6683 6684 trace_btrfs_space_reservation(fs_info, "pinned", 6685 space_info->flags, len, 0); 6686 space_info->max_extent_size = 0; 6687 percpu_counter_add(&space_info->total_bytes_pinned, -len); 6688 if (cache->ro) { 6689 space_info->bytes_readonly += len; 6690 readonly = true; 6691 } 6692 spin_unlock(&cache->lock); 6693 if (!readonly && return_free_space && 6694 global_rsv->space_info == space_info) { 6695 u64 to_add = len; 6696 WARN_ON(!return_free_space); 6697 spin_lock(&global_rsv->lock); 6698 if (!global_rsv->full) { 6699 to_add = min(len, global_rsv->size - 6700 global_rsv->reserved); 6701 global_rsv->reserved += to_add; 6702 space_info->bytes_may_use += to_add; 6703 if (global_rsv->reserved >= global_rsv->size) 6704 global_rsv->full = 1; 6705 trace_btrfs_space_reservation(fs_info, 6706 "space_info", 6707 space_info->flags, 6708 to_add, 1); 6709 len -= to_add; 6710 } 6711 spin_unlock(&global_rsv->lock); 6712 /* Add to any tickets we may have */ 6713 if (len) 6714 space_info_add_new_bytes(fs_info, space_info, 6715 len); 6716 } 6717 spin_unlock(&space_info->lock); 6718 } 6719 6720 if (cache) 6721 btrfs_put_block_group(cache); 6722 return 0; 6723 } 6724 6725 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 6726 struct btrfs_fs_info *fs_info) 6727 { 6728 struct btrfs_block_group_cache *block_group, *tmp; 6729 struct list_head *deleted_bgs; 6730 struct extent_io_tree *unpin; 6731 u64 start; 6732 u64 end; 6733 int ret; 6734 6735 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6736 unpin = &fs_info->freed_extents[1]; 6737 else 6738 unpin = &fs_info->freed_extents[0]; 6739 6740 while (!trans->aborted) { 6741 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6742 ret = find_first_extent_bit(unpin, 0, &start, &end, 6743 EXTENT_DIRTY, NULL); 6744 if (ret) { 6745 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6746 break; 6747 } 6748 6749 if (btrfs_test_opt(fs_info, DISCARD)) 6750 ret = btrfs_discard_extent(fs_info, start, 6751 end + 1 - start, NULL); 6752 6753 clear_extent_dirty(unpin, start, end); 6754 unpin_extent_range(fs_info, start, end, true); 6755 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6756 cond_resched(); 6757 } 6758 6759 /* 6760 * Transaction is finished. We don't need the lock anymore. We 6761 * do need to clean up the block groups in case of a transaction 6762 * abort. 
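	 * The loop below walks the transaction's deleted_bgs list and, unless
	 * the transaction aborted, discards each removed block group's range
	 * before dropping the trimming reference and the block group itself.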
6763 */ 6764 deleted_bgs = &trans->transaction->deleted_bgs; 6765 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6766 u64 trimmed = 0; 6767 6768 ret = -EROFS; 6769 if (!trans->aborted) 6770 ret = btrfs_discard_extent(fs_info, 6771 block_group->key.objectid, 6772 block_group->key.offset, 6773 &trimmed); 6774 6775 list_del_init(&block_group->bg_list); 6776 btrfs_put_block_group_trimming(block_group); 6777 btrfs_put_block_group(block_group); 6778 6779 if (ret) { 6780 const char *errstr = btrfs_decode_error(ret); 6781 btrfs_warn(fs_info, 6782 "Discard failed while removing blockgroup: errno=%d %s\n", 6783 ret, errstr); 6784 } 6785 } 6786 6787 return 0; 6788 } 6789 6790 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 6791 u64 owner, u64 root_objectid) 6792 { 6793 struct btrfs_space_info *space_info; 6794 u64 flags; 6795 6796 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6797 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 6798 flags = BTRFS_BLOCK_GROUP_SYSTEM; 6799 else 6800 flags = BTRFS_BLOCK_GROUP_METADATA; 6801 } else { 6802 flags = BTRFS_BLOCK_GROUP_DATA; 6803 } 6804 6805 space_info = __find_space_info(fs_info, flags); 6806 BUG_ON(!space_info); /* Logic bug */ 6807 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 6808 } 6809 6810 6811 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6812 struct btrfs_fs_info *info, 6813 struct btrfs_delayed_ref_node *node, u64 parent, 6814 u64 root_objectid, u64 owner_objectid, 6815 u64 owner_offset, int refs_to_drop, 6816 struct btrfs_delayed_extent_op *extent_op) 6817 { 6818 struct btrfs_key key; 6819 struct btrfs_path *path; 6820 struct btrfs_root *extent_root = info->extent_root; 6821 struct extent_buffer *leaf; 6822 struct btrfs_extent_item *ei; 6823 struct btrfs_extent_inline_ref *iref; 6824 int ret; 6825 int is_data; 6826 int extent_slot = 0; 6827 int found_extent = 0; 6828 int num_to_del = 1; 6829 u32 item_size; 6830 u64 refs; 6831 u64 bytenr = node->bytenr; 6832 u64 num_bytes = node->num_bytes; 6833 int last_ref = 0; 6834 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 6835 6836 path = btrfs_alloc_path(); 6837 if (!path) 6838 return -ENOMEM; 6839 6840 path->reada = READA_FORWARD; 6841 path->leave_spinning = 1; 6842 6843 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 6844 BUG_ON(!is_data && refs_to_drop != 1); 6845 6846 if (is_data) 6847 skinny_metadata = 0; 6848 6849 ret = lookup_extent_backref(trans, info, path, &iref, 6850 bytenr, num_bytes, parent, 6851 root_objectid, owner_objectid, 6852 owner_offset); 6853 if (ret == 0) { 6854 extent_slot = path->slots[0]; 6855 while (extent_slot >= 0) { 6856 btrfs_item_key_to_cpu(path->nodes[0], &key, 6857 extent_slot); 6858 if (key.objectid != bytenr) 6859 break; 6860 if (key.type == BTRFS_EXTENT_ITEM_KEY && 6861 key.offset == num_bytes) { 6862 found_extent = 1; 6863 break; 6864 } 6865 if (key.type == BTRFS_METADATA_ITEM_KEY && 6866 key.offset == owner_objectid) { 6867 found_extent = 1; 6868 break; 6869 } 6870 if (path->slots[0] - extent_slot > 5) 6871 break; 6872 extent_slot--; 6873 } 6874 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6875 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 6876 if (found_extent && item_size < sizeof(*ei)) 6877 found_extent = 0; 6878 #endif 6879 if (!found_extent) { 6880 BUG_ON(iref); 6881 ret = remove_extent_backref(trans, info, path, NULL, 6882 refs_to_drop, 6883 is_data, &last_ref); 6884 if (ret) { 6885 btrfs_abort_transaction(trans, ret); 6886 goto out; 6887 } 6888 
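			/*
			 * The keyed backref has been removed, but our first
			 * lookup did not land on the extent item itself, so
			 * search for it again from scratch before updating
			 * its reference count below.
			 */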
btrfs_release_path(path); 6889 path->leave_spinning = 1; 6890 6891 key.objectid = bytenr; 6892 key.type = BTRFS_EXTENT_ITEM_KEY; 6893 key.offset = num_bytes; 6894 6895 if (!is_data && skinny_metadata) { 6896 key.type = BTRFS_METADATA_ITEM_KEY; 6897 key.offset = owner_objectid; 6898 } 6899 6900 ret = btrfs_search_slot(trans, extent_root, 6901 &key, path, -1, 1); 6902 if (ret > 0 && skinny_metadata && path->slots[0]) { 6903 /* 6904 * Couldn't find our skinny metadata item, 6905 * see if we have ye olde extent item. 6906 */ 6907 path->slots[0]--; 6908 btrfs_item_key_to_cpu(path->nodes[0], &key, 6909 path->slots[0]); 6910 if (key.objectid == bytenr && 6911 key.type == BTRFS_EXTENT_ITEM_KEY && 6912 key.offset == num_bytes) 6913 ret = 0; 6914 } 6915 6916 if (ret > 0 && skinny_metadata) { 6917 skinny_metadata = false; 6918 key.objectid = bytenr; 6919 key.type = BTRFS_EXTENT_ITEM_KEY; 6920 key.offset = num_bytes; 6921 btrfs_release_path(path); 6922 ret = btrfs_search_slot(trans, extent_root, 6923 &key, path, -1, 1); 6924 } 6925 6926 if (ret) { 6927 btrfs_err(info, 6928 "umm, got %d back from search, was looking for %llu", 6929 ret, bytenr); 6930 if (ret > 0) 6931 btrfs_print_leaf(info, path->nodes[0]); 6932 } 6933 if (ret < 0) { 6934 btrfs_abort_transaction(trans, ret); 6935 goto out; 6936 } 6937 extent_slot = path->slots[0]; 6938 } 6939 } else if (WARN_ON(ret == -ENOENT)) { 6940 btrfs_print_leaf(info, path->nodes[0]); 6941 btrfs_err(info, 6942 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 6943 bytenr, parent, root_objectid, owner_objectid, 6944 owner_offset); 6945 btrfs_abort_transaction(trans, ret); 6946 goto out; 6947 } else { 6948 btrfs_abort_transaction(trans, ret); 6949 goto out; 6950 } 6951 6952 leaf = path->nodes[0]; 6953 item_size = btrfs_item_size_nr(leaf, extent_slot); 6954 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6955 if (item_size < sizeof(*ei)) { 6956 BUG_ON(found_extent || extent_slot != path->slots[0]); 6957 ret = convert_extent_item_v0(trans, info, path, owner_objectid, 6958 0); 6959 if (ret < 0) { 6960 btrfs_abort_transaction(trans, ret); 6961 goto out; 6962 } 6963 6964 btrfs_release_path(path); 6965 path->leave_spinning = 1; 6966 6967 key.objectid = bytenr; 6968 key.type = BTRFS_EXTENT_ITEM_KEY; 6969 key.offset = num_bytes; 6970 6971 ret = btrfs_search_slot(trans, extent_root, &key, path, 6972 -1, 1); 6973 if (ret) { 6974 btrfs_err(info, 6975 "umm, got %d back from search, was looking for %llu", 6976 ret, bytenr); 6977 btrfs_print_leaf(info, path->nodes[0]); 6978 } 6979 if (ret < 0) { 6980 btrfs_abort_transaction(trans, ret); 6981 goto out; 6982 } 6983 6984 extent_slot = path->slots[0]; 6985 leaf = path->nodes[0]; 6986 item_size = btrfs_item_size_nr(leaf, extent_slot); 6987 } 6988 #endif 6989 BUG_ON(item_size < sizeof(*ei)); 6990 ei = btrfs_item_ptr(leaf, extent_slot, 6991 struct btrfs_extent_item); 6992 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6993 key.type == BTRFS_EXTENT_ITEM_KEY) { 6994 struct btrfs_tree_block_info *bi; 6995 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6996 bi = (struct btrfs_tree_block_info *)(ei + 1); 6997 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6998 } 6999 7000 refs = btrfs_extent_refs(leaf, ei); 7001 if (refs < refs_to_drop) { 7002 btrfs_err(info, 7003 "trying to drop %d refs but we only have %Lu for bytenr %Lu", 7004 refs_to_drop, refs, bytenr); 7005 ret = -EINVAL; 7006 btrfs_abort_transaction(trans, ret); 7007 goto out; 7008 } 7009 refs -= refs_to_drop; 7010 7011 if (refs > 0) { 7012 if 
(extent_op) 7013 __run_delayed_extent_op(extent_op, leaf, ei); 7014 /* 7015 * In the case of inline back ref, reference count will 7016 * be updated by remove_extent_backref 7017 */ 7018 if (iref) { 7019 BUG_ON(!found_extent); 7020 } else { 7021 btrfs_set_extent_refs(leaf, ei, refs); 7022 btrfs_mark_buffer_dirty(leaf); 7023 } 7024 if (found_extent) { 7025 ret = remove_extent_backref(trans, info, path, 7026 iref, refs_to_drop, 7027 is_data, &last_ref); 7028 if (ret) { 7029 btrfs_abort_transaction(trans, ret); 7030 goto out; 7031 } 7032 } 7033 add_pinned_bytes(info, -num_bytes, owner_objectid, 7034 root_objectid); 7035 } else { 7036 if (found_extent) { 7037 BUG_ON(is_data && refs_to_drop != 7038 extent_data_ref_count(path, iref)); 7039 if (iref) { 7040 BUG_ON(path->slots[0] != extent_slot); 7041 } else { 7042 BUG_ON(path->slots[0] != extent_slot + 1); 7043 path->slots[0] = extent_slot; 7044 num_to_del = 2; 7045 } 7046 } 7047 7048 last_ref = 1; 7049 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 7050 num_to_del); 7051 if (ret) { 7052 btrfs_abort_transaction(trans, ret); 7053 goto out; 7054 } 7055 btrfs_release_path(path); 7056 7057 if (is_data) { 7058 ret = btrfs_del_csums(trans, info, bytenr, num_bytes); 7059 if (ret) { 7060 btrfs_abort_transaction(trans, ret); 7061 goto out; 7062 } 7063 } 7064 7065 ret = add_to_free_space_tree(trans, info, bytenr, num_bytes); 7066 if (ret) { 7067 btrfs_abort_transaction(trans, ret); 7068 goto out; 7069 } 7070 7071 ret = update_block_group(trans, info, bytenr, num_bytes, 0); 7072 if (ret) { 7073 btrfs_abort_transaction(trans, ret); 7074 goto out; 7075 } 7076 } 7077 btrfs_release_path(path); 7078 7079 out: 7080 btrfs_free_path(path); 7081 return ret; 7082 } 7083 7084 /* 7085 * when we free an block, it is possible (and likely) that we free the last 7086 * delayed ref for that extent as well. This searches the delayed ref tree for 7087 * a given extent, and if there are no other delayed refs to be processed, it 7088 * removes it from the tree. 7089 */ 7090 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 7091 u64 bytenr) 7092 { 7093 struct btrfs_delayed_ref_head *head; 7094 struct btrfs_delayed_ref_root *delayed_refs; 7095 int ret = 0; 7096 7097 delayed_refs = &trans->transaction->delayed_refs; 7098 spin_lock(&delayed_refs->lock); 7099 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 7100 if (!head) 7101 goto out_delayed_unlock; 7102 7103 spin_lock(&head->lock); 7104 if (!list_empty(&head->ref_list)) 7105 goto out; 7106 7107 if (head->extent_op) { 7108 if (!head->must_insert_reserved) 7109 goto out; 7110 btrfs_free_delayed_extent_op(head->extent_op); 7111 head->extent_op = NULL; 7112 } 7113 7114 /* 7115 * waiting for the lock here would deadlock. If someone else has it 7116 * locked they are already in the process of dropping it anyway 7117 */ 7118 if (!mutex_trylock(&head->mutex)) 7119 goto out; 7120 7121 /* 7122 * at this point we have a head with no other entries. Go 7123 * ahead and process it. 7124 */ 7125 head->node.in_tree = 0; 7126 rb_erase(&head->href_node, &delayed_refs->href_root); 7127 7128 atomic_dec(&delayed_refs->num_entries); 7129 7130 /* 7131 * we don't take a ref on the node because we're removing it from the 7132 * tree, so we just steal the ref the tree was holding. 
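	 * That stolen reference is the one we drop with
	 * btrfs_put_delayed_ref() just before returning from this function.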
7133 */ 7134 delayed_refs->num_heads--; 7135 if (head->processing == 0) 7136 delayed_refs->num_heads_ready--; 7137 head->processing = 0; 7138 spin_unlock(&head->lock); 7139 spin_unlock(&delayed_refs->lock); 7140 7141 BUG_ON(head->extent_op); 7142 if (head->must_insert_reserved) 7143 ret = 1; 7144 7145 mutex_unlock(&head->mutex); 7146 btrfs_put_delayed_ref(&head->node); 7147 return ret; 7148 out: 7149 spin_unlock(&head->lock); 7150 7151 out_delayed_unlock: 7152 spin_unlock(&delayed_refs->lock); 7153 return 0; 7154 } 7155 7156 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 7157 struct btrfs_root *root, 7158 struct extent_buffer *buf, 7159 u64 parent, int last_ref) 7160 { 7161 struct btrfs_fs_info *fs_info = root->fs_info; 7162 int pin = 1; 7163 int ret; 7164 7165 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7166 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 7167 buf->start, buf->len, 7168 parent, 7169 root->root_key.objectid, 7170 btrfs_header_level(buf), 7171 BTRFS_DROP_DELAYED_REF, NULL); 7172 BUG_ON(ret); /* -ENOMEM */ 7173 } 7174 7175 if (!last_ref) 7176 return; 7177 7178 if (btrfs_header_generation(buf) == trans->transid) { 7179 struct btrfs_block_group_cache *cache; 7180 7181 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7182 ret = check_ref_cleanup(trans, buf->start); 7183 if (!ret) 7184 goto out; 7185 } 7186 7187 cache = btrfs_lookup_block_group(fs_info, buf->start); 7188 7189 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7190 pin_down_extent(fs_info, cache, buf->start, 7191 buf->len, 1); 7192 btrfs_put_block_group(cache); 7193 goto out; 7194 } 7195 7196 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7197 7198 btrfs_add_free_space(cache, buf->start, buf->len); 7199 btrfs_free_reserved_bytes(cache, buf->len, 0); 7200 btrfs_put_block_group(cache); 7201 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 7202 pin = 0; 7203 } 7204 out: 7205 if (pin) 7206 add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), 7207 root->root_key.objectid); 7208 7209 /* 7210 * Deleting the buffer, clear the corrupt flag since it doesn't matter 7211 * anymore. 7212 */ 7213 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7214 } 7215 7216 /* Can return -ENOMEM */ 7217 int btrfs_free_extent(struct btrfs_trans_handle *trans, 7218 struct btrfs_fs_info *fs_info, 7219 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7220 u64 owner, u64 offset) 7221 { 7222 int ret; 7223 7224 if (btrfs_is_testing(fs_info)) 7225 return 0; 7226 7227 add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); 7228 7229 /* 7230 * tree log blocks never actually go into the extent allocation 7231 * tree, just update pinning info and exit early. 7232 */ 7233 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 7234 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7235 /* unlocks the pinned mutex */ 7236 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); 7237 ret = 0; 7238 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7239 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 7240 num_bytes, 7241 parent, root_objectid, (int)owner, 7242 BTRFS_DROP_DELAYED_REF, NULL); 7243 } else { 7244 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 7245 num_bytes, 7246 parent, root_objectid, owner, 7247 offset, 0, 7248 BTRFS_DROP_DELAYED_REF); 7249 } 7250 return ret; 7251 } 7252 7253 /* 7254 * when we wait for progress in the block group caching, its because 7255 * our allocation attempt failed at least once. 
So, we must sleep 7256 * and let some progress happen before we try again. 7257 * 7258 * This function will sleep at least once waiting for new free space to 7259 * show up, and then it will check the block group free space numbers 7260 * for our min num_bytes. Another option is to have it go ahead 7261 * and look in the rbtree for a free extent of a given size, but this 7262 * is a good start. 7263 * 7264 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7265 * any of the information in this block group. 7266 */ 7267 static noinline void 7268 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7269 u64 num_bytes) 7270 { 7271 struct btrfs_caching_control *caching_ctl; 7272 7273 caching_ctl = get_caching_control(cache); 7274 if (!caching_ctl) 7275 return; 7276 7277 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7278 (cache->free_space_ctl->free_space >= num_bytes)); 7279 7280 put_caching_control(caching_ctl); 7281 } 7282 7283 static noinline int 7284 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7285 { 7286 struct btrfs_caching_control *caching_ctl; 7287 int ret = 0; 7288 7289 caching_ctl = get_caching_control(cache); 7290 if (!caching_ctl) 7291 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; 7292 7293 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7294 if (cache->cached == BTRFS_CACHE_ERROR) 7295 ret = -EIO; 7296 put_caching_control(caching_ctl); 7297 return ret; 7298 } 7299 7300 int __get_raid_index(u64 flags) 7301 { 7302 if (flags & BTRFS_BLOCK_GROUP_RAID10) 7303 return BTRFS_RAID_RAID10; 7304 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 7305 return BTRFS_RAID_RAID1; 7306 else if (flags & BTRFS_BLOCK_GROUP_DUP) 7307 return BTRFS_RAID_DUP; 7308 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 7309 return BTRFS_RAID_RAID0; 7310 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 7311 return BTRFS_RAID_RAID5; 7312 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 7313 return BTRFS_RAID_RAID6; 7314 7315 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 7316 } 7317 7318 int get_block_group_index(struct btrfs_block_group_cache *cache) 7319 { 7320 return __get_raid_index(cache->flags); 7321 } 7322 7323 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 7324 [BTRFS_RAID_RAID10] = "raid10", 7325 [BTRFS_RAID_RAID1] = "raid1", 7326 [BTRFS_RAID_DUP] = "dup", 7327 [BTRFS_RAID_RAID0] = "raid0", 7328 [BTRFS_RAID_SINGLE] = "single", 7329 [BTRFS_RAID_RAID5] = "raid5", 7330 [BTRFS_RAID_RAID6] = "raid6", 7331 }; 7332 7333 static const char *get_raid_name(enum btrfs_raid_types type) 7334 { 7335 if (type >= BTRFS_NR_RAID_TYPES) 7336 return NULL; 7337 7338 return btrfs_raid_type_names[type]; 7339 } 7340 7341 enum btrfs_loop_type { 7342 LOOP_CACHING_NOWAIT = 0, 7343 LOOP_CACHING_WAIT = 1, 7344 LOOP_ALLOC_CHUNK = 2, 7345 LOOP_NO_EMPTY_SIZE = 3, 7346 }; 7347 7348 static inline void 7349 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7350 int delalloc) 7351 { 7352 if (delalloc) 7353 down_read(&cache->data_rwsem); 7354 } 7355 7356 static inline void 7357 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7358 int delalloc) 7359 { 7360 btrfs_get_block_group(cache); 7361 if (delalloc) 7362 down_read(&cache->data_rwsem); 7363 } 7364 7365 static struct btrfs_block_group_cache * 7366 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7367 struct btrfs_free_cluster *cluster, 7368 int delalloc) 7369 { 7370 struct btrfs_block_group_cache *used_bg = NULL; 7371 7372 
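	/*
	 * Grab the cluster's current block group under refill_lock.  For a
	 * delalloc allocation we also need the group's data_rwsem; if we
	 * cannot take it without blocking, drop refill_lock, take the rwsem,
	 * and then recheck that the cluster still points at the same group.
	 */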
spin_lock(&cluster->refill_lock); 7373 while (1) { 7374 used_bg = cluster->block_group; 7375 if (!used_bg) 7376 return NULL; 7377 7378 if (used_bg == block_group) 7379 return used_bg; 7380 7381 btrfs_get_block_group(used_bg); 7382 7383 if (!delalloc) 7384 return used_bg; 7385 7386 if (down_read_trylock(&used_bg->data_rwsem)) 7387 return used_bg; 7388 7389 spin_unlock(&cluster->refill_lock); 7390 7391 /* We should only have one-level nested. */ 7392 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 7393 7394 spin_lock(&cluster->refill_lock); 7395 if (used_bg == cluster->block_group) 7396 return used_bg; 7397 7398 up_read(&used_bg->data_rwsem); 7399 btrfs_put_block_group(used_bg); 7400 } 7401 } 7402 7403 static inline void 7404 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7405 int delalloc) 7406 { 7407 if (delalloc) 7408 up_read(&cache->data_rwsem); 7409 btrfs_put_block_group(cache); 7410 } 7411 7412 /* 7413 * walks the btree of allocated extents and find a hole of a given size. 7414 * The key ins is changed to record the hole: 7415 * ins->objectid == start position 7416 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7417 * ins->offset == the size of the hole. 7418 * Any available blocks before search_start are skipped. 7419 * 7420 * If there is no suitable free space, we will record the max size of 7421 * the free space extent currently. 7422 */ 7423 static noinline int find_free_extent(struct btrfs_fs_info *fs_info, 7424 u64 ram_bytes, u64 num_bytes, u64 empty_size, 7425 u64 hint_byte, struct btrfs_key *ins, 7426 u64 flags, int delalloc) 7427 { 7428 int ret = 0; 7429 struct btrfs_root *root = fs_info->extent_root; 7430 struct btrfs_free_cluster *last_ptr = NULL; 7431 struct btrfs_block_group_cache *block_group = NULL; 7432 u64 search_start = 0; 7433 u64 max_extent_size = 0; 7434 u64 empty_cluster = 0; 7435 struct btrfs_space_info *space_info; 7436 int loop = 0; 7437 int index = __get_raid_index(flags); 7438 bool failed_cluster_refill = false; 7439 bool failed_alloc = false; 7440 bool use_cluster = true; 7441 bool have_caching_bg = false; 7442 bool orig_have_caching_bg = false; 7443 bool full_search = false; 7444 7445 WARN_ON(num_bytes < fs_info->sectorsize); 7446 ins->type = BTRFS_EXTENT_ITEM_KEY; 7447 ins->objectid = 0; 7448 ins->offset = 0; 7449 7450 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 7451 7452 space_info = __find_space_info(fs_info, flags); 7453 if (!space_info) { 7454 btrfs_err(fs_info, "No space info for %llu", flags); 7455 return -ENOSPC; 7456 } 7457 7458 /* 7459 * If our free space is heavily fragmented we may not be able to make 7460 * big contiguous allocations, so instead of doing the expensive search 7461 * for free space, simply return ENOSPC with our max_extent_size so we 7462 * can go ahead and search for a more manageable chunk. 7463 * 7464 * If our max_extent_size is large enough for our allocation simply 7465 * disable clustering since we will likely not be able to find enough 7466 * space to create a cluster and induce latency trying. 
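	 * Note that on the -ENOSPC return below, ins->offset carries the
	 * current max_extent_size back to the caller so that it can retry
	 * with a smaller allocation.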
7467 */ 7468 if (unlikely(space_info->max_extent_size)) { 7469 spin_lock(&space_info->lock); 7470 if (space_info->max_extent_size && 7471 num_bytes > space_info->max_extent_size) { 7472 ins->offset = space_info->max_extent_size; 7473 spin_unlock(&space_info->lock); 7474 return -ENOSPC; 7475 } else if (space_info->max_extent_size) { 7476 use_cluster = false; 7477 } 7478 spin_unlock(&space_info->lock); 7479 } 7480 7481 last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); 7482 if (last_ptr) { 7483 spin_lock(&last_ptr->lock); 7484 if (last_ptr->block_group) 7485 hint_byte = last_ptr->window_start; 7486 if (last_ptr->fragmented) { 7487 /* 7488 * We still set window_start so we can keep track of the 7489 * last place we found an allocation to try and save 7490 * some time. 7491 */ 7492 hint_byte = last_ptr->window_start; 7493 use_cluster = false; 7494 } 7495 spin_unlock(&last_ptr->lock); 7496 } 7497 7498 search_start = max(search_start, first_logical_byte(fs_info, 0)); 7499 search_start = max(search_start, hint_byte); 7500 if (search_start == hint_byte) { 7501 block_group = btrfs_lookup_block_group(fs_info, search_start); 7502 /* 7503 * we don't want to use the block group if it doesn't match our 7504 * allocation bits, or if its not cached. 7505 * 7506 * However if we are re-searching with an ideal block group 7507 * picked out then we don't care that the block group is cached. 7508 */ 7509 if (block_group && block_group_bits(block_group, flags) && 7510 block_group->cached != BTRFS_CACHE_NO) { 7511 down_read(&space_info->groups_sem); 7512 if (list_empty(&block_group->list) || 7513 block_group->ro) { 7514 /* 7515 * someone is removing this block group, 7516 * we can't jump into the have_block_group 7517 * target because our list pointers are not 7518 * valid 7519 */ 7520 btrfs_put_block_group(block_group); 7521 up_read(&space_info->groups_sem); 7522 } else { 7523 index = get_block_group_index(block_group); 7524 btrfs_lock_block_group(block_group, delalloc); 7525 goto have_block_group; 7526 } 7527 } else if (block_group) { 7528 btrfs_put_block_group(block_group); 7529 } 7530 } 7531 search: 7532 have_caching_bg = false; 7533 if (index == 0 || index == __get_raid_index(flags)) 7534 full_search = true; 7535 down_read(&space_info->groups_sem); 7536 list_for_each_entry(block_group, &space_info->block_groups[index], 7537 list) { 7538 u64 offset; 7539 int cached; 7540 7541 btrfs_grab_block_group(block_group, delalloc); 7542 search_start = block_group->key.objectid; 7543 7544 /* 7545 * this can happen if we end up cycling through all the 7546 * raid types, but we want to make sure we only allocate 7547 * for the proper type. 7548 */ 7549 if (!block_group_bits(block_group, flags)) { 7550 u64 extra = BTRFS_BLOCK_GROUP_DUP | 7551 BTRFS_BLOCK_GROUP_RAID1 | 7552 BTRFS_BLOCK_GROUP_RAID5 | 7553 BTRFS_BLOCK_GROUP_RAID6 | 7554 BTRFS_BLOCK_GROUP_RAID10; 7555 7556 /* 7557 * if they asked for extra copies and this block group 7558 * doesn't provide them, bail. This does allow us to 7559 * fill raid0 from raid1. 
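		 * e.g. a request for RAID1 space will skip a single or raid0
		 * block group here, while a single/raid0 request may still be
		 * satisfied from a raid1 block group.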
7560 */ 7561 if ((flags & extra) && !(block_group->flags & extra)) 7562 goto loop; 7563 } 7564 7565 have_block_group: 7566 cached = block_group_cache_done(block_group); 7567 if (unlikely(!cached)) { 7568 have_caching_bg = true; 7569 ret = cache_block_group(block_group, 0); 7570 BUG_ON(ret < 0); 7571 ret = 0; 7572 } 7573 7574 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 7575 goto loop; 7576 if (unlikely(block_group->ro)) 7577 goto loop; 7578 7579 /* 7580 * Ok we want to try and use the cluster allocator, so 7581 * lets look there 7582 */ 7583 if (last_ptr && use_cluster) { 7584 struct btrfs_block_group_cache *used_block_group; 7585 unsigned long aligned_cluster; 7586 /* 7587 * the refill lock keeps out other 7588 * people trying to start a new cluster 7589 */ 7590 used_block_group = btrfs_lock_cluster(block_group, 7591 last_ptr, 7592 delalloc); 7593 if (!used_block_group) 7594 goto refill_cluster; 7595 7596 if (used_block_group != block_group && 7597 (used_block_group->ro || 7598 !block_group_bits(used_block_group, flags))) 7599 goto release_cluster; 7600 7601 offset = btrfs_alloc_from_cluster(used_block_group, 7602 last_ptr, 7603 num_bytes, 7604 used_block_group->key.objectid, 7605 &max_extent_size); 7606 if (offset) { 7607 /* we have a block, we're done */ 7608 spin_unlock(&last_ptr->refill_lock); 7609 trace_btrfs_reserve_extent_cluster(fs_info, 7610 used_block_group, 7611 search_start, num_bytes); 7612 if (used_block_group != block_group) { 7613 btrfs_release_block_group(block_group, 7614 delalloc); 7615 block_group = used_block_group; 7616 } 7617 goto checks; 7618 } 7619 7620 WARN_ON(last_ptr->block_group != used_block_group); 7621 release_cluster: 7622 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 7623 * set up a new clusters, so lets just skip it 7624 * and let the allocator find whatever block 7625 * it can find. If we reach this point, we 7626 * will have tried the cluster allocator 7627 * plenty of times and not have found 7628 * anything, so we are likely way too 7629 * fragmented for the clustering stuff to find 7630 * anything. 7631 * 7632 * However, if the cluster is taken from the 7633 * current block group, release the cluster 7634 * first, so that we stand a better chance of 7635 * succeeding in the unclustered 7636 * allocation. 
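			 * Concretely, at LOOP_NO_EMPTY_SIZE: a cluster that
			 * belongs to another block group is simply dropped
			 * and we jump straight to the unclustered path, while
			 * a cluster in this block group is first returned to
			 * the free space cache.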
*/ 7637 if (loop >= LOOP_NO_EMPTY_SIZE && 7638 used_block_group != block_group) { 7639 spin_unlock(&last_ptr->refill_lock); 7640 btrfs_release_block_group(used_block_group, 7641 delalloc); 7642 goto unclustered_alloc; 7643 } 7644 7645 /* 7646 * this cluster didn't work out, free it and 7647 * start over 7648 */ 7649 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7650 7651 if (used_block_group != block_group) 7652 btrfs_release_block_group(used_block_group, 7653 delalloc); 7654 refill_cluster: 7655 if (loop >= LOOP_NO_EMPTY_SIZE) { 7656 spin_unlock(&last_ptr->refill_lock); 7657 goto unclustered_alloc; 7658 } 7659 7660 aligned_cluster = max_t(unsigned long, 7661 empty_cluster + empty_size, 7662 block_group->full_stripe_len); 7663 7664 /* allocate a cluster in this block group */ 7665 ret = btrfs_find_space_cluster(fs_info, block_group, 7666 last_ptr, search_start, 7667 num_bytes, 7668 aligned_cluster); 7669 if (ret == 0) { 7670 /* 7671 * now pull our allocation out of this 7672 * cluster 7673 */ 7674 offset = btrfs_alloc_from_cluster(block_group, 7675 last_ptr, 7676 num_bytes, 7677 search_start, 7678 &max_extent_size); 7679 if (offset) { 7680 /* we found one, proceed */ 7681 spin_unlock(&last_ptr->refill_lock); 7682 trace_btrfs_reserve_extent_cluster(fs_info, 7683 block_group, search_start, 7684 num_bytes); 7685 goto checks; 7686 } 7687 } else if (!cached && loop > LOOP_CACHING_NOWAIT 7688 && !failed_cluster_refill) { 7689 spin_unlock(&last_ptr->refill_lock); 7690 7691 failed_cluster_refill = true; 7692 wait_block_group_cache_progress(block_group, 7693 num_bytes + empty_cluster + empty_size); 7694 goto have_block_group; 7695 } 7696 7697 /* 7698 * at this point we either didn't find a cluster 7699 * or we weren't able to allocate a block from our 7700 * cluster. Free the cluster we've been trying 7701 * to use, and go to the next block group 7702 */ 7703 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7704 spin_unlock(&last_ptr->refill_lock); 7705 goto loop; 7706 } 7707 7708 unclustered_alloc: 7709 /* 7710 * We are doing an unclustered alloc, set the fragmented flag so 7711 * we don't bother trying to setup a cluster again until we get 7712 * more space. 7713 */ 7714 if (unlikely(last_ptr)) { 7715 spin_lock(&last_ptr->lock); 7716 last_ptr->fragmented = 1; 7717 spin_unlock(&last_ptr->lock); 7718 } 7719 if (cached) { 7720 struct btrfs_free_space_ctl *ctl = 7721 block_group->free_space_ctl; 7722 7723 spin_lock(&ctl->tree_lock); 7724 if (ctl->free_space < 7725 num_bytes + empty_cluster + empty_size) { 7726 if (ctl->free_space > max_extent_size) 7727 max_extent_size = ctl->free_space; 7728 spin_unlock(&ctl->tree_lock); 7729 goto loop; 7730 } 7731 spin_unlock(&ctl->tree_lock); 7732 } 7733 7734 offset = btrfs_find_space_for_alloc(block_group, search_start, 7735 num_bytes, empty_size, 7736 &max_extent_size); 7737 /* 7738 * If we didn't find a chunk, and we haven't failed on this 7739 * block group before, and this block group is in the middle of 7740 * caching and we are ok with waiting, then go ahead and wait 7741 * for progress to be made, and set failed_alloc to true. 7742 * 7743 * If failed_alloc is true then we've already waited on this 7744 * block group once and should move on to the next block group. 
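 * failed_alloc is cleared again at the loop: label, so we wait for
 * caching progress at most once per block group on each pass.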
7745 */ 7746 if (!offset && !failed_alloc && !cached && 7747 loop > LOOP_CACHING_NOWAIT) { 7748 wait_block_group_cache_progress(block_group, 7749 num_bytes + empty_size); 7750 failed_alloc = true; 7751 goto have_block_group; 7752 } else if (!offset) { 7753 goto loop; 7754 } 7755 checks: 7756 search_start = ALIGN(offset, fs_info->stripesize); 7757 7758 /* move on to the next group */ 7759 if (search_start + num_bytes > 7760 block_group->key.objectid + block_group->key.offset) { 7761 btrfs_add_free_space(block_group, offset, num_bytes); 7762 goto loop; 7763 } 7764 7765 if (offset < search_start) 7766 btrfs_add_free_space(block_group, offset, 7767 search_start - offset); 7768 BUG_ON(offset > search_start); 7769 7770 ret = btrfs_add_reserved_bytes(block_group, ram_bytes, 7771 num_bytes, delalloc); 7772 if (ret == -EAGAIN) { 7773 btrfs_add_free_space(block_group, offset, num_bytes); 7774 goto loop; 7775 } 7776 btrfs_inc_block_group_reservations(block_group); 7777 7778 /* we are all good, lets return */ 7779 ins->objectid = search_start; 7780 ins->offset = num_bytes; 7781 7782 trace_btrfs_reserve_extent(fs_info, block_group, 7783 search_start, num_bytes); 7784 btrfs_release_block_group(block_group, delalloc); 7785 break; 7786 loop: 7787 failed_cluster_refill = false; 7788 failed_alloc = false; 7789 BUG_ON(index != get_block_group_index(block_group)); 7790 btrfs_release_block_group(block_group, delalloc); 7791 } 7792 up_read(&space_info->groups_sem); 7793 7794 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg 7795 && !orig_have_caching_bg) 7796 orig_have_caching_bg = true; 7797 7798 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 7799 goto search; 7800 7801 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 7802 goto search; 7803 7804 /* 7805 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7806 * caching kthreads as we move along 7807 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7808 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7809 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7810 * again 7811 */ 7812 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 7813 index = 0; 7814 if (loop == LOOP_CACHING_NOWAIT) { 7815 /* 7816 * We want to skip the LOOP_CACHING_WAIT step if we 7817 * don't have any uncached bgs and we've already done a 7818 * full search through. 7819 */ 7820 if (orig_have_caching_bg || !full_search) 7821 loop = LOOP_CACHING_WAIT; 7822 else 7823 loop = LOOP_ALLOC_CHUNK; 7824 } else { 7825 loop++; 7826 } 7827 7828 if (loop == LOOP_ALLOC_CHUNK) { 7829 struct btrfs_trans_handle *trans; 7830 int exist = 0; 7831 7832 trans = current->journal_info; 7833 if (trans) 7834 exist = 1; 7835 else 7836 trans = btrfs_join_transaction(root); 7837 7838 if (IS_ERR(trans)) { 7839 ret = PTR_ERR(trans); 7840 goto out; 7841 } 7842 7843 ret = do_chunk_alloc(trans, fs_info, flags, 7844 CHUNK_ALLOC_FORCE); 7845 7846 /* 7847 * If we can't allocate a new chunk we've already looped 7848 * through at least once, move on to the NO_EMPTY_SIZE 7849 * case. 7850 */ 7851 if (ret == -ENOSPC) 7852 loop = LOOP_NO_EMPTY_SIZE; 7853 7854 /* 7855 * Do not bail out on ENOSPC since we 7856 * can do more things. 
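 * -ENOSPC from do_chunk_alloc() only moves us on to the
 * LOOP_NO_EMPTY_SIZE pass above; any other error aborts the
 * transaction and ends the search.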
7857 */ 7858 if (ret < 0 && ret != -ENOSPC) 7859 btrfs_abort_transaction(trans, ret); 7860 else 7861 ret = 0; 7862 if (!exist) 7863 btrfs_end_transaction(trans); 7864 if (ret) 7865 goto out; 7866 } 7867 7868 if (loop == LOOP_NO_EMPTY_SIZE) { 7869 /* 7870 * Don't loop again if we already have no empty_size and 7871 * no empty_cluster. 7872 */ 7873 if (empty_size == 0 && 7874 empty_cluster == 0) { 7875 ret = -ENOSPC; 7876 goto out; 7877 } 7878 empty_size = 0; 7879 empty_cluster = 0; 7880 } 7881 7882 goto search; 7883 } else if (!ins->objectid) { 7884 ret = -ENOSPC; 7885 } else if (ins->objectid) { 7886 if (!use_cluster && last_ptr) { 7887 spin_lock(&last_ptr->lock); 7888 last_ptr->window_start = ins->objectid; 7889 spin_unlock(&last_ptr->lock); 7890 } 7891 ret = 0; 7892 } 7893 out: 7894 if (ret == -ENOSPC) { 7895 spin_lock(&space_info->lock); 7896 space_info->max_extent_size = max_extent_size; 7897 spin_unlock(&space_info->lock); 7898 ins->offset = max_extent_size; 7899 } 7900 return ret; 7901 } 7902 7903 static void dump_space_info(struct btrfs_fs_info *fs_info, 7904 struct btrfs_space_info *info, u64 bytes, 7905 int dump_block_groups) 7906 { 7907 struct btrfs_block_group_cache *cache; 7908 int index = 0; 7909 7910 spin_lock(&info->lock); 7911 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 7912 info->flags, 7913 info->total_bytes - btrfs_space_info_used(info, true), 7914 info->full ? "" : "not "); 7915 btrfs_info(fs_info, 7916 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 7917 info->total_bytes, info->bytes_used, info->bytes_pinned, 7918 info->bytes_reserved, info->bytes_may_use, 7919 info->bytes_readonly); 7920 spin_unlock(&info->lock); 7921 7922 if (!dump_block_groups) 7923 return; 7924 7925 down_read(&info->groups_sem); 7926 again: 7927 list_for_each_entry(cache, &info->block_groups[index], list) { 7928 spin_lock(&cache->lock); 7929 btrfs_info(fs_info, 7930 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 7931 cache->key.objectid, cache->key.offset, 7932 btrfs_block_group_used(&cache->item), cache->pinned, 7933 cache->reserved, cache->ro ? 
"[readonly]" : ""); 7934 btrfs_dump_free_space(cache, bytes); 7935 spin_unlock(&cache->lock); 7936 } 7937 if (++index < BTRFS_NR_RAID_TYPES) 7938 goto again; 7939 up_read(&info->groups_sem); 7940 } 7941 7942 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 7943 u64 num_bytes, u64 min_alloc_size, 7944 u64 empty_size, u64 hint_byte, 7945 struct btrfs_key *ins, int is_data, int delalloc) 7946 { 7947 struct btrfs_fs_info *fs_info = root->fs_info; 7948 bool final_tried = num_bytes == min_alloc_size; 7949 u64 flags; 7950 int ret; 7951 7952 flags = btrfs_get_alloc_profile(root, is_data); 7953 again: 7954 WARN_ON(num_bytes < fs_info->sectorsize); 7955 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 7956 hint_byte, ins, flags, delalloc); 7957 if (!ret && !is_data) { 7958 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 7959 } else if (ret == -ENOSPC) { 7960 if (!final_tried && ins->offset) { 7961 num_bytes = min(num_bytes >> 1, ins->offset); 7962 num_bytes = round_down(num_bytes, 7963 fs_info->sectorsize); 7964 num_bytes = max(num_bytes, min_alloc_size); 7965 ram_bytes = num_bytes; 7966 if (num_bytes == min_alloc_size) 7967 final_tried = true; 7968 goto again; 7969 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 7970 struct btrfs_space_info *sinfo; 7971 7972 sinfo = __find_space_info(fs_info, flags); 7973 btrfs_err(fs_info, 7974 "allocation failed flags %llu, wanted %llu", 7975 flags, num_bytes); 7976 if (sinfo) 7977 dump_space_info(fs_info, sinfo, num_bytes, 1); 7978 } 7979 } 7980 7981 return ret; 7982 } 7983 7984 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 7985 u64 start, u64 len, 7986 int pin, int delalloc) 7987 { 7988 struct btrfs_block_group_cache *cache; 7989 int ret = 0; 7990 7991 cache = btrfs_lookup_block_group(fs_info, start); 7992 if (!cache) { 7993 btrfs_err(fs_info, "Unable to find block group for %llu", 7994 start); 7995 return -ENOSPC; 7996 } 7997 7998 if (pin) 7999 pin_down_extent(fs_info, cache, start, len, 1); 8000 else { 8001 if (btrfs_test_opt(fs_info, DISCARD)) 8002 ret = btrfs_discard_extent(fs_info, start, len, NULL); 8003 btrfs_add_free_space(cache, start, len); 8004 btrfs_free_reserved_bytes(cache, len, delalloc); 8005 trace_btrfs_reserved_extent_free(fs_info, start, len); 8006 } 8007 8008 btrfs_put_block_group(cache); 8009 return ret; 8010 } 8011 8012 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8013 u64 start, u64 len, int delalloc) 8014 { 8015 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 8016 } 8017 8018 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 8019 u64 start, u64 len) 8020 { 8021 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 8022 } 8023 8024 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8025 struct btrfs_fs_info *fs_info, 8026 u64 parent, u64 root_objectid, 8027 u64 flags, u64 owner, u64 offset, 8028 struct btrfs_key *ins, int ref_mod) 8029 { 8030 int ret; 8031 struct btrfs_extent_item *extent_item; 8032 struct btrfs_extent_inline_ref *iref; 8033 struct btrfs_path *path; 8034 struct extent_buffer *leaf; 8035 int type; 8036 u32 size; 8037 8038 if (parent > 0) 8039 type = BTRFS_SHARED_DATA_REF_KEY; 8040 else 8041 type = BTRFS_EXTENT_DATA_REF_KEY; 8042 8043 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8044 8045 path = btrfs_alloc_path(); 8046 if (!path) 8047 return -ENOMEM; 8048 8049 path->leave_spinning = 1; 8050 ret = btrfs_insert_empty_item(trans, 
fs_info->extent_root, path, 8051 ins, size); 8052 if (ret) { 8053 btrfs_free_path(path); 8054 return ret; 8055 } 8056 8057 leaf = path->nodes[0]; 8058 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8059 struct btrfs_extent_item); 8060 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 8061 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8062 btrfs_set_extent_flags(leaf, extent_item, 8063 flags | BTRFS_EXTENT_FLAG_DATA); 8064 8065 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8066 btrfs_set_extent_inline_ref_type(leaf, iref, type); 8067 if (parent > 0) { 8068 struct btrfs_shared_data_ref *ref; 8069 ref = (struct btrfs_shared_data_ref *)(iref + 1); 8070 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8071 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 8072 } else { 8073 struct btrfs_extent_data_ref *ref; 8074 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 8075 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 8076 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 8077 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 8078 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 8079 } 8080 8081 btrfs_mark_buffer_dirty(path->nodes[0]); 8082 btrfs_free_path(path); 8083 8084 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8085 ins->offset); 8086 if (ret) 8087 return ret; 8088 8089 ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); 8090 if (ret) { /* -ENOENT, logic error */ 8091 btrfs_err(fs_info, "update block group failed for %llu %llu", 8092 ins->objectid, ins->offset); 8093 BUG(); 8094 } 8095 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 8096 return ret; 8097 } 8098 8099 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 8100 struct btrfs_fs_info *fs_info, 8101 u64 parent, u64 root_objectid, 8102 u64 flags, struct btrfs_disk_key *key, 8103 int level, struct btrfs_key *ins) 8104 { 8105 int ret; 8106 struct btrfs_extent_item *extent_item; 8107 struct btrfs_tree_block_info *block_info; 8108 struct btrfs_extent_inline_ref *iref; 8109 struct btrfs_path *path; 8110 struct extent_buffer *leaf; 8111 u32 size = sizeof(*extent_item) + sizeof(*iref); 8112 u64 num_bytes = ins->offset; 8113 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8114 8115 if (!skinny_metadata) 8116 size += sizeof(*block_info); 8117 8118 path = btrfs_alloc_path(); 8119 if (!path) { 8120 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, 8121 fs_info->nodesize); 8122 return -ENOMEM; 8123 } 8124 8125 path->leave_spinning = 1; 8126 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8127 ins, size); 8128 if (ret) { 8129 btrfs_free_path(path); 8130 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, 8131 fs_info->nodesize); 8132 return ret; 8133 } 8134 8135 leaf = path->nodes[0]; 8136 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8137 struct btrfs_extent_item); 8138 btrfs_set_extent_refs(leaf, extent_item, 1); 8139 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8140 btrfs_set_extent_flags(leaf, extent_item, 8141 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 8142 8143 if (skinny_metadata) { 8144 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8145 num_bytes = fs_info->nodesize; 8146 } else { 8147 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 8148 btrfs_set_tree_block_key(leaf, block_info, key); 8149 btrfs_set_tree_block_level(leaf, block_info, level); 8150 iref = (struct btrfs_extent_inline_ref 
*)(block_info + 1); 8151 } 8152 8153 if (parent > 0) { 8154 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 8155 btrfs_set_extent_inline_ref_type(leaf, iref, 8156 BTRFS_SHARED_BLOCK_REF_KEY); 8157 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8158 } else { 8159 btrfs_set_extent_inline_ref_type(leaf, iref, 8160 BTRFS_TREE_BLOCK_REF_KEY); 8161 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 8162 } 8163 8164 btrfs_mark_buffer_dirty(leaf); 8165 btrfs_free_path(path); 8166 8167 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8168 num_bytes); 8169 if (ret) 8170 return ret; 8171 8172 ret = update_block_group(trans, fs_info, ins->objectid, 8173 fs_info->nodesize, 1); 8174 if (ret) { /* -ENOENT, logic error */ 8175 btrfs_err(fs_info, "update block group failed for %llu %llu", 8176 ins->objectid, ins->offset); 8177 BUG(); 8178 } 8179 8180 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, 8181 fs_info->nodesize); 8182 return ret; 8183 } 8184 8185 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8186 u64 root_objectid, u64 owner, 8187 u64 offset, u64 ram_bytes, 8188 struct btrfs_key *ins) 8189 { 8190 struct btrfs_fs_info *fs_info = trans->fs_info; 8191 int ret; 8192 8193 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 8194 8195 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, 8196 ins->offset, 0, 8197 root_objectid, owner, offset, 8198 ram_bytes, BTRFS_ADD_DELAYED_EXTENT); 8199 return ret; 8200 } 8201 8202 /* 8203 * this is used by the tree logging recovery code. It records that 8204 * an extent has been allocated and makes sure to clear the free 8205 * space cache bits as well 8206 */ 8207 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 8208 struct btrfs_fs_info *fs_info, 8209 u64 root_objectid, u64 owner, u64 offset, 8210 struct btrfs_key *ins) 8211 { 8212 int ret; 8213 struct btrfs_block_group_cache *block_group; 8214 struct btrfs_space_info *space_info; 8215 8216 /* 8217 * Mixed block groups will exclude before processing the log so we only 8218 * need to do the exclude dance if this fs isn't mixed. 
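 * Excluding the range keeps the free space caching code from
 * handing this extent out again while log replay is still
 * rebuilding its references.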
8219 */ 8220 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 8221 ret = __exclude_logged_extent(fs_info, ins->objectid, 8222 ins->offset); 8223 if (ret) 8224 return ret; 8225 } 8226 8227 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 8228 if (!block_group) 8229 return -EINVAL; 8230 8231 space_info = block_group->space_info; 8232 spin_lock(&space_info->lock); 8233 spin_lock(&block_group->lock); 8234 space_info->bytes_reserved += ins->offset; 8235 block_group->reserved += ins->offset; 8236 spin_unlock(&block_group->lock); 8237 spin_unlock(&space_info->lock); 8238 8239 ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid, 8240 0, owner, offset, ins, 1); 8241 btrfs_put_block_group(block_group); 8242 return ret; 8243 } 8244 8245 static struct extent_buffer * 8246 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8247 u64 bytenr, int level) 8248 { 8249 struct btrfs_fs_info *fs_info = root->fs_info; 8250 struct extent_buffer *buf; 8251 8252 buf = btrfs_find_create_tree_block(fs_info, bytenr); 8253 if (IS_ERR(buf)) 8254 return buf; 8255 8256 btrfs_set_header_generation(buf, trans->transid); 8257 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8258 btrfs_tree_lock(buf); 8259 clean_tree_block(fs_info, buf); 8260 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8261 8262 btrfs_set_lock_blocking(buf); 8263 set_extent_buffer_uptodate(buf); 8264 8265 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8266 buf->log_index = root->log_transid % 2; 8267 /* 8268 * we allow two log transactions at a time, use different 8269 * EXTENT bits to differentiate dirty pages. 8270 */ 8271 if (buf->log_index == 0) 8272 set_extent_dirty(&root->dirty_log_pages, buf->start, 8273 buf->start + buf->len - 1, GFP_NOFS); 8274 else 8275 set_extent_new(&root->dirty_log_pages, buf->start, 8276 buf->start + buf->len - 1); 8277 } else { 8278 buf->log_index = -1; 8279 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8280 buf->start + buf->len - 1, GFP_NOFS); 8281 } 8282 trans->dirty = true; 8283 /* this returns a buffer locked for blocking */ 8284 return buf; 8285 } 8286 8287 static struct btrfs_block_rsv * 8288 use_block_rsv(struct btrfs_trans_handle *trans, 8289 struct btrfs_root *root, u32 blocksize) 8290 { 8291 struct btrfs_fs_info *fs_info = root->fs_info; 8292 struct btrfs_block_rsv *block_rsv; 8293 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 8294 int ret; 8295 bool global_updated = false; 8296 8297 block_rsv = get_block_rsv(trans, root); 8298 8299 if (unlikely(block_rsv->size == 0)) 8300 goto try_reserve; 8301 again: 8302 ret = block_rsv_use_bytes(block_rsv, blocksize); 8303 if (!ret) 8304 return block_rsv; 8305 8306 if (block_rsv->failfast) 8307 return ERR_PTR(ret); 8308 8309 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8310 global_updated = true; 8311 update_global_block_rsv(fs_info); 8312 goto again; 8313 } 8314 8315 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8316 static DEFINE_RATELIMIT_STATE(_rs, 8317 DEFAULT_RATELIMIT_INTERVAL * 10, 8318 /*DEFAULT_RATELIMIT_BURST*/ 1); 8319 if (__ratelimit(&_rs)) 8320 WARN(1, KERN_DEBUG 8321 "BTRFS: block rsv returned %d\n", ret); 8322 } 8323 try_reserve: 8324 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8325 BTRFS_RESERVE_NO_FLUSH); 8326 if (!ret) 8327 return block_rsv; 8328 /* 8329 * If we couldn't reserve metadata bytes, try and use some from 8330 * the global reserve if its space type is the same as the global 8331 * reservation.
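 * We never fall back to the global reserve when block_rsv already
 * is the global reserve; that case has already failed above.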
8332 */ 8333 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8334 block_rsv->space_info == global_rsv->space_info) { 8335 ret = block_rsv_use_bytes(global_rsv, blocksize); 8336 if (!ret) 8337 return global_rsv; 8338 } 8339 return ERR_PTR(ret); 8340 } 8341 8342 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8343 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8344 { 8345 block_rsv_add_bytes(block_rsv, blocksize, 0); 8346 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 8347 } 8348 8349 /* 8350 * finds a free extent and does all the dirty work required for allocation 8351 * returns the tree buffer or an ERR_PTR on error. 8352 */ 8353 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8354 struct btrfs_root *root, 8355 u64 parent, u64 root_objectid, 8356 const struct btrfs_disk_key *key, 8357 int level, u64 hint, 8358 u64 empty_size) 8359 { 8360 struct btrfs_fs_info *fs_info = root->fs_info; 8361 struct btrfs_key ins; 8362 struct btrfs_block_rsv *block_rsv; 8363 struct extent_buffer *buf; 8364 struct btrfs_delayed_extent_op *extent_op; 8365 u64 flags = 0; 8366 int ret; 8367 u32 blocksize = fs_info->nodesize; 8368 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8369 8370 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8371 if (btrfs_is_testing(fs_info)) { 8372 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8373 level); 8374 if (!IS_ERR(buf)) 8375 root->alloc_bytenr += blocksize; 8376 return buf; 8377 } 8378 #endif 8379 8380 block_rsv = use_block_rsv(trans, root, blocksize); 8381 if (IS_ERR(block_rsv)) 8382 return ERR_CAST(block_rsv); 8383 8384 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 8385 empty_size, hint, &ins, 0, 0); 8386 if (ret) 8387 goto out_unuse; 8388 8389 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); 8390 if (IS_ERR(buf)) { 8391 ret = PTR_ERR(buf); 8392 goto out_free_reserved; 8393 } 8394 8395 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8396 if (parent == 0) 8397 parent = ins.objectid; 8398 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8399 } else 8400 BUG_ON(parent > 0); 8401 8402 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8403 extent_op = btrfs_alloc_delayed_extent_op(); 8404 if (!extent_op) { 8405 ret = -ENOMEM; 8406 goto out_free_buf; 8407 } 8408 if (key) 8409 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8410 else 8411 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8412 extent_op->flags_to_set = flags; 8413 extent_op->update_key = skinny_metadata ? 
false : true; 8414 extent_op->update_flags = true; 8415 extent_op->is_data = false; 8416 extent_op->level = level; 8417 8418 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 8419 ins.objectid, ins.offset, 8420 parent, root_objectid, level, 8421 BTRFS_ADD_DELAYED_EXTENT, 8422 extent_op); 8423 if (ret) 8424 goto out_free_delayed; 8425 } 8426 return buf; 8427 8428 out_free_delayed: 8429 btrfs_free_delayed_extent_op(extent_op); 8430 out_free_buf: 8431 free_extent_buffer(buf); 8432 out_free_reserved: 8433 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 8434 out_unuse: 8435 unuse_block_rsv(fs_info, block_rsv, blocksize); 8436 return ERR_PTR(ret); 8437 } 8438 8439 struct walk_control { 8440 u64 refs[BTRFS_MAX_LEVEL]; 8441 u64 flags[BTRFS_MAX_LEVEL]; 8442 struct btrfs_key update_progress; 8443 int stage; 8444 int level; 8445 int shared_level; 8446 int update_ref; 8447 int keep_locks; 8448 int reada_slot; 8449 int reada_count; 8450 int for_reloc; 8451 }; 8452 8453 #define DROP_REFERENCE 1 8454 #define UPDATE_BACKREF 2 8455 8456 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8457 struct btrfs_root *root, 8458 struct walk_control *wc, 8459 struct btrfs_path *path) 8460 { 8461 struct btrfs_fs_info *fs_info = root->fs_info; 8462 u64 bytenr; 8463 u64 generation; 8464 u64 refs; 8465 u64 flags; 8466 u32 nritems; 8467 struct btrfs_key key; 8468 struct extent_buffer *eb; 8469 int ret; 8470 int slot; 8471 int nread = 0; 8472 8473 if (path->slots[wc->level] < wc->reada_slot) { 8474 wc->reada_count = wc->reada_count * 2 / 3; 8475 wc->reada_count = max(wc->reada_count, 2); 8476 } else { 8477 wc->reada_count = wc->reada_count * 3 / 2; 8478 wc->reada_count = min_t(int, wc->reada_count, 8479 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 8480 } 8481 8482 eb = path->nodes[wc->level]; 8483 nritems = btrfs_header_nritems(eb); 8484 8485 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8486 if (nread >= wc->reada_count) 8487 break; 8488 8489 cond_resched(); 8490 bytenr = btrfs_node_blockptr(eb, slot); 8491 generation = btrfs_node_ptr_generation(eb, slot); 8492 8493 if (slot == path->slots[wc->level]) 8494 goto reada; 8495 8496 if (wc->stage == UPDATE_BACKREF && 8497 generation <= root->root_key.offset) 8498 continue; 8499 8500 /* We don't lock the tree block, it's OK to be racy here */ 8501 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 8502 wc->level - 1, 1, &refs, 8503 &flags); 8504 /* We don't care about errors in readahead. */ 8505 if (ret < 0) 8506 continue; 8507 BUG_ON(refs == 0); 8508 8509 if (wc->stage == DROP_REFERENCE) { 8510 if (refs == 1) 8511 goto reada; 8512 8513 if (wc->level == 1 && 8514 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8515 continue; 8516 if (!wc->update_ref || 8517 generation <= root->root_key.offset) 8518 continue; 8519 btrfs_node_key_to_cpu(eb, &key, slot); 8520 ret = btrfs_comp_cpu_keys(&key, 8521 &wc->update_progress); 8522 if (ret < 0) 8523 continue; 8524 } else { 8525 if (wc->level == 1 && 8526 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8527 continue; 8528 } 8529 reada: 8530 readahead_tree_block(fs_info, bytenr); 8531 nread++; 8532 } 8533 wc->reada_slot = slot; 8534 } 8535 8536 /* 8537 * helper to process tree block while walking down the tree. 8538 * 8539 * when wc->stage == UPDATE_BACKREF, this function updates 8540 * back refs for pointers in the block. 8541 * 8542 * NOTE: return value 1 means we should stop walking down. 
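 * (1 is returned either because the block is not owned by this root
 * during UPDATE_BACKREF, or because the block is shared, i.e.
 * refs > 1, during DROP_REFERENCE.)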
8543 */ 8544 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8545 struct btrfs_root *root, 8546 struct btrfs_path *path, 8547 struct walk_control *wc, int lookup_info) 8548 { 8549 struct btrfs_fs_info *fs_info = root->fs_info; 8550 int level = wc->level; 8551 struct extent_buffer *eb = path->nodes[level]; 8552 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8553 int ret; 8554 8555 if (wc->stage == UPDATE_BACKREF && 8556 btrfs_header_owner(eb) != root->root_key.objectid) 8557 return 1; 8558 8559 /* 8560 * when reference count of tree block is 1, it won't increase 8561 * again. once full backref flag is set, we never clear it. 8562 */ 8563 if (lookup_info && 8564 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8565 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8566 BUG_ON(!path->locks[level]); 8567 ret = btrfs_lookup_extent_info(trans, fs_info, 8568 eb->start, level, 1, 8569 &wc->refs[level], 8570 &wc->flags[level]); 8571 BUG_ON(ret == -ENOMEM); 8572 if (ret) 8573 return ret; 8574 BUG_ON(wc->refs[level] == 0); 8575 } 8576 8577 if (wc->stage == DROP_REFERENCE) { 8578 if (wc->refs[level] > 1) 8579 return 1; 8580 8581 if (path->locks[level] && !wc->keep_locks) { 8582 btrfs_tree_unlock_rw(eb, path->locks[level]); 8583 path->locks[level] = 0; 8584 } 8585 return 0; 8586 } 8587 8588 /* wc->stage == UPDATE_BACKREF */ 8589 if (!(wc->flags[level] & flag)) { 8590 BUG_ON(!path->locks[level]); 8591 ret = btrfs_inc_ref(trans, root, eb, 1); 8592 BUG_ON(ret); /* -ENOMEM */ 8593 ret = btrfs_dec_ref(trans, root, eb, 0); 8594 BUG_ON(ret); /* -ENOMEM */ 8595 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, 8596 eb->len, flag, 8597 btrfs_header_level(eb), 0); 8598 BUG_ON(ret); /* -ENOMEM */ 8599 wc->flags[level] |= flag; 8600 } 8601 8602 /* 8603 * the block is shared by multiple trees, so it's not good to 8604 * keep the tree lock 8605 */ 8606 if (path->locks[level] && level > 0) { 8607 btrfs_tree_unlock_rw(eb, path->locks[level]); 8608 path->locks[level] = 0; 8609 } 8610 return 0; 8611 } 8612 8613 /* 8614 * helper to process tree block pointer. 8615 * 8616 * when wc->stage == DROP_REFERENCE, this function checks 8617 * reference count of the block pointed to. if the block 8618 * is shared and we need update back refs for the subtree 8619 * rooted at the block, this function changes wc->stage to 8620 * UPDATE_BACKREF. if the block is shared and there is no 8621 * need to update back, this function drops the reference 8622 * to the block. 8623 * 8624 * NOTE: return value 1 means we should stop walking down. 
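 * (walk_down_tree() reacts to a return value of 1 by advancing to
 * the next slot in the parent node instead of descending further.)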
8625 */ 8626 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8627 struct btrfs_root *root, 8628 struct btrfs_path *path, 8629 struct walk_control *wc, int *lookup_info) 8630 { 8631 struct btrfs_fs_info *fs_info = root->fs_info; 8632 u64 bytenr; 8633 u64 generation; 8634 u64 parent; 8635 u32 blocksize; 8636 struct btrfs_key key; 8637 struct extent_buffer *next; 8638 int level = wc->level; 8639 int reada = 0; 8640 int ret = 0; 8641 bool need_account = false; 8642 8643 generation = btrfs_node_ptr_generation(path->nodes[level], 8644 path->slots[level]); 8645 /* 8646 * if the lower level block was created before the snapshot 8647 * was created, we know there is no need to update back refs 8648 * for the subtree 8649 */ 8650 if (wc->stage == UPDATE_BACKREF && 8651 generation <= root->root_key.offset) { 8652 *lookup_info = 1; 8653 return 1; 8654 } 8655 8656 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8657 blocksize = fs_info->nodesize; 8658 8659 next = find_extent_buffer(fs_info, bytenr); 8660 if (!next) { 8661 next = btrfs_find_create_tree_block(fs_info, bytenr); 8662 if (IS_ERR(next)) 8663 return PTR_ERR(next); 8664 8665 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 8666 level - 1); 8667 reada = 1; 8668 } 8669 btrfs_tree_lock(next); 8670 btrfs_set_lock_blocking(next); 8671 8672 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 8673 &wc->refs[level - 1], 8674 &wc->flags[level - 1]); 8675 if (ret < 0) 8676 goto out_unlock; 8677 8678 if (unlikely(wc->refs[level - 1] == 0)) { 8679 btrfs_err(fs_info, "Missing references."); 8680 ret = -EIO; 8681 goto out_unlock; 8682 } 8683 *lookup_info = 0; 8684 8685 if (wc->stage == DROP_REFERENCE) { 8686 if (wc->refs[level - 1] > 1) { 8687 need_account = true; 8688 if (level == 1 && 8689 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8690 goto skip; 8691 8692 if (!wc->update_ref || 8693 generation <= root->root_key.offset) 8694 goto skip; 8695 8696 btrfs_node_key_to_cpu(path->nodes[level], &key, 8697 path->slots[level]); 8698 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 8699 if (ret < 0) 8700 goto skip; 8701 8702 wc->stage = UPDATE_BACKREF; 8703 wc->shared_level = level - 1; 8704 } 8705 } else { 8706 if (level == 1 && 8707 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8708 goto skip; 8709 } 8710 8711 if (!btrfs_buffer_uptodate(next, generation, 0)) { 8712 btrfs_tree_unlock(next); 8713 free_extent_buffer(next); 8714 next = NULL; 8715 *lookup_info = 1; 8716 } 8717 8718 if (!next) { 8719 if (reada && level == 1) 8720 reada_walk_down(trans, root, wc, path); 8721 next = read_tree_block(fs_info, bytenr, generation); 8722 if (IS_ERR(next)) { 8723 return PTR_ERR(next); 8724 } else if (!extent_buffer_uptodate(next)) { 8725 free_extent_buffer(next); 8726 return -EIO; 8727 } 8728 btrfs_tree_lock(next); 8729 btrfs_set_lock_blocking(next); 8730 } 8731 8732 level--; 8733 ASSERT(level == btrfs_header_level(next)); 8734 if (level != btrfs_header_level(next)) { 8735 btrfs_err(root->fs_info, "mismatched level"); 8736 ret = -EIO; 8737 goto out_unlock; 8738 } 8739 path->nodes[level] = next; 8740 path->slots[level] = 0; 8741 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8742 wc->level = level; 8743 if (wc->level == 1) 8744 wc->reada_slot = 0; 8745 return 0; 8746 skip: 8747 wc->refs[level - 1] = 0; 8748 wc->flags[level - 1] = 0; 8749 if (wc->stage == DROP_REFERENCE) { 8750 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 8751 parent = path->nodes[level]->start; 8752 } else { 8753 
ASSERT(root->root_key.objectid == 8754 btrfs_header_owner(path->nodes[level])); 8755 if (root->root_key.objectid != 8756 btrfs_header_owner(path->nodes[level])) { 8757 btrfs_err(root->fs_info, 8758 "mismatched block owner"); 8759 ret = -EIO; 8760 goto out_unlock; 8761 } 8762 parent = 0; 8763 } 8764 8765 if (need_account) { 8766 ret = btrfs_qgroup_trace_subtree(trans, root, next, 8767 generation, level - 1); 8768 if (ret) { 8769 btrfs_err_rl(fs_info, 8770 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 8771 ret); 8772 } 8773 } 8774 ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, 8775 parent, root->root_key.objectid, 8776 level - 1, 0); 8777 if (ret) 8778 goto out_unlock; 8779 } 8780 8781 *lookup_info = 1; 8782 ret = 1; 8783 8784 out_unlock: 8785 btrfs_tree_unlock(next); 8786 free_extent_buffer(next); 8787 8788 return ret; 8789 } 8790 8791 /* 8792 * helper to process tree block while walking up the tree. 8793 * 8794 * when wc->stage == DROP_REFERENCE, this function drops 8795 * reference count on the block. 8796 * 8797 * when wc->stage == UPDATE_BACKREF, this function changes 8798 * wc->stage back to DROP_REFERENCE if we changed wc->stage 8799 * to UPDATE_BACKREF previously while processing the block. 8800 * 8801 * NOTE: return value 1 means we should stop walking up. 8802 */ 8803 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 8804 struct btrfs_root *root, 8805 struct btrfs_path *path, 8806 struct walk_control *wc) 8807 { 8808 struct btrfs_fs_info *fs_info = root->fs_info; 8809 int ret; 8810 int level = wc->level; 8811 struct extent_buffer *eb = path->nodes[level]; 8812 u64 parent = 0; 8813 8814 if (wc->stage == UPDATE_BACKREF) { 8815 BUG_ON(wc->shared_level < level); 8816 if (level < wc->shared_level) 8817 goto out; 8818 8819 ret = find_next_key(path, level + 1, &wc->update_progress); 8820 if (ret > 0) 8821 wc->update_ref = 0; 8822 8823 wc->stage = DROP_REFERENCE; 8824 wc->shared_level = -1; 8825 path->slots[level] = 0; 8826 8827 /* 8828 * check reference count again if the block isn't locked. 8829 * we should start walking down the tree again if reference 8830 * count is one. 8831 */ 8832 if (!path->locks[level]) { 8833 BUG_ON(level == 0); 8834 btrfs_tree_lock(eb); 8835 btrfs_set_lock_blocking(eb); 8836 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8837 8838 ret = btrfs_lookup_extent_info(trans, fs_info, 8839 eb->start, level, 1, 8840 &wc->refs[level], 8841 &wc->flags[level]); 8842 if (ret < 0) { 8843 btrfs_tree_unlock_rw(eb, path->locks[level]); 8844 path->locks[level] = 0; 8845 return ret; 8846 } 8847 BUG_ON(wc->refs[level] == 0); 8848 if (wc->refs[level] == 1) { 8849 btrfs_tree_unlock_rw(eb, path->locks[level]); 8850 path->locks[level] = 0; 8851 return 1; 8852 } 8853 } 8854 } 8855 8856 /* wc->stage == DROP_REFERENCE */ 8857 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 8858 8859 if (wc->refs[level] == 1) { 8860 if (level == 0) { 8861 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8862 ret = btrfs_dec_ref(trans, root, eb, 1); 8863 else 8864 ret = btrfs_dec_ref(trans, root, eb, 0); 8865 BUG_ON(ret); /* -ENOMEM */ 8866 ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb); 8867 if (ret) { 8868 btrfs_err_rl(fs_info, 8869 "error %d accounting leaf items. 
Quota is out of sync, rescan required.", 8870 ret); 8871 } 8872 } 8873 /* make block locked assertion in clean_tree_block happy */ 8874 if (!path->locks[level] && 8875 btrfs_header_generation(eb) == trans->transid) { 8876 btrfs_tree_lock(eb); 8877 btrfs_set_lock_blocking(eb); 8878 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8879 } 8880 clean_tree_block(fs_info, eb); 8881 } 8882 8883 if (eb == root->node) { 8884 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8885 parent = eb->start; 8886 else 8887 BUG_ON(root->root_key.objectid != 8888 btrfs_header_owner(eb)); 8889 } else { 8890 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8891 parent = path->nodes[level + 1]->start; 8892 else 8893 BUG_ON(root->root_key.objectid != 8894 btrfs_header_owner(path->nodes[level + 1])); 8895 } 8896 8897 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 8898 out: 8899 wc->refs[level] = 0; 8900 wc->flags[level] = 0; 8901 return 0; 8902 } 8903 8904 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 8905 struct btrfs_root *root, 8906 struct btrfs_path *path, 8907 struct walk_control *wc) 8908 { 8909 int level = wc->level; 8910 int lookup_info = 1; 8911 int ret; 8912 8913 while (level >= 0) { 8914 ret = walk_down_proc(trans, root, path, wc, lookup_info); 8915 if (ret > 0) 8916 break; 8917 8918 if (level == 0) 8919 break; 8920 8921 if (path->slots[level] >= 8922 btrfs_header_nritems(path->nodes[level])) 8923 break; 8924 8925 ret = do_walk_down(trans, root, path, wc, &lookup_info); 8926 if (ret > 0) { 8927 path->slots[level]++; 8928 continue; 8929 } else if (ret < 0) 8930 return ret; 8931 level = wc->level; 8932 } 8933 return 0; 8934 } 8935 8936 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 8937 struct btrfs_root *root, 8938 struct btrfs_path *path, 8939 struct walk_control *wc, int max_level) 8940 { 8941 int level = wc->level; 8942 int ret; 8943 8944 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 8945 while (level < max_level && path->nodes[level]) { 8946 wc->level = level; 8947 if (path->slots[level] + 1 < 8948 btrfs_header_nritems(path->nodes[level])) { 8949 path->slots[level]++; 8950 return 0; 8951 } else { 8952 ret = walk_up_proc(trans, root, path, wc); 8953 if (ret > 0) 8954 return 0; 8955 8956 if (path->locks[level]) { 8957 btrfs_tree_unlock_rw(path->nodes[level], 8958 path->locks[level]); 8959 path->locks[level] = 0; 8960 } 8961 free_extent_buffer(path->nodes[level]); 8962 path->nodes[level] = NULL; 8963 level++; 8964 } 8965 } 8966 return 1; 8967 } 8968 8969 /* 8970 * drop a subvolume tree. 8971 * 8972 * this function traverses the tree freeing any blocks that are only 8973 * referenced by the tree. 8974 * 8975 * when a shared tree block is found, this function decreases its 8976 * reference count by one. if update_ref is true, this function 8977 * also makes sure backrefs for the shared block and all lower level 8978 * blocks are properly updated.
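 *
 * The current position is recorded in root_item->drop_progress and
 * drop_level after each batch of work, so an interrupted drop can be
 * resumed where it left off.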
8979 * 8980 * If called with for_reloc == 0, may exit early with -EAGAIN 8981 */ 8982 int btrfs_drop_snapshot(struct btrfs_root *root, 8983 struct btrfs_block_rsv *block_rsv, int update_ref, 8984 int for_reloc) 8985 { 8986 struct btrfs_fs_info *fs_info = root->fs_info; 8987 struct btrfs_path *path; 8988 struct btrfs_trans_handle *trans; 8989 struct btrfs_root *tree_root = fs_info->tree_root; 8990 struct btrfs_root_item *root_item = &root->root_item; 8991 struct walk_control *wc; 8992 struct btrfs_key key; 8993 int err = 0; 8994 int ret; 8995 int level; 8996 bool root_dropped = false; 8997 8998 btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid); 8999 9000 path = btrfs_alloc_path(); 9001 if (!path) { 9002 err = -ENOMEM; 9003 goto out; 9004 } 9005 9006 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9007 if (!wc) { 9008 btrfs_free_path(path); 9009 err = -ENOMEM; 9010 goto out; 9011 } 9012 9013 trans = btrfs_start_transaction(tree_root, 0); 9014 if (IS_ERR(trans)) { 9015 err = PTR_ERR(trans); 9016 goto out_free; 9017 } 9018 9019 if (block_rsv) 9020 trans->block_rsv = block_rsv; 9021 9022 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9023 level = btrfs_header_level(root->node); 9024 path->nodes[level] = btrfs_lock_root_node(root); 9025 btrfs_set_lock_blocking(path->nodes[level]); 9026 path->slots[level] = 0; 9027 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9028 memset(&wc->update_progress, 0, 9029 sizeof(wc->update_progress)); 9030 } else { 9031 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9032 memcpy(&wc->update_progress, &key, 9033 sizeof(wc->update_progress)); 9034 9035 level = root_item->drop_level; 9036 BUG_ON(level == 0); 9037 path->lowest_level = level; 9038 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9039 path->lowest_level = 0; 9040 if (ret < 0) { 9041 err = ret; 9042 goto out_end_trans; 9043 } 9044 WARN_ON(ret > 0); 9045 9046 /* 9047 * unlock our path, this is safe because only this 9048 * function is allowed to delete this snapshot 9049 */ 9050 btrfs_unlock_up_safe(path, 0); 9051 9052 level = btrfs_header_level(root->node); 9053 while (1) { 9054 btrfs_tree_lock(path->nodes[level]); 9055 btrfs_set_lock_blocking(path->nodes[level]); 9056 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9057 9058 ret = btrfs_lookup_extent_info(trans, fs_info, 9059 path->nodes[level]->start, 9060 level, 1, &wc->refs[level], 9061 &wc->flags[level]); 9062 if (ret < 0) { 9063 err = ret; 9064 goto out_end_trans; 9065 } 9066 BUG_ON(wc->refs[level] == 0); 9067 9068 if (level == root_item->drop_level) 9069 break; 9070 9071 btrfs_tree_unlock(path->nodes[level]); 9072 path->locks[level] = 0; 9073 WARN_ON(wc->refs[level] != 1); 9074 level--; 9075 } 9076 } 9077 9078 wc->level = level; 9079 wc->shared_level = -1; 9080 wc->stage = DROP_REFERENCE; 9081 wc->update_ref = update_ref; 9082 wc->keep_locks = 0; 9083 wc->for_reloc = for_reloc; 9084 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9085 9086 while (1) { 9087 9088 ret = walk_down_tree(trans, root, path, wc); 9089 if (ret < 0) { 9090 err = ret; 9091 break; 9092 } 9093 9094 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9095 if (ret < 0) { 9096 err = ret; 9097 break; 9098 } 9099 9100 if (ret > 0) { 9101 BUG_ON(wc->stage != DROP_REFERENCE); 9102 break; 9103 } 9104 9105 if (wc->stage == DROP_REFERENCE) { 9106 level = wc->level; 9107 btrfs_node_key(path->nodes[level], 9108 &root_item->drop_progress, 9109 path->slots[level]); 9110 root_item->drop_level = level; 9111 } 9112 9113 BUG_ON(wc->level == 0); 9114 if 
(btrfs_should_end_transaction(trans) || 9115 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 9116 ret = btrfs_update_root(trans, tree_root, 9117 &root->root_key, 9118 root_item); 9119 if (ret) { 9120 btrfs_abort_transaction(trans, ret); 9121 err = ret; 9122 goto out_end_trans; 9123 } 9124 9125 btrfs_end_transaction_throttle(trans); 9126 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 9127 btrfs_debug(fs_info, 9128 "drop snapshot early exit"); 9129 err = -EAGAIN; 9130 goto out_free; 9131 } 9132 9133 trans = btrfs_start_transaction(tree_root, 0); 9134 if (IS_ERR(trans)) { 9135 err = PTR_ERR(trans); 9136 goto out_free; 9137 } 9138 if (block_rsv) 9139 trans->block_rsv = block_rsv; 9140 } 9141 } 9142 btrfs_release_path(path); 9143 if (err) 9144 goto out_end_trans; 9145 9146 ret = btrfs_del_root(trans, tree_root, &root->root_key); 9147 if (ret) { 9148 btrfs_abort_transaction(trans, ret); 9149 goto out_end_trans; 9150 } 9151 9152 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9153 ret = btrfs_find_root(tree_root, &root->root_key, path, 9154 NULL, NULL); 9155 if (ret < 0) { 9156 btrfs_abort_transaction(trans, ret); 9157 err = ret; 9158 goto out_end_trans; 9159 } else if (ret > 0) { 9160 /* if we fail to delete the orphan item this time 9161 * around, it'll get picked up the next time. 9162 * 9163 * The most common failure here is just -ENOENT. 9164 */ 9165 btrfs_del_orphan_item(trans, tree_root, 9166 root->root_key.objectid); 9167 } 9168 } 9169 9170 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9171 btrfs_add_dropped_root(trans, root); 9172 } else { 9173 free_extent_buffer(root->node); 9174 free_extent_buffer(root->commit_root); 9175 btrfs_put_fs_root(root); 9176 } 9177 root_dropped = true; 9178 out_end_trans: 9179 btrfs_end_transaction_throttle(trans); 9180 out_free: 9181 kfree(wc); 9182 btrfs_free_path(path); 9183 out: 9184 /* 9185 * So if we need to stop dropping the snapshot for whatever reason we 9186 * need to make sure to add it back to the dead root list so that we 9187 * keep trying to do the work later. This also cleans up roots if we 9188 * don't have it in the radix (like when we recover after a power fail 9189 * or unmount) so we don't leak memory. 9190 */ 9191 if (!for_reloc && root_dropped == false) 9192 btrfs_add_dead_root(root); 9193 if (err && err != -EAGAIN) 9194 btrfs_handle_fs_error(fs_info, err, NULL); 9195 return err; 9196 } 9197 9198 /* 9199 * drop subtree rooted at tree block 'node'. 
9200 * 9201 * NOTE: this function will unlock and release tree block 'node' 9202 * only used by relocation code 9203 */ 9204 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9205 struct btrfs_root *root, 9206 struct extent_buffer *node, 9207 struct extent_buffer *parent) 9208 { 9209 struct btrfs_fs_info *fs_info = root->fs_info; 9210 struct btrfs_path *path; 9211 struct walk_control *wc; 9212 int level; 9213 int parent_level; 9214 int ret = 0; 9215 int wret; 9216 9217 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9218 9219 path = btrfs_alloc_path(); 9220 if (!path) 9221 return -ENOMEM; 9222 9223 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9224 if (!wc) { 9225 btrfs_free_path(path); 9226 return -ENOMEM; 9227 } 9228 9229 btrfs_assert_tree_locked(parent); 9230 parent_level = btrfs_header_level(parent); 9231 extent_buffer_get(parent); 9232 path->nodes[parent_level] = parent; 9233 path->slots[parent_level] = btrfs_header_nritems(parent); 9234 9235 btrfs_assert_tree_locked(node); 9236 level = btrfs_header_level(node); 9237 path->nodes[level] = node; 9238 path->slots[level] = 0; 9239 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9240 9241 wc->refs[parent_level] = 1; 9242 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9243 wc->level = level; 9244 wc->shared_level = -1; 9245 wc->stage = DROP_REFERENCE; 9246 wc->update_ref = 0; 9247 wc->keep_locks = 1; 9248 wc->for_reloc = 1; 9249 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9250 9251 while (1) { 9252 wret = walk_down_tree(trans, root, path, wc); 9253 if (wret < 0) { 9254 ret = wret; 9255 break; 9256 } 9257 9258 wret = walk_up_tree(trans, root, path, wc, parent_level); 9259 if (wret < 0) 9260 ret = wret; 9261 if (wret != 0) 9262 break; 9263 } 9264 9265 kfree(wc); 9266 btrfs_free_path(path); 9267 return ret; 9268 } 9269 9270 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 9271 { 9272 u64 num_devices; 9273 u64 stripped; 9274 9275 /* 9276 * if restripe for this chunk_type is on pick target profile and 9277 * return, otherwise do the usual balance 9278 */ 9279 stripped = get_restripe_target(fs_info, flags); 9280 if (stripped) 9281 return extended_to_chunk(stripped); 9282 9283 num_devices = fs_info->fs_devices->rw_devices; 9284 9285 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9286 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9287 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9288 9289 if (num_devices == 1) { 9290 stripped |= BTRFS_BLOCK_GROUP_DUP; 9291 stripped = flags & ~stripped; 9292 9293 /* turn raid0 into single device chunks */ 9294 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9295 return stripped; 9296 9297 /* turn mirroring into duplication */ 9298 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9299 BTRFS_BLOCK_GROUP_RAID10)) 9300 return stripped | BTRFS_BLOCK_GROUP_DUP; 9301 } else { 9302 /* they already had raid on here, just return */ 9303 if (flags & stripped) 9304 return flags; 9305 9306 stripped |= BTRFS_BLOCK_GROUP_DUP; 9307 stripped = flags & ~stripped; 9308 9309 /* switch duplicated blocks with raid1 */ 9310 if (flags & BTRFS_BLOCK_GROUP_DUP) 9311 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9312 9313 /* this is drive concat, leave it alone */ 9314 } 9315 9316 return flags; 9317 } 9318 9319 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9320 { 9321 struct btrfs_space_info *sinfo = cache->space_info; 9322 u64 num_bytes; 9323 u64 min_allocable_bytes; 9324 int ret = -ENOSPC; 9325 9326 /* 9327 * We need some metadata space and system metadata space for 
9328 * allocating chunks in some corner cases until we force to set 9329 * it to be readonly. 9330 */ 9331 if ((sinfo->flags & 9332 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9333 !force) 9334 min_allocable_bytes = SZ_1M; 9335 else 9336 min_allocable_bytes = 0; 9337 9338 spin_lock(&sinfo->lock); 9339 spin_lock(&cache->lock); 9340 9341 if (cache->ro) { 9342 cache->ro++; 9343 ret = 0; 9344 goto out; 9345 } 9346 9347 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9348 cache->bytes_super - btrfs_block_group_used(&cache->item); 9349 9350 if (btrfs_space_info_used(sinfo, true) + num_bytes + 9351 min_allocable_bytes <= sinfo->total_bytes) { 9352 sinfo->bytes_readonly += num_bytes; 9353 cache->ro++; 9354 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9355 ret = 0; 9356 } 9357 out: 9358 spin_unlock(&cache->lock); 9359 spin_unlock(&sinfo->lock); 9360 return ret; 9361 } 9362 9363 int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info, 9364 struct btrfs_block_group_cache *cache) 9365 9366 { 9367 struct btrfs_trans_handle *trans; 9368 u64 alloc_flags; 9369 int ret; 9370 9371 again: 9372 trans = btrfs_join_transaction(fs_info->extent_root); 9373 if (IS_ERR(trans)) 9374 return PTR_ERR(trans); 9375 9376 /* 9377 * we're not allowed to set block groups readonly after the dirty 9378 * block groups cache has started writing. If it already started, 9379 * back off and let this transaction commit 9380 */ 9381 mutex_lock(&fs_info->ro_block_group_mutex); 9382 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9383 u64 transid = trans->transid; 9384 9385 mutex_unlock(&fs_info->ro_block_group_mutex); 9386 btrfs_end_transaction(trans); 9387 9388 ret = btrfs_wait_for_commit(fs_info, transid); 9389 if (ret) 9390 return ret; 9391 goto again; 9392 } 9393 9394 /* 9395 * if we are changing raid levels, try to allocate a corresponding 9396 * block group with the new raid level. 9397 */ 9398 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9399 if (alloc_flags != cache->flags) { 9400 ret = do_chunk_alloc(trans, fs_info, alloc_flags, 9401 CHUNK_ALLOC_FORCE); 9402 /* 9403 * ENOSPC is allowed here, we may have enough space 9404 * already allocated at the new raid level to 9405 * carry on 9406 */ 9407 if (ret == -ENOSPC) 9408 ret = 0; 9409 if (ret < 0) 9410 goto out; 9411 } 9412 9413 ret = inc_block_group_ro(cache, 0); 9414 if (!ret) 9415 goto out; 9416 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 9417 ret = do_chunk_alloc(trans, fs_info, alloc_flags, 9418 CHUNK_ALLOC_FORCE); 9419 if (ret < 0) 9420 goto out; 9421 ret = inc_block_group_ro(cache, 0); 9422 out: 9423 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9424 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9425 mutex_lock(&fs_info->chunk_mutex); 9426 check_system_chunk(trans, fs_info, alloc_flags); 9427 mutex_unlock(&fs_info->chunk_mutex); 9428 } 9429 mutex_unlock(&fs_info->ro_block_group_mutex); 9430 9431 btrfs_end_transaction(trans); 9432 return ret; 9433 } 9434 9435 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 9436 struct btrfs_fs_info *fs_info, u64 type) 9437 { 9438 u64 alloc_flags = get_alloc_profile(fs_info, type); 9439 9440 return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE); 9441 } 9442 9443 /* 9444 * helper to account the unused space of all the readonly block group in the 9445 * space_info. takes mirrors into account. 
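 * RAID1/RAID10/DUP block groups are counted with a factor of 2, so
 * the value returned is raw device space rather than usable space.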
9446 */ 9447 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9448 { 9449 struct btrfs_block_group_cache *block_group; 9450 u64 free_bytes = 0; 9451 int factor; 9452 9453 /* It's df, we don't care if it's racy */ 9454 if (list_empty(&sinfo->ro_bgs)) 9455 return 0; 9456 9457 spin_lock(&sinfo->lock); 9458 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9459 spin_lock(&block_group->lock); 9460 9461 if (!block_group->ro) { 9462 spin_unlock(&block_group->lock); 9463 continue; 9464 } 9465 9466 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 9467 BTRFS_BLOCK_GROUP_RAID10 | 9468 BTRFS_BLOCK_GROUP_DUP)) 9469 factor = 2; 9470 else 9471 factor = 1; 9472 9473 free_bytes += (block_group->key.offset - 9474 btrfs_block_group_used(&block_group->item)) * 9475 factor; 9476 9477 spin_unlock(&block_group->lock); 9478 } 9479 spin_unlock(&sinfo->lock); 9480 9481 return free_bytes; 9482 } 9483 9484 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) 9485 { 9486 struct btrfs_space_info *sinfo = cache->space_info; 9487 u64 num_bytes; 9488 9489 BUG_ON(!cache->ro); 9490 9491 spin_lock(&sinfo->lock); 9492 spin_lock(&cache->lock); 9493 if (!--cache->ro) { 9494 num_bytes = cache->key.offset - cache->reserved - 9495 cache->pinned - cache->bytes_super - 9496 btrfs_block_group_used(&cache->item); 9497 sinfo->bytes_readonly -= num_bytes; 9498 list_del_init(&cache->ro_list); 9499 } 9500 spin_unlock(&cache->lock); 9501 spin_unlock(&sinfo->lock); 9502 } 9503 9504 /* 9505 * checks to see if its even possible to relocate this block group. 9506 * 9507 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9508 * ok to go ahead and try. 9509 */ 9510 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) 9511 { 9512 struct btrfs_root *root = fs_info->extent_root; 9513 struct btrfs_block_group_cache *block_group; 9514 struct btrfs_space_info *space_info; 9515 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 9516 struct btrfs_device *device; 9517 struct btrfs_trans_handle *trans; 9518 u64 min_free; 9519 u64 dev_min = 1; 9520 u64 dev_nr = 0; 9521 u64 target; 9522 int debug; 9523 int index; 9524 int full = 0; 9525 int ret = 0; 9526 9527 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); 9528 9529 block_group = btrfs_lookup_block_group(fs_info, bytenr); 9530 9531 /* odd, couldn't find the block group, leave it alone */ 9532 if (!block_group) { 9533 if (debug) 9534 btrfs_warn(fs_info, 9535 "can't find block group for bytenr %llu", 9536 bytenr); 9537 return -1; 9538 } 9539 9540 min_free = btrfs_block_group_used(&block_group->item); 9541 9542 /* no bytes used, we're good */ 9543 if (!min_free) 9544 goto out; 9545 9546 space_info = block_group->space_info; 9547 spin_lock(&space_info->lock); 9548 9549 full = space_info->full; 9550 9551 /* 9552 * if this is the last block group we have in this space, we can't 9553 * relocate it unless we're able to allocate a new chunk below. 9554 * 9555 * Otherwise, we need to make sure we have room in the space to handle 9556 * all of the extents from this block group. 
If we can, we're good 9557 */ 9558 if ((space_info->total_bytes != block_group->key.offset) && 9559 (btrfs_space_info_used(space_info, false) + min_free < 9560 space_info->total_bytes)) { 9561 spin_unlock(&space_info->lock); 9562 goto out; 9563 } 9564 spin_unlock(&space_info->lock); 9565 9566 /* 9567 * ok we don't have enough space, but maybe we have free space on our 9568 * devices to allocate new chunks for relocation, so loop through our 9569 * alloc devices and guess if we have enough space. if this block 9570 * group is going to be restriped, run checks against the target 9571 * profile instead of the current one. 9572 */ 9573 ret = -1; 9574 9575 /* 9576 * index: 9577 * 0: raid10 9578 * 1: raid1 9579 * 2: dup 9580 * 3: raid0 9581 * 4: single 9582 */ 9583 target = get_restripe_target(fs_info, block_group->flags); 9584 if (target) { 9585 index = __get_raid_index(extended_to_chunk(target)); 9586 } else { 9587 /* 9588 * this is just a balance, so if we were marked as full 9589 * we know there is no space for a new chunk 9590 */ 9591 if (full) { 9592 if (debug) 9593 btrfs_warn(fs_info, 9594 "no space to alloc new chunk for block group %llu", 9595 block_group->key.objectid); 9596 goto out; 9597 } 9598 9599 index = get_block_group_index(block_group); 9600 } 9601 9602 if (index == BTRFS_RAID_RAID10) { 9603 dev_min = 4; 9604 /* Divide by 2 */ 9605 min_free >>= 1; 9606 } else if (index == BTRFS_RAID_RAID1) { 9607 dev_min = 2; 9608 } else if (index == BTRFS_RAID_DUP) { 9609 /* Multiply by 2 */ 9610 min_free <<= 1; 9611 } else if (index == BTRFS_RAID_RAID0) { 9612 dev_min = fs_devices->rw_devices; 9613 min_free = div64_u64(min_free, dev_min); 9614 } 9615 9616 /* We need to do this so that we can look at pending chunks */ 9617 trans = btrfs_join_transaction(root); 9618 if (IS_ERR(trans)) { 9619 ret = PTR_ERR(trans); 9620 goto out; 9621 } 9622 9623 mutex_lock(&fs_info->chunk_mutex); 9624 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 9625 u64 dev_offset; 9626 9627 /* 9628 * check to make sure we can actually find a chunk with enough 9629 * space to fit our block group in. 
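 * Devices that are the target of a running device replace are
 * skipped via the is_tgtdev_for_dev_replace check.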
9630 */ 9631 if (device->total_bytes > device->bytes_used + min_free && 9632 !device->is_tgtdev_for_dev_replace) { 9633 ret = find_free_dev_extent(trans, device, min_free, 9634 &dev_offset, NULL); 9635 if (!ret) 9636 dev_nr++; 9637 9638 if (dev_nr >= dev_min) 9639 break; 9640 9641 ret = -1; 9642 } 9643 } 9644 if (debug && ret == -1) 9645 btrfs_warn(fs_info, 9646 "no space to allocate a new chunk for block group %llu", 9647 block_group->key.objectid); 9648 mutex_unlock(&fs_info->chunk_mutex); 9649 btrfs_end_transaction(trans); 9650 out: 9651 btrfs_put_block_group(block_group); 9652 return ret; 9653 } 9654 9655 static int find_first_block_group(struct btrfs_fs_info *fs_info, 9656 struct btrfs_path *path, 9657 struct btrfs_key *key) 9658 { 9659 struct btrfs_root *root = fs_info->extent_root; 9660 int ret = 0; 9661 struct btrfs_key found_key; 9662 struct extent_buffer *leaf; 9663 int slot; 9664 9665 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 9666 if (ret < 0) 9667 goto out; 9668 9669 while (1) { 9670 slot = path->slots[0]; 9671 leaf = path->nodes[0]; 9672 if (slot >= btrfs_header_nritems(leaf)) { 9673 ret = btrfs_next_leaf(root, path); 9674 if (ret == 0) 9675 continue; 9676 if (ret < 0) 9677 goto out; 9678 break; 9679 } 9680 btrfs_item_key_to_cpu(leaf, &found_key, slot); 9681 9682 if (found_key.objectid >= key->objectid && 9683 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 9684 struct extent_map_tree *em_tree; 9685 struct extent_map *em; 9686 9687 em_tree = &root->fs_info->mapping_tree.map_tree; 9688 read_lock(&em_tree->lock); 9689 em = lookup_extent_mapping(em_tree, found_key.objectid, 9690 found_key.offset); 9691 read_unlock(&em_tree->lock); 9692 if (!em) { 9693 btrfs_err(fs_info, 9694 "logical %llu len %llu found bg but no related chunk", 9695 found_key.objectid, found_key.offset); 9696 ret = -ENOENT; 9697 } else { 9698 ret = 0; 9699 } 9700 free_extent_map(em); 9701 goto out; 9702 } 9703 path->slots[0]++; 9704 } 9705 out: 9706 return ret; 9707 } 9708 9709 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 9710 { 9711 struct btrfs_block_group_cache *block_group; 9712 u64 last = 0; 9713 9714 while (1) { 9715 struct inode *inode; 9716 9717 block_group = btrfs_lookup_first_block_group(info, last); 9718 while (block_group) { 9719 spin_lock(&block_group->lock); 9720 if (block_group->iref) 9721 break; 9722 spin_unlock(&block_group->lock); 9723 block_group = next_block_group(info, block_group); 9724 } 9725 if (!block_group) { 9726 if (last == 0) 9727 break; 9728 last = 0; 9729 continue; 9730 } 9731 9732 inode = block_group->inode; 9733 block_group->iref = 0; 9734 block_group->inode = NULL; 9735 spin_unlock(&block_group->lock); 9736 ASSERT(block_group->io_ctl.inode == NULL); 9737 iput(inode); 9738 last = block_group->key.objectid + block_group->key.offset; 9739 btrfs_put_block_group(block_group); 9740 } 9741 } 9742 9743 int btrfs_free_block_groups(struct btrfs_fs_info *info) 9744 { 9745 struct btrfs_block_group_cache *block_group; 9746 struct btrfs_space_info *space_info; 9747 struct btrfs_caching_control *caching_ctl; 9748 struct rb_node *n; 9749 9750 down_write(&info->commit_root_sem); 9751 while (!list_empty(&info->caching_block_groups)) { 9752 caching_ctl = list_entry(info->caching_block_groups.next, 9753 struct btrfs_caching_control, list); 9754 list_del(&caching_ctl->list); 9755 put_caching_control(caching_ctl); 9756 } 9757 up_write(&info->commit_root_sem); 9758 9759 spin_lock(&info->unused_bgs_lock); 9760 while (!list_empty(&info->unused_bgs)) { 9761 block_group = 
list_first_entry(&info->unused_bgs, 9762 struct btrfs_block_group_cache, 9763 bg_list); 9764 list_del_init(&block_group->bg_list); 9765 btrfs_put_block_group(block_group); 9766 } 9767 spin_unlock(&info->unused_bgs_lock); 9768 9769 spin_lock(&info->block_group_cache_lock); 9770 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 9771 block_group = rb_entry(n, struct btrfs_block_group_cache, 9772 cache_node); 9773 rb_erase(&block_group->cache_node, 9774 &info->block_group_cache_tree); 9775 RB_CLEAR_NODE(&block_group->cache_node); 9776 spin_unlock(&info->block_group_cache_lock); 9777 9778 down_write(&block_group->space_info->groups_sem); 9779 list_del(&block_group->list); 9780 up_write(&block_group->space_info->groups_sem); 9781 9782 if (block_group->cached == BTRFS_CACHE_STARTED) 9783 wait_block_group_cache_done(block_group); 9784 9785 /* 9786 * We haven't cached this block group, which means we could 9787 * possibly have excluded extents on this block group. 9788 */ 9789 if (block_group->cached == BTRFS_CACHE_NO || 9790 block_group->cached == BTRFS_CACHE_ERROR) 9791 free_excluded_extents(info, block_group); 9792 9793 btrfs_remove_free_space_cache(block_group); 9794 ASSERT(list_empty(&block_group->dirty_list)); 9795 ASSERT(list_empty(&block_group->io_list)); 9796 ASSERT(list_empty(&block_group->bg_list)); 9797 ASSERT(atomic_read(&block_group->count) == 1); 9798 btrfs_put_block_group(block_group); 9799 9800 spin_lock(&info->block_group_cache_lock); 9801 } 9802 spin_unlock(&info->block_group_cache_lock); 9803 9804 /* now that all the block groups are freed, go through and 9805 * free all the space_info structs. This is only called during 9806 * the final stages of unmount, and so we know nobody is 9807 * using them. We call synchronize_rcu() once before we start, 9808 * just to be on the safe side. 9809 */ 9810 synchronize_rcu(); 9811 9812 release_global_block_rsv(info); 9813 9814 while (!list_empty(&info->space_info)) { 9815 int i; 9816 9817 space_info = list_entry(info->space_info.next, 9818 struct btrfs_space_info, 9819 list); 9820 9821 /* 9822 * Do not hide this behind enospc_debug, this is actually 9823 * important and indicates a real bug if this happens. 
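 * (Any bytes still pinned, reserved or may_use at this point were never
 * released, so warn and dump the space info even without enospc_debug.)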
9824 */ 9825 if (WARN_ON(space_info->bytes_pinned > 0 || 9826 space_info->bytes_reserved > 0 || 9827 space_info->bytes_may_use > 0)) 9828 dump_space_info(info, space_info, 0, 0); 9829 list_del(&space_info->list); 9830 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 9831 struct kobject *kobj; 9832 kobj = space_info->block_group_kobjs[i]; 9833 space_info->block_group_kobjs[i] = NULL; 9834 if (kobj) { 9835 kobject_del(kobj); 9836 kobject_put(kobj); 9837 } 9838 } 9839 kobject_del(&space_info->kobj); 9840 kobject_put(&space_info->kobj); 9841 } 9842 return 0; 9843 } 9844 9845 static void __link_block_group(struct btrfs_space_info *space_info, 9846 struct btrfs_block_group_cache *cache) 9847 { 9848 int index = get_block_group_index(cache); 9849 bool first = false; 9850 9851 down_write(&space_info->groups_sem); 9852 if (list_empty(&space_info->block_groups[index])) 9853 first = true; 9854 list_add_tail(&cache->list, &space_info->block_groups[index]); 9855 up_write(&space_info->groups_sem); 9856 9857 if (first) { 9858 struct raid_kobject *rkobj; 9859 int ret; 9860 9861 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 9862 if (!rkobj) 9863 goto out_err; 9864 rkobj->raid_type = index; 9865 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 9866 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 9867 "%s", get_raid_name(index)); 9868 if (ret) { 9869 kobject_put(&rkobj->kobj); 9870 goto out_err; 9871 } 9872 space_info->block_group_kobjs[index] = &rkobj->kobj; 9873 } 9874 9875 return; 9876 out_err: 9877 btrfs_warn(cache->fs_info, 9878 "failed to add kobject for block cache, ignoring"); 9879 } 9880 9881 static struct btrfs_block_group_cache * 9882 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, 9883 u64 start, u64 size) 9884 { 9885 struct btrfs_block_group_cache *cache; 9886 9887 cache = kzalloc(sizeof(*cache), GFP_NOFS); 9888 if (!cache) 9889 return NULL; 9890 9891 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 9892 GFP_NOFS); 9893 if (!cache->free_space_ctl) { 9894 kfree(cache); 9895 return NULL; 9896 } 9897 9898 cache->key.objectid = start; 9899 cache->key.offset = size; 9900 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9901 9902 cache->sectorsize = fs_info->sectorsize; 9903 cache->fs_info = fs_info; 9904 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, 9905 &fs_info->mapping_tree, 9906 start); 9907 set_free_space_tree_thresholds(cache); 9908 9909 atomic_set(&cache->count, 1); 9910 spin_lock_init(&cache->lock); 9911 init_rwsem(&cache->data_rwsem); 9912 INIT_LIST_HEAD(&cache->list); 9913 INIT_LIST_HEAD(&cache->cluster_list); 9914 INIT_LIST_HEAD(&cache->bg_list); 9915 INIT_LIST_HEAD(&cache->ro_list); 9916 INIT_LIST_HEAD(&cache->dirty_list); 9917 INIT_LIST_HEAD(&cache->io_list); 9918 btrfs_init_free_space_ctl(cache); 9919 atomic_set(&cache->trimming, 0); 9920 mutex_init(&cache->free_space_lock); 9921 9922 return cache; 9923 } 9924 9925 int btrfs_read_block_groups(struct btrfs_fs_info *info) 9926 { 9927 struct btrfs_path *path; 9928 int ret; 9929 struct btrfs_block_group_cache *cache; 9930 struct btrfs_space_info *space_info; 9931 struct btrfs_key key; 9932 struct btrfs_key found_key; 9933 struct extent_buffer *leaf; 9934 int need_clear = 0; 9935 u64 cache_gen; 9936 u64 feature; 9937 int mixed; 9938 9939 feature = btrfs_super_incompat_flags(info->super_copy); 9940 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); 9941 9942 key.objectid = 0; 9943 key.offset = 0; 9944 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9945 path = btrfs_alloc_path(); 9946 if (!path) 9947 return -ENOMEM; 
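	/*
	 * Block group items are walked in ascending key order below, so hint
	 * forward readahead on the extent tree path.
	 */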
9948 path->reada = READA_FORWARD; 9949 9950 cache_gen = btrfs_super_cache_generation(info->super_copy); 9951 if (btrfs_test_opt(info, SPACE_CACHE) && 9952 btrfs_super_generation(info->super_copy) != cache_gen) 9953 need_clear = 1; 9954 if (btrfs_test_opt(info, CLEAR_CACHE)) 9955 need_clear = 1; 9956 9957 while (1) { 9958 ret = find_first_block_group(info, path, &key); 9959 if (ret > 0) 9960 break; 9961 if (ret != 0) 9962 goto error; 9963 9964 leaf = path->nodes[0]; 9965 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 9966 9967 cache = btrfs_create_block_group_cache(info, found_key.objectid, 9968 found_key.offset); 9969 if (!cache) { 9970 ret = -ENOMEM; 9971 goto error; 9972 } 9973 9974 if (need_clear) { 9975 /* 9976 * When we mount with old space cache, we need to 9977 * set BTRFS_DC_CLEAR and set dirty flag. 9978 * 9979 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 9980 * truncate the old free space cache inode and 9981 * setup a new one. 9982 * b) Setting 'dirty flag' makes sure that we flush 9983 * the new space cache info onto disk. 9984 */ 9985 if (btrfs_test_opt(info, SPACE_CACHE)) 9986 cache->disk_cache_state = BTRFS_DC_CLEAR; 9987 } 9988 9989 read_extent_buffer(leaf, &cache->item, 9990 btrfs_item_ptr_offset(leaf, path->slots[0]), 9991 sizeof(cache->item)); 9992 cache->flags = btrfs_block_group_flags(&cache->item); 9993 if (!mixed && 9994 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 9995 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 9996 btrfs_err(info, 9997 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 9998 cache->key.objectid); 9999 ret = -EINVAL; 10000 goto error; 10001 } 10002 10003 key.objectid = found_key.objectid + found_key.offset; 10004 btrfs_release_path(path); 10005 10006 /* 10007 * We need to exclude the super stripes now so that the space 10008 * info has super bytes accounted for, otherwise we'll think 10009 * we have more space than we actually do. 10010 */ 10011 ret = exclude_super_stripes(info, cache); 10012 if (ret) { 10013 /* 10014 * We may have excluded something, so call this just in 10015 * case. 10016 */ 10017 free_excluded_extents(info, cache); 10018 btrfs_put_block_group(cache); 10019 goto error; 10020 } 10021 10022 /* 10023 * check for two cases, either we are full, and therefore 10024 * don't need to bother with the caching work since we won't 10025 * find any space, or we are empty, and we can just add all 10026 * the space in and be done with it. This saves us _alot_ of 10027 * time, particularly in the full case. 
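 *
 * (A completely full group is detected by its length, found_key.offset,
 * matching the used bytes; an empty one has zero used bytes and gets its
 * whole range added as free space immediately.)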
10028 */ 10029 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 10030 cache->last_byte_to_unpin = (u64)-1; 10031 cache->cached = BTRFS_CACHE_FINISHED; 10032 free_excluded_extents(info, cache); 10033 } else if (btrfs_block_group_used(&cache->item) == 0) { 10034 cache->last_byte_to_unpin = (u64)-1; 10035 cache->cached = BTRFS_CACHE_FINISHED; 10036 add_new_free_space(cache, info, 10037 found_key.objectid, 10038 found_key.objectid + 10039 found_key.offset); 10040 free_excluded_extents(info, cache); 10041 } 10042 10043 ret = btrfs_add_block_group_cache(info, cache); 10044 if (ret) { 10045 btrfs_remove_free_space_cache(cache); 10046 btrfs_put_block_group(cache); 10047 goto error; 10048 } 10049 10050 trace_btrfs_add_block_group(info, cache, 0); 10051 ret = update_space_info(info, cache->flags, found_key.offset, 10052 btrfs_block_group_used(&cache->item), 10053 cache->bytes_super, &space_info); 10054 if (ret) { 10055 btrfs_remove_free_space_cache(cache); 10056 spin_lock(&info->block_group_cache_lock); 10057 rb_erase(&cache->cache_node, 10058 &info->block_group_cache_tree); 10059 RB_CLEAR_NODE(&cache->cache_node); 10060 spin_unlock(&info->block_group_cache_lock); 10061 btrfs_put_block_group(cache); 10062 goto error; 10063 } 10064 10065 cache->space_info = space_info; 10066 10067 __link_block_group(space_info, cache); 10068 10069 set_avail_alloc_bits(info, cache->flags); 10070 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 10071 inc_block_group_ro(cache, 1); 10072 } else if (btrfs_block_group_used(&cache->item) == 0) { 10073 spin_lock(&info->unused_bgs_lock); 10074 /* Should always be true but just in case. */ 10075 if (list_empty(&cache->bg_list)) { 10076 btrfs_get_block_group(cache); 10077 list_add_tail(&cache->bg_list, 10078 &info->unused_bgs); 10079 } 10080 spin_unlock(&info->unused_bgs_lock); 10081 } 10082 } 10083 10084 list_for_each_entry_rcu(space_info, &info->space_info, list) { 10085 if (!(get_alloc_profile(info, space_info->flags) & 10086 (BTRFS_BLOCK_GROUP_RAID10 | 10087 BTRFS_BLOCK_GROUP_RAID1 | 10088 BTRFS_BLOCK_GROUP_RAID5 | 10089 BTRFS_BLOCK_GROUP_RAID6 | 10090 BTRFS_BLOCK_GROUP_DUP))) 10091 continue; 10092 /* 10093 * avoid allocating from un-mirrored block group if there are 10094 * mirrored block groups. 
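 * (The RAID0 and SINGLE groups below are flipped read-only so that new
 * allocations land in the mirrored profiles instead.)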
10095 */ 10096 list_for_each_entry(cache, 10097 &space_info->block_groups[BTRFS_RAID_RAID0], 10098 list) 10099 inc_block_group_ro(cache, 1); 10100 list_for_each_entry(cache, 10101 &space_info->block_groups[BTRFS_RAID_SINGLE], 10102 list) 10103 inc_block_group_ro(cache, 1); 10104 } 10105 10106 init_global_block_rsv(info); 10107 ret = 0; 10108 error: 10109 btrfs_free_path(path); 10110 return ret; 10111 } 10112 10113 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 10114 struct btrfs_fs_info *fs_info) 10115 { 10116 struct btrfs_block_group_cache *block_group, *tmp; 10117 struct btrfs_root *extent_root = fs_info->extent_root; 10118 struct btrfs_block_group_item item; 10119 struct btrfs_key key; 10120 int ret = 0; 10121 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 10122 10123 trans->can_flush_pending_bgs = false; 10124 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 10125 if (ret) 10126 goto next; 10127 10128 spin_lock(&block_group->lock); 10129 memcpy(&item, &block_group->item, sizeof(item)); 10130 memcpy(&key, &block_group->key, sizeof(key)); 10131 spin_unlock(&block_group->lock); 10132 10133 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10134 sizeof(item)); 10135 if (ret) 10136 btrfs_abort_transaction(trans, ret); 10137 ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid, 10138 key.offset); 10139 if (ret) 10140 btrfs_abort_transaction(trans, ret); 10141 add_block_group_free_space(trans, fs_info, block_group); 10142 /* already aborted the transaction if it failed. */ 10143 next: 10144 list_del_init(&block_group->bg_list); 10145 } 10146 trans->can_flush_pending_bgs = can_flush_pending_bgs; 10147 } 10148 10149 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 10150 struct btrfs_fs_info *fs_info, u64 bytes_used, 10151 u64 type, u64 chunk_objectid, u64 chunk_offset, 10152 u64 size) 10153 { 10154 struct btrfs_block_group_cache *cache; 10155 int ret; 10156 10157 btrfs_set_log_full_commit(fs_info, trans); 10158 10159 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 10160 if (!cache) 10161 return -ENOMEM; 10162 10163 btrfs_set_block_group_used(&cache->item, bytes_used); 10164 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 10165 btrfs_set_block_group_flags(&cache->item, type); 10166 10167 cache->flags = type; 10168 cache->last_byte_to_unpin = (u64)-1; 10169 cache->cached = BTRFS_CACHE_FINISHED; 10170 cache->needs_free_space = 1; 10171 ret = exclude_super_stripes(fs_info, cache); 10172 if (ret) { 10173 /* 10174 * We may have excluded something, so call this just in 10175 * case. 10176 */ 10177 free_excluded_extents(fs_info, cache); 10178 btrfs_put_block_group(cache); 10179 return ret; 10180 } 10181 10182 add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size); 10183 10184 free_excluded_extents(fs_info, cache); 10185 10186 #ifdef CONFIG_BTRFS_DEBUG 10187 if (btrfs_should_fragment_free_space(cache)) { 10188 u64 new_bytes_used = size - bytes_used; 10189 10190 bytes_used += new_bytes_used >> 1; 10191 fragment_free_space(cache); 10192 } 10193 #endif 10194 /* 10195 * Call to ensure the corresponding space_info object is created and 10196 * assigned to our block group, but don't update its counters just yet. 10197 * We want our bg to be added to the rbtree with its ->space_info set. 
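 * (Hence the zero size/used/readonly arguments in this first call; the
 * real counters are applied by the second update_space_info() call below,
 * once the block group is in the rbtree.)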
10198 */ 10199 ret = update_space_info(fs_info, cache->flags, 0, 0, 0, 10200 &cache->space_info); 10201 if (ret) { 10202 btrfs_remove_free_space_cache(cache); 10203 btrfs_put_block_group(cache); 10204 return ret; 10205 } 10206 10207 ret = btrfs_add_block_group_cache(fs_info, cache); 10208 if (ret) { 10209 btrfs_remove_free_space_cache(cache); 10210 btrfs_put_block_group(cache); 10211 return ret; 10212 } 10213 10214 /* 10215 * Now that our block group has its ->space_info set and is inserted in 10216 * the rbtree, update the space info's counters. 10217 */ 10218 trace_btrfs_add_block_group(fs_info, cache, 1); 10219 ret = update_space_info(fs_info, cache->flags, size, bytes_used, 10220 cache->bytes_super, &cache->space_info); 10221 if (ret) { 10222 btrfs_remove_free_space_cache(cache); 10223 spin_lock(&fs_info->block_group_cache_lock); 10224 rb_erase(&cache->cache_node, 10225 &fs_info->block_group_cache_tree); 10226 RB_CLEAR_NODE(&cache->cache_node); 10227 spin_unlock(&fs_info->block_group_cache_lock); 10228 btrfs_put_block_group(cache); 10229 return ret; 10230 } 10231 update_global_block_rsv(fs_info); 10232 10233 __link_block_group(cache->space_info, cache); 10234 10235 list_add_tail(&cache->bg_list, &trans->new_bgs); 10236 10237 set_avail_alloc_bits(fs_info, type); 10238 return 0; 10239 } 10240 10241 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 10242 { 10243 u64 extra_flags = chunk_to_extended(flags) & 10244 BTRFS_EXTENDED_PROFILE_MASK; 10245 10246 write_seqlock(&fs_info->profiles_lock); 10247 if (flags & BTRFS_BLOCK_GROUP_DATA) 10248 fs_info->avail_data_alloc_bits &= ~extra_flags; 10249 if (flags & BTRFS_BLOCK_GROUP_METADATA) 10250 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 10251 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 10252 fs_info->avail_system_alloc_bits &= ~extra_flags; 10253 write_sequnlock(&fs_info->profiles_lock); 10254 } 10255 10256 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 10257 struct btrfs_fs_info *fs_info, u64 group_start, 10258 struct extent_map *em) 10259 { 10260 struct btrfs_root *root = fs_info->extent_root; 10261 struct btrfs_path *path; 10262 struct btrfs_block_group_cache *block_group; 10263 struct btrfs_free_cluster *cluster; 10264 struct btrfs_root *tree_root = fs_info->tree_root; 10265 struct btrfs_key key; 10266 struct inode *inode; 10267 struct kobject *kobj = NULL; 10268 int ret; 10269 int index; 10270 int factor; 10271 struct btrfs_caching_control *caching_ctl = NULL; 10272 bool remove_em; 10273 10274 block_group = btrfs_lookup_block_group(fs_info, group_start); 10275 BUG_ON(!block_group); 10276 BUG_ON(!block_group->ro); 10277 10278 /* 10279 * Free the reserved super bytes from this block group before 10280 * remove it. 
10281 */ 10282 free_excluded_extents(fs_info, block_group); 10283 10284 memcpy(&key, &block_group->key, sizeof(key)); 10285 index = get_block_group_index(block_group); 10286 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 10287 BTRFS_BLOCK_GROUP_RAID1 | 10288 BTRFS_BLOCK_GROUP_RAID10)) 10289 factor = 2; 10290 else 10291 factor = 1; 10292 10293 /* make sure this block group isn't part of an allocation cluster */ 10294 cluster = &fs_info->data_alloc_cluster; 10295 spin_lock(&cluster->refill_lock); 10296 btrfs_return_cluster_to_free_space(block_group, cluster); 10297 spin_unlock(&cluster->refill_lock); 10298 10299 /* 10300 * make sure this block group isn't part of a metadata 10301 * allocation cluster 10302 */ 10303 cluster = &fs_info->meta_alloc_cluster; 10304 spin_lock(&cluster->refill_lock); 10305 btrfs_return_cluster_to_free_space(block_group, cluster); 10306 spin_unlock(&cluster->refill_lock); 10307 10308 path = btrfs_alloc_path(); 10309 if (!path) { 10310 ret = -ENOMEM; 10311 goto out; 10312 } 10313 10314 /* 10315 * get the inode first so any iput calls done for the io_list 10316 * aren't the final iput (no unlinks allowed now) 10317 */ 10318 inode = lookup_free_space_inode(fs_info, block_group, path); 10319 10320 mutex_lock(&trans->transaction->cache_write_mutex); 10321 /* 10322 * make sure our free spache cache IO is done before remove the 10323 * free space inode 10324 */ 10325 spin_lock(&trans->transaction->dirty_bgs_lock); 10326 if (!list_empty(&block_group->io_list)) { 10327 list_del_init(&block_group->io_list); 10328 10329 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 10330 10331 spin_unlock(&trans->transaction->dirty_bgs_lock); 10332 btrfs_wait_cache_io(trans, block_group, path); 10333 btrfs_put_block_group(block_group); 10334 spin_lock(&trans->transaction->dirty_bgs_lock); 10335 } 10336 10337 if (!list_empty(&block_group->dirty_list)) { 10338 list_del_init(&block_group->dirty_list); 10339 btrfs_put_block_group(block_group); 10340 } 10341 spin_unlock(&trans->transaction->dirty_bgs_lock); 10342 mutex_unlock(&trans->transaction->cache_write_mutex); 10343 10344 if (!IS_ERR(inode)) { 10345 ret = btrfs_orphan_add(trans, inode); 10346 if (ret) { 10347 btrfs_add_delayed_iput(inode); 10348 goto out; 10349 } 10350 clear_nlink(inode); 10351 /* One for the block groups ref */ 10352 spin_lock(&block_group->lock); 10353 if (block_group->iref) { 10354 block_group->iref = 0; 10355 block_group->inode = NULL; 10356 spin_unlock(&block_group->lock); 10357 iput(inode); 10358 } else { 10359 spin_unlock(&block_group->lock); 10360 } 10361 /* One for our lookup ref */ 10362 btrfs_add_delayed_iput(inode); 10363 } 10364 10365 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 10366 key.offset = block_group->key.objectid; 10367 key.type = 0; 10368 10369 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 10370 if (ret < 0) 10371 goto out; 10372 if (ret > 0) 10373 btrfs_release_path(path); 10374 if (ret == 0) { 10375 ret = btrfs_del_item(trans, tree_root, path); 10376 if (ret) 10377 goto out; 10378 btrfs_release_path(path); 10379 } 10380 10381 spin_lock(&fs_info->block_group_cache_lock); 10382 rb_erase(&block_group->cache_node, 10383 &fs_info->block_group_cache_tree); 10384 RB_CLEAR_NODE(&block_group->cache_node); 10385 10386 if (fs_info->first_logical_byte == block_group->key.objectid) 10387 fs_info->first_logical_byte = (u64)-1; 10388 spin_unlock(&fs_info->block_group_cache_lock); 10389 10390 down_write(&block_group->space_info->groups_sem); 10391 /* 10392 * we must use list_del_init 
so people can check to see if they 10393 * are still on the list after taking the semaphore 10394 */ 10395 list_del_init(&block_group->list); 10396 if (list_empty(&block_group->space_info->block_groups[index])) { 10397 kobj = block_group->space_info->block_group_kobjs[index]; 10398 block_group->space_info->block_group_kobjs[index] = NULL; 10399 clear_avail_alloc_bits(fs_info, block_group->flags); 10400 } 10401 up_write(&block_group->space_info->groups_sem); 10402 if (kobj) { 10403 kobject_del(kobj); 10404 kobject_put(kobj); 10405 } 10406 10407 if (block_group->has_caching_ctl) 10408 caching_ctl = get_caching_control(block_group); 10409 if (block_group->cached == BTRFS_CACHE_STARTED) 10410 wait_block_group_cache_done(block_group); 10411 if (block_group->has_caching_ctl) { 10412 down_write(&fs_info->commit_root_sem); 10413 if (!caching_ctl) { 10414 struct btrfs_caching_control *ctl; 10415 10416 list_for_each_entry(ctl, 10417 &fs_info->caching_block_groups, list) 10418 if (ctl->block_group == block_group) { 10419 caching_ctl = ctl; 10420 atomic_inc(&caching_ctl->count); 10421 break; 10422 } 10423 } 10424 if (caching_ctl) 10425 list_del_init(&caching_ctl->list); 10426 up_write(&fs_info->commit_root_sem); 10427 if (caching_ctl) { 10428 /* Once for the caching bgs list and once for us. */ 10429 put_caching_control(caching_ctl); 10430 put_caching_control(caching_ctl); 10431 } 10432 } 10433 10434 spin_lock(&trans->transaction->dirty_bgs_lock); 10435 if (!list_empty(&block_group->dirty_list)) { 10436 WARN_ON(1); 10437 } 10438 if (!list_empty(&block_group->io_list)) { 10439 WARN_ON(1); 10440 } 10441 spin_unlock(&trans->transaction->dirty_bgs_lock); 10442 btrfs_remove_free_space_cache(block_group); 10443 10444 spin_lock(&block_group->space_info->lock); 10445 list_del_init(&block_group->ro_list); 10446 10447 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 10448 WARN_ON(block_group->space_info->total_bytes 10449 < block_group->key.offset); 10450 WARN_ON(block_group->space_info->bytes_readonly 10451 < block_group->key.offset); 10452 WARN_ON(block_group->space_info->disk_total 10453 < block_group->key.offset * factor); 10454 } 10455 block_group->space_info->total_bytes -= block_group->key.offset; 10456 block_group->space_info->bytes_readonly -= block_group->key.offset; 10457 block_group->space_info->disk_total -= block_group->key.offset * factor; 10458 10459 spin_unlock(&block_group->space_info->lock); 10460 10461 memcpy(&key, &block_group->key, sizeof(key)); 10462 10463 mutex_lock(&fs_info->chunk_mutex); 10464 if (!list_empty(&em->list)) { 10465 /* We're in the transaction->pending_chunks list. */ 10466 free_extent_map(em); 10467 } 10468 spin_lock(&block_group->lock); 10469 block_group->removed = 1; 10470 /* 10471 * At this point trimming can't start on this block group, because we 10472 * removed the block group from the tree fs_info->block_group_cache_tree 10473 * so no one can't find it anymore and even if someone already got this 10474 * block group before we removed it from the rbtree, they have already 10475 * incremented block_group->trimming - if they didn't, they won't find 10476 * any free space entries because we already removed them all when we 10477 * called btrfs_remove_free_space_cache(). 10478 * 10479 * And we must not remove the extent map from the fs_info->mapping_tree 10480 * to prevent the same logical address range and physical device space 10481 * ranges from being reused for a new block group. 
This is because our 10482 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10483 * completely transactionless, so while it is trimming a range the 10484 * currently running transaction might finish and a new one start, 10485 * allowing for new block groups to be created that can reuse the same 10486 * physical device locations unless we take this special care. 10487 * 10488 * There may also be an implicit trim operation if the file system 10489 * is mounted with -odiscard. The same protections must remain 10490 * in place until the extents have been discarded completely when 10491 * the transaction commit has completed. 10492 */ 10493 remove_em = (atomic_read(&block_group->trimming) == 0); 10494 /* 10495 * Make sure a trimmer task always sees the em in the pinned_chunks list 10496 * if it sees block_group->removed == 1 (needs to lock block_group->lock 10497 * before checking block_group->removed). 10498 */ 10499 if (!remove_em) { 10500 /* 10501 * Our em might be in trans->transaction->pending_chunks which 10502 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 10503 * and so is the fs_info->pinned_chunks list. 10504 * 10505 * So at this point we must be holding the chunk_mutex to avoid 10506 * any races with chunk allocation (more specifically at 10507 * volumes.c:contains_pending_extent()), to ensure it always 10508 * sees the em, either in the pending_chunks list or in the 10509 * pinned_chunks list. 10510 */ 10511 list_move_tail(&em->list, &fs_info->pinned_chunks); 10512 } 10513 spin_unlock(&block_group->lock); 10514 10515 if (remove_em) { 10516 struct extent_map_tree *em_tree; 10517 10518 em_tree = &fs_info->mapping_tree.map_tree; 10519 write_lock(&em_tree->lock); 10520 /* 10521 * The em might be in the pending_chunks list, so make sure the 10522 * chunk mutex is locked, since remove_extent_mapping() will 10523 * delete us from that list. 10524 */ 10525 remove_extent_mapping(em_tree, em); 10526 write_unlock(&em_tree->lock); 10527 /* once for the tree */ 10528 free_extent_map(em); 10529 } 10530 10531 mutex_unlock(&fs_info->chunk_mutex); 10532 10533 ret = remove_block_group_free_space(trans, fs_info, block_group); 10534 if (ret) 10535 goto out; 10536 10537 btrfs_put_block_group(block_group); 10538 btrfs_put_block_group(block_group); 10539 10540 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 10541 if (ret > 0) 10542 ret = -EIO; 10543 if (ret < 0) 10544 goto out; 10545 10546 ret = btrfs_del_item(trans, root, path); 10547 out: 10548 btrfs_free_path(path); 10549 return ret; 10550 } 10551 10552 struct btrfs_trans_handle * 10553 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 10554 const u64 chunk_offset) 10555 { 10556 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10557 struct extent_map *em; 10558 struct map_lookup *map; 10559 unsigned int num_items; 10560 10561 read_lock(&em_tree->lock); 10562 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 10563 read_unlock(&em_tree->lock); 10564 ASSERT(em && em->start == chunk_offset); 10565 10566 /* 10567 * We need to reserve 3 + N units from the metadata space info in order 10568 * to remove a block group (done at btrfs_remove_chunk() and at 10569 * btrfs_remove_block_group()), which are used for: 10570 * 10571 * 1 unit for adding the free space inode's orphan (located in the tree 10572 * of tree roots). 10573 * 1 unit for deleting the block group item (located in the extent 10574 * tree). 
10575 * 1 unit for deleting the free space item (located in tree of tree 10576 * roots). 10577 * N units for deleting N device extent items corresponding to each 10578 * stripe (located in the device tree). 10579 * 10580 * In order to remove a block group we also need to reserve units in the 10581 * system space info in order to update the chunk tree (update one or 10582 * more device items and remove one chunk item), but this is done at 10583 * btrfs_remove_chunk() through a call to check_system_chunk(). 10584 */ 10585 map = em->map_lookup; 10586 num_items = 3 + map->num_stripes; 10587 free_extent_map(em); 10588 10589 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10590 num_items, 1); 10591 } 10592 10593 /* 10594 * Process the unused_bgs list and remove any that don't have any allocated 10595 * space inside of them. 10596 */ 10597 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 10598 { 10599 struct btrfs_block_group_cache *block_group; 10600 struct btrfs_space_info *space_info; 10601 struct btrfs_trans_handle *trans; 10602 int ret = 0; 10603 10604 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 10605 return; 10606 10607 spin_lock(&fs_info->unused_bgs_lock); 10608 while (!list_empty(&fs_info->unused_bgs)) { 10609 u64 start, end; 10610 int trimming; 10611 10612 block_group = list_first_entry(&fs_info->unused_bgs, 10613 struct btrfs_block_group_cache, 10614 bg_list); 10615 list_del_init(&block_group->bg_list); 10616 10617 space_info = block_group->space_info; 10618 10619 if (ret || btrfs_mixed_space_info(space_info)) { 10620 btrfs_put_block_group(block_group); 10621 continue; 10622 } 10623 spin_unlock(&fs_info->unused_bgs_lock); 10624 10625 mutex_lock(&fs_info->delete_unused_bgs_mutex); 10626 10627 /* Don't want to race with allocators so take the groups_sem */ 10628 down_write(&space_info->groups_sem); 10629 spin_lock(&block_group->lock); 10630 if (block_group->reserved || 10631 btrfs_block_group_used(&block_group->item) || 10632 block_group->ro || 10633 list_is_singular(&block_group->list)) { 10634 /* 10635 * We want to bail if we made new allocations or have 10636 * outstanding allocations in this block group. We do 10637 * the ro check in case balance is currently acting on 10638 * this block group. 10639 */ 10640 spin_unlock(&block_group->lock); 10641 up_write(&space_info->groups_sem); 10642 goto next; 10643 } 10644 spin_unlock(&block_group->lock); 10645 10646 /* We don't want to force the issue, only flip if it's ok. */ 10647 ret = inc_block_group_ro(block_group, 0); 10648 up_write(&space_info->groups_sem); 10649 if (ret < 0) { 10650 ret = 0; 10651 goto next; 10652 } 10653 10654 /* 10655 * Want to do this before we do anything else so we can recover 10656 * properly if we fail to join the transaction. 10657 */ 10658 trans = btrfs_start_trans_remove_block_group(fs_info, 10659 block_group->key.objectid); 10660 if (IS_ERR(trans)) { 10661 btrfs_dec_block_group_ro(block_group); 10662 ret = PTR_ERR(trans); 10663 goto next; 10664 } 10665 10666 /* 10667 * We could have pending pinned extents for this block group, 10668 * just delete them, we don't care about them anymore. 10669 */ 10670 start = block_group->key.objectid; 10671 end = start + block_group->key.offset - 1; 10672 /* 10673 * Hold the unused_bg_unpin_mutex lock to avoid racing with 10674 * btrfs_finish_extent_commit(). 
If we are at transaction N, 10675 * another task might be running finish_extent_commit() for the 10676 * previous transaction N - 1, and have seen a range belonging 10677 * to the block group in freed_extents[] before we were able to 10678 * clear the whole block group range from freed_extents[]. This 10679 * means that task can lookup for the block group after we 10680 * unpinned it from freed_extents[] and removed it, leading to 10681 * a BUG_ON() at btrfs_unpin_extent_range(). 10682 */ 10683 mutex_lock(&fs_info->unused_bg_unpin_mutex); 10684 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 10685 EXTENT_DIRTY); 10686 if (ret) { 10687 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10688 btrfs_dec_block_group_ro(block_group); 10689 goto end_trans; 10690 } 10691 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 10692 EXTENT_DIRTY); 10693 if (ret) { 10694 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10695 btrfs_dec_block_group_ro(block_group); 10696 goto end_trans; 10697 } 10698 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10699 10700 /* Reset pinned so btrfs_put_block_group doesn't complain */ 10701 spin_lock(&space_info->lock); 10702 spin_lock(&block_group->lock); 10703 10704 space_info->bytes_pinned -= block_group->pinned; 10705 space_info->bytes_readonly += block_group->pinned; 10706 percpu_counter_add(&space_info->total_bytes_pinned, 10707 -block_group->pinned); 10708 block_group->pinned = 0; 10709 10710 spin_unlock(&block_group->lock); 10711 spin_unlock(&space_info->lock); 10712 10713 /* DISCARD can flip during remount */ 10714 trimming = btrfs_test_opt(fs_info, DISCARD); 10715 10716 /* Implicit trim during transaction commit. */ 10717 if (trimming) 10718 btrfs_get_block_group_trimming(block_group); 10719 10720 /* 10721 * Btrfs_remove_chunk will abort the transaction if things go 10722 * horribly wrong. 10723 */ 10724 ret = btrfs_remove_chunk(trans, fs_info, 10725 block_group->key.objectid); 10726 10727 if (ret) { 10728 if (trimming) 10729 btrfs_put_block_group_trimming(block_group); 10730 goto end_trans; 10731 } 10732 10733 /* 10734 * If we're not mounted with -odiscard, we can just forget 10735 * about this block group. Otherwise we'll need to wait 10736 * until transaction commit to do the actual discard. 10737 */ 10738 if (trimming) { 10739 spin_lock(&fs_info->unused_bgs_lock); 10740 /* 10741 * A concurrent scrub might have added us to the list 10742 * fs_info->unused_bgs, so use a list_move operation 10743 * to add the block group to the deleted_bgs list. 
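 * (list_move() removes the entry from whatever list it is currently on
 * before re-adding it, so a stale unused_bgs linkage is undone in the
 * same step.)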
10744 */ 10745 list_move(&block_group->bg_list, 10746 &trans->transaction->deleted_bgs); 10747 spin_unlock(&fs_info->unused_bgs_lock); 10748 btrfs_get_block_group(block_group); 10749 } 10750 end_trans: 10751 btrfs_end_transaction(trans); 10752 next: 10753 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 10754 btrfs_put_block_group(block_group); 10755 spin_lock(&fs_info->unused_bgs_lock); 10756 } 10757 spin_unlock(&fs_info->unused_bgs_lock); 10758 } 10759 10760 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 10761 { 10762 struct btrfs_space_info *space_info; 10763 struct btrfs_super_block *disk_super; 10764 u64 features; 10765 u64 flags; 10766 int mixed = 0; 10767 int ret; 10768 10769 disk_super = fs_info->super_copy; 10770 if (!btrfs_super_root(disk_super)) 10771 return -EINVAL; 10772 10773 features = btrfs_super_incompat_flags(disk_super); 10774 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 10775 mixed = 1; 10776 10777 flags = BTRFS_BLOCK_GROUP_SYSTEM; 10778 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10779 if (ret) 10780 goto out; 10781 10782 if (mixed) { 10783 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 10784 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10785 } else { 10786 flags = BTRFS_BLOCK_GROUP_METADATA; 10787 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10788 if (ret) 10789 goto out; 10790 10791 flags = BTRFS_BLOCK_GROUP_DATA; 10792 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10793 } 10794 out: 10795 return ret; 10796 } 10797 10798 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 10799 u64 start, u64 end) 10800 { 10801 return unpin_extent_range(fs_info, start, end, false); 10802 } 10803 10804 /* 10805 * It used to be that old block groups would be left around forever. 10806 * Iterating over them would be enough to trim unused space. Since we 10807 * now automatically remove them, we also need to iterate over unallocated 10808 * space. 10809 * 10810 * We don't want a transaction for this since the discard may take a 10811 * substantial amount of time. We don't require that a transaction be 10812 * running, but we do need to take a running transaction into account 10813 * to ensure that we're not discarding chunks that were released in 10814 * the current transaction. 10815 * 10816 * Holding the chunks lock will prevent other threads from allocating 10817 * or releasing chunks, but it won't prevent a running transaction 10818 * from committing and releasing the memory that the pending chunks 10819 * list head uses. For that, we need to take a reference to the 10820 * transaction. 10821 */ 10822 static int btrfs_trim_free_extents(struct btrfs_device *device, 10823 u64 minlen, u64 *trimmed) 10824 { 10825 u64 start = 0, len = 0; 10826 int ret; 10827 10828 *trimmed = 0; 10829 10830 /* Not writeable = nothing to do. */ 10831 if (!device->writeable) 10832 return 0; 10833 10834 /* No free space = nothing to do. 
*/ 10835 if (device->total_bytes <= device->bytes_used) 10836 return 0; 10837 10838 ret = 0; 10839 10840 while (1) { 10841 struct btrfs_fs_info *fs_info = device->fs_info; 10842 struct btrfs_transaction *trans; 10843 u64 bytes; 10844 10845 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 10846 if (ret) 10847 return ret; 10848 10849 down_read(&fs_info->commit_root_sem); 10850 10851 spin_lock(&fs_info->trans_lock); 10852 trans = fs_info->running_transaction; 10853 if (trans) 10854 atomic_inc(&trans->use_count); 10855 spin_unlock(&fs_info->trans_lock); 10856 10857 ret = find_free_dev_extent_start(trans, device, minlen, start, 10858 &start, &len); 10859 if (trans) 10860 btrfs_put_transaction(trans); 10861 10862 if (ret) { 10863 up_read(&fs_info->commit_root_sem); 10864 mutex_unlock(&fs_info->chunk_mutex); 10865 if (ret == -ENOSPC) 10866 ret = 0; 10867 break; 10868 } 10869 10870 ret = btrfs_issue_discard(device->bdev, start, len, &bytes); 10871 up_read(&fs_info->commit_root_sem); 10872 mutex_unlock(&fs_info->chunk_mutex); 10873 10874 if (ret) 10875 break; 10876 10877 start += len; 10878 *trimmed += bytes; 10879 10880 if (fatal_signal_pending(current)) { 10881 ret = -ERESTARTSYS; 10882 break; 10883 } 10884 10885 cond_resched(); 10886 } 10887 10888 return ret; 10889 } 10890 10891 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 10892 { 10893 struct btrfs_block_group_cache *cache = NULL; 10894 struct btrfs_device *device; 10895 struct list_head *devices; 10896 u64 group_trimmed; 10897 u64 start; 10898 u64 end; 10899 u64 trimmed = 0; 10900 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 10901 int ret = 0; 10902 10903 /* 10904 * try to trim all FS space, our block group may start from non-zero. 10905 */ 10906 if (range->len == total_bytes) 10907 cache = btrfs_lookup_first_block_group(fs_info, range->start); 10908 else 10909 cache = btrfs_lookup_block_group(fs_info, range->start); 10910 10911 while (cache) { 10912 if (cache->key.objectid >= (range->start + range->len)) { 10913 btrfs_put_block_group(cache); 10914 break; 10915 } 10916 10917 start = max(range->start, cache->key.objectid); 10918 end = min(range->start + range->len, 10919 cache->key.objectid + cache->key.offset); 10920 10921 if (end - start >= range->minlen) { 10922 if (!block_group_cache_done(cache)) { 10923 ret = cache_block_group(cache, 0); 10924 if (ret) { 10925 btrfs_put_block_group(cache); 10926 break; 10927 } 10928 ret = wait_block_group_cache_done(cache); 10929 if (ret) { 10930 btrfs_put_block_group(cache); 10931 break; 10932 } 10933 } 10934 ret = btrfs_trim_block_group(cache, 10935 &group_trimmed, 10936 start, 10937 end, 10938 range->minlen); 10939 10940 trimmed += group_trimmed; 10941 if (ret) { 10942 btrfs_put_block_group(cache); 10943 break; 10944 } 10945 } 10946 10947 cache = next_block_group(fs_info, cache); 10948 } 10949 10950 mutex_lock(&fs_info->fs_devices->device_list_mutex); 10951 devices = &fs_info->fs_devices->alloc_list; 10952 list_for_each_entry(device, devices, dev_alloc_list) { 10953 ret = btrfs_trim_free_extents(device, range->minlen, 10954 &group_trimmed); 10955 if (ret) 10956 break; 10957 10958 trimmed += group_trimmed; 10959 } 10960 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 10961 10962 range->len = trimmed; 10963 return ret; 10964 } 10965 10966 /* 10967 * btrfs_{start,end}_write_no_snapshoting() are similar to 10968 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing 10969 * data into the page cache through nocow before the 
subvolume is snapshoted, 10970 * but flush the data into disk after the snapshot creation, or to prevent 10971 * operations while snapshoting is ongoing and that cause the snapshot to be 10972 * inconsistent (writes followed by expanding truncates for example). 10973 */ 10974 void btrfs_end_write_no_snapshoting(struct btrfs_root *root) 10975 { 10976 percpu_counter_dec(&root->subv_writers->counter); 10977 /* 10978 * Make sure counter is updated before we wake up waiters. 10979 */ 10980 smp_mb(); 10981 if (waitqueue_active(&root->subv_writers->wait)) 10982 wake_up(&root->subv_writers->wait); 10983 } 10984 10985 int btrfs_start_write_no_snapshoting(struct btrfs_root *root) 10986 { 10987 if (atomic_read(&root->will_be_snapshoted)) 10988 return 0; 10989 10990 percpu_counter_inc(&root->subv_writers->counter); 10991 /* 10992 * Make sure counter is updated before we check for snapshot creation. 10993 */ 10994 smp_mb(); 10995 if (atomic_read(&root->will_be_snapshoted)) { 10996 btrfs_end_write_no_snapshoting(root); 10997 return 0; 10998 } 10999 return 1; 11000 } 11001 11002 static int wait_snapshoting_atomic_t(atomic_t *a) 11003 { 11004 schedule(); 11005 return 0; 11006 } 11007 11008 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) 11009 { 11010 while (true) { 11011 int ret; 11012 11013 ret = btrfs_start_write_no_snapshoting(root); 11014 if (ret) 11015 break; 11016 wait_on_atomic_t(&root->will_be_snapshoted, 11017 wait_snapshoting_atomic_t, 11018 TASK_UNINTERRUPTIBLE); 11019 } 11020 } 11021
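
#if 0
/*
 * Illustrative usage sketch only, not part of the original file: a nocow
 * writer brackets its page cache writes with the two helpers above so it
 * cannot race with snapshot creation. The caller name and error code are
 * hypothetical.
 */
static int example_nocow_write(struct btrfs_root *root)
{
	if (!btrfs_start_write_no_snapshoting(root))
		return -EAGAIN;	/* snapshot pending, fall back to cow */

	/* ... write the nocow data into the page cache ... */

	btrfs_end_write_no_snapshoting(root);
	return 0;
}
#endif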