/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * Control flags for do_chunk_alloc()'s force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try to allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};
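/*
 * Illustrative note (added; not part of the original sources): a caller
 * that already knows it is out of room would typically force a chunk,
 * e.g.
 *
 *	ret = do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
 *
 * while the cluster setup code passes CHUNK_ALLOC_LIMITED so it does not
 * fill the FS with chunks it may never use.
 */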
static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				     u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
static int __reserve_metadata_bytes(struct btrfs_root *root,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);

		/*
		 * If not empty, someone is still holding the mutex of
		 * full_stripe_lock, which can only be released by the caller.
		 * That would cause a use-after-free when the caller tries to
		 * release the full stripe lock, so there is no better way to
		 * resolve this than to warn about it.
		 */
		WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}
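/*
 * Worked example (added for clarity; values invented): with block groups
 * starting at 0, 1G and 2G, a search for bytenr 1G + 4K with contains == 1
 * returns the group at 1G, because that group's range covers the byte.
 * With contains == 0 the same bytenr returns the group at 2G instead --
 * the first group starting at or after the requested byte.
 */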
static int add_excluded_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;

	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

static void free_excluded_extents(struct btrfs_fs_info *fs_info,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(fs_info, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
				       bytenr, 0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(fs_info, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}
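/*
 * Background note (added): btrfs_sb_offset(i) yields the fixed superblock
 * mirror locations -- 64KiB for the primary copy, then 64MiB and 256GiB
 * for the mirrors -- so every block group must exclude whatever stripes
 * of those copies fall inside its range before free space is accounted.
 */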
static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	refcount_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (refcount_dec_and_test(&ctl->count))
		kfree(ctl);
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->key.objectid;
	u64 len = block_group->key.offset;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * This is only called by cache_block_group.  Since we could have freed
 * extents, we need to check the pinned_extents for any extents that can't
 * be used yet, because their free space will be released as soon as the
 * transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
		       struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}
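/*
 * Worked example (added; numbers invented): caching the range [0, 1G)
 * while pinned_extents holds a pinned extent at [4M, 8M) adds [0, 4M) as
 * free space inside the loop, skips over the pinned bytes, and the tail
 * from the end of the pinned extent up to 1G is added by the final block
 * above, so pinned space never enters the free space cache early.
 */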
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}
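/*
 * Locking note (added): the need_resched()/rwsem_is_contended() block in
 * the walk above deliberately drops both caching_ctl->mutex and
 * commit_root_sem before rescheduling, so a transaction commit that needs
 * commit_root_sem exclusively is not stalled behind a long caching run.
 */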
static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(fs_info, block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}
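/*
 * Summary note (added): a block group's cached field moves from
 * BTRFS_CACHE_NO to BTRFS_CACHE_FAST while cache_block_group() tries the
 * on-disk space cache inline, then either straight to BTRFS_CACHE_FINISHED
 * on a hit or to BTRFS_CACHE_STARTED once caching_thread() is queued,
 * ending in BTRFS_CACHE_FINISHED or BTRFS_CACHE_ERROR above.
 */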
static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	refcount_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but it can happen when one thread
	 * starts to load the space cache info and then some other thread
	 * starts a transaction commit which tries to do an allocation while
	 * the first thread is still loading the space cache info.  The
	 * previous loop should have kept us from choosing this block group,
	 * but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load
	 * here, so we can wait for it to finish; otherwise we could end up
	 * allocating from a block group whose cache gets evicted for one
	 * reason or another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		refcount_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
		    btrfs_should_fragment_free_space(cache)) {
			u64 bytes_used;

			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			bytes_used = cache->key.offset -
				btrfs_block_group_used(&cache->item);
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			fragment_free_space(cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);

		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info, cache);
			return 0;
		}
	} else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wake up any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->commit_root_sem);
	refcount_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}
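/*
 * Usage note (added): btrfs_search_slot() returns 0 only on an exact key
 * match, so btrfs_lookup_data_extent() above hands back 0 when an
 * EXTENT_ITEM with exactly this start and len exists, a positive value
 * when it does not, and a negative errno on error.
 */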
/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  The head node
 * may also store extent flags to set.  This way you can check to see what
 * the reference count and extent flags would be once all of the delayed
 * refs have been processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}
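/*
 * Note (added): the delayed ref head, when present, carries the net
 * ref_mod of all queued modifications for the extent, so adding it to the
 * on-disk count gives the reference count the extent will have once the
 * delayed refs are actually run.
 */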
/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  Implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  Full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually full back refs are generic, and can
 * be used in all cases where implicit back refs are used.  The major
 * shortcoming of full back refs is their overhead: every time a tree block
 * gets COWed, we have to update the back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */
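/*
 * Worked example (added; all values invented): a data extent at byte
 * 136708096 referenced by inode 257 at file offset 0 in subvolume 5 gets
 * an implicit back ref keyed as
 *
 *	(136708096, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while the same extent shared through a COWed leaf at byte 30408704
 * would instead carry (136708096, BTRFS_SHARED_DATA_REF_KEY, 30408704).
 */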
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(fs_info, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}
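/*
 * Note (added): the high CRC lands in the upper bits via a 31-bit shift,
 * not 32.  Whatever the original intent, these hashes are stored in
 * on-disk keys, so the computation must stay exactly as it is.
 */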
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_fs_info *fs_info,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_fs_info *fs_info,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}
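/*
 * Note (added): distinct (root, objectid, offset) triples can hash to the
 * same key offset.  The -EEXIST loop above walks those neighbouring items,
 * bumping key.offset by one each time, until it finds either the matching
 * btrfs_extent_data_ref or a free slot for the new one.
 */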
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_fs_info *fs_info,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_fs_info *fs_info,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_fs_info *fs_info,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}
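/*
 * Quick reference (added): extent_ref_type() maps (parent, owner) to a
 * back ref key type as follows --
 *
 *	metadata (owner < BTRFS_FIRST_FREE_OBJECTID), shared:
 *		BTRFS_SHARED_BLOCK_REF_KEY
 *	metadata, not shared:	BTRFS_TREE_BLOCK_REF_KEY
 *	data, shared:		BTRFS_SHARED_DATA_REF_KEY
 *	data, not shared:	BTRFS_EXTENT_DATA_REF_KEY
 *
 * where "shared" means parent != 0.
 */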
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * Look for an inline back ref.  If the back ref is found, *ref_ret is set
 * to the address of the inline back ref and 0 is returned.
 *
 * If the back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * If insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted filesystem which still has the old fat
	 * extent entries for metadata, so try to see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, fs_info, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add a new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add a new inline back
		 * ref if there is any kind of item for this block.
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add a new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(fs_info, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
					     parent, root_objectid, owner,
					     offset);
	}
	return ret;
}

/*
 * helper to update/remove an inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(fs_info, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(fs_info, path, iref,
					     refs_to_add, extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}
insert_tree_block_ref(trans, fs_info, path, bytenr, 1929 parent, root_objectid); 1930 } else { 1931 ret = insert_extent_data_ref(trans, fs_info, path, bytenr, 1932 parent, root_objectid, 1933 owner, offset, refs_to_add); 1934 } 1935 return ret; 1936 } 1937 1938 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1939 struct btrfs_fs_info *fs_info, 1940 struct btrfs_path *path, 1941 struct btrfs_extent_inline_ref *iref, 1942 int refs_to_drop, int is_data, int *last_ref) 1943 { 1944 int ret = 0; 1945 1946 BUG_ON(!is_data && refs_to_drop != 1); 1947 if (iref) { 1948 update_inline_extent_backref(fs_info, path, iref, 1949 -refs_to_drop, NULL, last_ref); 1950 } else if (is_data) { 1951 ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop, 1952 last_ref); 1953 } else { 1954 *last_ref = 1; 1955 ret = btrfs_del_item(trans, fs_info->extent_root, path); 1956 } 1957 return ret; 1958 } 1959 1960 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1961 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1962 u64 *discarded_bytes) 1963 { 1964 int j, ret = 0; 1965 u64 bytes_left, end; 1966 u64 aligned_start = ALIGN(start, 1 << 9); 1967 1968 if (WARN_ON(start != aligned_start)) { 1969 len -= aligned_start - start; 1970 len = round_down(len, 1 << 9); 1971 start = aligned_start; 1972 } 1973 1974 *discarded_bytes = 0; 1975 1976 if (!len) 1977 return 0; 1978 1979 end = start + len; 1980 bytes_left = len; 1981 1982 /* Skip any superblocks on this device. */ 1983 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1984 u64 sb_start = btrfs_sb_offset(j); 1985 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1986 u64 size = sb_start - start; 1987 1988 if (!in_range(sb_start, start, bytes_left) && 1989 !in_range(sb_end, start, bytes_left) && 1990 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1991 continue; 1992 1993 /* 1994 * Superblock spans beginning of range. Adjust start and 1995 * try again. 1996 */ 1997 if (sb_start <= start) { 1998 start += sb_end - start; 1999 if (start > end) { 2000 bytes_left = 0; 2001 break; 2002 } 2003 bytes_left = end - start; 2004 continue; 2005 } 2006 2007 if (size) { 2008 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 2009 GFP_NOFS, 0); 2010 if (!ret) 2011 *discarded_bytes += size; 2012 else if (ret != -EOPNOTSUPP) 2013 return ret; 2014 } 2015 2016 start = sb_end; 2017 if (start > end) { 2018 bytes_left = 0; 2019 break; 2020 } 2021 bytes_left = end - start; 2022 } 2023 2024 if (bytes_left) { 2025 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 2026 GFP_NOFS, 0); 2027 if (!ret) 2028 *discarded_bytes += bytes_left; 2029 } 2030 return ret; 2031 } 2032 2033 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 2034 u64 num_bytes, u64 *actual_bytes) 2035 { 2036 int ret; 2037 u64 discarded_bytes = 0; 2038 struct btrfs_bio *bbio = NULL; 2039 2040 2041 /* 2042 * Avoid races with device replace and make sure our bbio has devices 2043 * associated to its stripes that don't go away while we are discarding. 
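* While the counter is held, a running dev replace cannot swap out the source device; btrfs_bio_counter_dec() below releases it again.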
2044 */ 2045 btrfs_bio_counter_inc_blocked(fs_info); 2046 /* Tell the block device(s) that the sectors can be discarded */ 2047 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 2048 &bbio, 0); 2049 /* Error condition is -ENOMEM */ 2050 if (!ret) { 2051 struct btrfs_bio_stripe *stripe = bbio->stripes; 2052 int i; 2053 2054 2055 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2056 u64 bytes; 2057 if (!stripe->dev->can_discard) 2058 continue; 2059 2060 ret = btrfs_issue_discard(stripe->dev->bdev, 2061 stripe->physical, 2062 stripe->length, 2063 &bytes); 2064 if (!ret) 2065 discarded_bytes += bytes; 2066 else if (ret != -EOPNOTSUPP) 2067 break; /* Logic errors or -ENOMEM, or -EIO, though it is not clear how -EIO could happen here */ 2068 2069 /* 2070 * Just in case we get back EOPNOTSUPP for some reason, 2071 * just ignore the return value so we don't screw up 2072 * people calling discard_extent. 2073 */ 2074 ret = 0; 2075 } 2076 btrfs_put_bbio(bbio); 2077 } 2078 btrfs_bio_counter_dec(fs_info); 2079 2080 if (actual_bytes) 2081 *actual_bytes = discarded_bytes; 2082 2083 2084 if (ret == -EOPNOTSUPP) 2085 ret = 0; 2086 return ret; 2087 } 2088 2089 /* Can return -ENOMEM */ 2090 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2091 struct btrfs_fs_info *fs_info, 2092 u64 bytenr, u64 num_bytes, u64 parent, 2093 u64 root_objectid, u64 owner, u64 offset) 2094 { 2095 int ret; 2096 2097 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2098 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2099 2100 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2101 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2102 num_bytes, 2103 parent, root_objectid, (int)owner, 2104 BTRFS_ADD_DELAYED_REF, NULL); 2105 } else { 2106 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 2107 num_bytes, parent, root_objectid, 2108 owner, offset, 0, 2109 BTRFS_ADD_DELAYED_REF); 2110 } 2111 return ret; 2112 } 2113 2114 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2115 struct btrfs_fs_info *fs_info, 2116 struct btrfs_delayed_ref_node *node, 2117 u64 parent, u64 root_objectid, 2118 u64 owner, u64 offset, int refs_to_add, 2119 struct btrfs_delayed_extent_op *extent_op) 2120 { 2121 struct btrfs_path *path; 2122 struct extent_buffer *leaf; 2123 struct btrfs_extent_item *item; 2124 struct btrfs_key key; 2125 u64 bytenr = node->bytenr; 2126 u64 num_bytes = node->num_bytes; 2127 u64 refs; 2128 int ret; 2129 2130 path = btrfs_alloc_path(); 2131 if (!path) 2132 return -ENOMEM; 2133 2134 path->reada = READA_FORWARD; 2135 path->leave_spinning = 1; 2136 /* this will set up the path even if it fails to insert the back ref */ 2137 ret = insert_inline_extent_backref(trans, fs_info, path, bytenr, 2138 num_bytes, parent, root_objectid, 2139 owner, offset, 2140 refs_to_add, extent_op); 2141 if ((ret < 0 && ret != -EAGAIN) || !ret) 2142 goto out; 2143 2144 /* 2145 * Ok we had -EAGAIN which means we didn't have space to insert an 2146 * inline extent ref, so just update the reference count and add a 2147 * normal backref.
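* (a separate keyed backref item next to the extent item, rather than data packed inline inside it)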
2148 */ 2149 leaf = path->nodes[0]; 2150 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2151 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2152 refs = btrfs_extent_refs(leaf, item); 2153 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2154 if (extent_op) 2155 __run_delayed_extent_op(extent_op, leaf, item); 2156 2157 btrfs_mark_buffer_dirty(leaf); 2158 btrfs_release_path(path); 2159 2160 path->reada = READA_FORWARD; 2161 path->leave_spinning = 1; 2162 /* now insert the actual backref */ 2163 ret = insert_extent_backref(trans, fs_info, path, bytenr, parent, 2164 root_objectid, owner, offset, refs_to_add); 2165 if (ret) 2166 btrfs_abort_transaction(trans, ret); 2167 out: 2168 btrfs_free_path(path); 2169 return ret; 2170 } 2171 2172 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2173 struct btrfs_fs_info *fs_info, 2174 struct btrfs_delayed_ref_node *node, 2175 struct btrfs_delayed_extent_op *extent_op, 2176 int insert_reserved) 2177 { 2178 int ret = 0; 2179 struct btrfs_delayed_data_ref *ref; 2180 struct btrfs_key ins; 2181 u64 parent = 0; 2182 u64 ref_root = 0; 2183 u64 flags = 0; 2184 2185 ins.objectid = node->bytenr; 2186 ins.offset = node->num_bytes; 2187 ins.type = BTRFS_EXTENT_ITEM_KEY; 2188 2189 ref = btrfs_delayed_node_to_data_ref(node); 2190 trace_run_delayed_data_ref(fs_info, node, ref, node->action); 2191 2192 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2193 parent = ref->parent; 2194 ref_root = ref->root; 2195 2196 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2197 if (extent_op) 2198 flags |= extent_op->flags_to_set; 2199 ret = alloc_reserved_file_extent(trans, fs_info, 2200 parent, ref_root, flags, 2201 ref->objectid, ref->offset, 2202 &ins, node->ref_mod); 2203 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2204 ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, 2205 ref_root, ref->objectid, 2206 ref->offset, node->ref_mod, 2207 extent_op); 2208 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2209 ret = __btrfs_free_extent(trans, fs_info, node, parent, 2210 ref_root, ref->objectid, 2211 ref->offset, node->ref_mod, 2212 extent_op); 2213 } else { 2214 BUG(); 2215 } 2216 return ret; 2217 } 2218 2219 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2220 struct extent_buffer *leaf, 2221 struct btrfs_extent_item *ei) 2222 { 2223 u64 flags = btrfs_extent_flags(leaf, ei); 2224 if (extent_op->update_flags) { 2225 flags |= extent_op->flags_to_set; 2226 btrfs_set_extent_flags(leaf, ei, flags); 2227 } 2228 2229 if (extent_op->update_key) { 2230 struct btrfs_tree_block_info *bi; 2231 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2232 bi = (struct btrfs_tree_block_info *)(ei + 1); 2233 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2234 } 2235 } 2236 2237 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2238 struct btrfs_fs_info *fs_info, 2239 struct btrfs_delayed_ref_node *node, 2240 struct btrfs_delayed_extent_op *extent_op) 2241 { 2242 struct btrfs_key key; 2243 struct btrfs_path *path; 2244 struct btrfs_extent_item *ei; 2245 struct extent_buffer *leaf; 2246 u32 item_size; 2247 int ret; 2248 int err = 0; 2249 int metadata = !extent_op->is_data; 2250 2251 if (trans->aborted) 2252 return 0; 2253 2254 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2255 metadata = 0; 2256 2257 path = btrfs_alloc_path(); 2258 if (!path) 2259 return -ENOMEM; 2260 2261 key.objectid = node->bytenr; 2262 2263 if (metadata) { 2264 key.type = 
BTRFS_METADATA_ITEM_KEY; 2265 key.offset = extent_op->level; 2266 } else { 2267 key.type = BTRFS_EXTENT_ITEM_KEY; 2268 key.offset = node->num_bytes; 2269 } 2270 2271 again: 2272 path->reada = READA_FORWARD; 2273 path->leave_spinning = 1; 2274 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2275 if (ret < 0) { 2276 err = ret; 2277 goto out; 2278 } 2279 if (ret > 0) { 2280 if (metadata) { 2281 if (path->slots[0] > 0) { 2282 path->slots[0]--; 2283 btrfs_item_key_to_cpu(path->nodes[0], &key, 2284 path->slots[0]); 2285 if (key.objectid == node->bytenr && 2286 key.type == BTRFS_EXTENT_ITEM_KEY && 2287 key.offset == node->num_bytes) 2288 ret = 0; 2289 } 2290 if (ret > 0) { 2291 btrfs_release_path(path); 2292 metadata = 0; 2293 2294 key.objectid = node->bytenr; 2295 key.offset = node->num_bytes; 2296 key.type = BTRFS_EXTENT_ITEM_KEY; 2297 goto again; 2298 } 2299 } else { 2300 err = -EIO; 2301 goto out; 2302 } 2303 } 2304 2305 leaf = path->nodes[0]; 2306 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2307 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2308 if (item_size < sizeof(*ei)) { 2309 ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0); 2310 if (ret < 0) { 2311 err = ret; 2312 goto out; 2313 } 2314 leaf = path->nodes[0]; 2315 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2316 } 2317 #endif 2318 BUG_ON(item_size < sizeof(*ei)); 2319 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2320 __run_delayed_extent_op(extent_op, leaf, ei); 2321 2322 btrfs_mark_buffer_dirty(leaf); 2323 out: 2324 btrfs_free_path(path); 2325 return err; 2326 } 2327 2328 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2329 struct btrfs_fs_info *fs_info, 2330 struct btrfs_delayed_ref_node *node, 2331 struct btrfs_delayed_extent_op *extent_op, 2332 int insert_reserved) 2333 { 2334 int ret = 0; 2335 struct btrfs_delayed_tree_ref *ref; 2336 struct btrfs_key ins; 2337 u64 parent = 0; 2338 u64 ref_root = 0; 2339 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 2340 2341 ref = btrfs_delayed_node_to_tree_ref(node); 2342 trace_run_delayed_tree_ref(fs_info, node, ref, node->action); 2343 2344 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2345 parent = ref->parent; 2346 ref_root = ref->root; 2347 2348 ins.objectid = node->bytenr; 2349 if (skinny_metadata) { 2350 ins.offset = ref->level; 2351 ins.type = BTRFS_METADATA_ITEM_KEY; 2352 } else { 2353 ins.offset = node->num_bytes; 2354 ins.type = BTRFS_EXTENT_ITEM_KEY; 2355 } 2356 2357 if (node->ref_mod != 1) { 2358 btrfs_err(fs_info, 2359 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", 2360 node->bytenr, node->ref_mod, node->action, ref_root, 2361 parent); 2362 return -EIO; 2363 } 2364 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2365 BUG_ON(!extent_op || !extent_op->update_flags); 2366 ret = alloc_reserved_tree_block(trans, fs_info, 2367 parent, ref_root, 2368 extent_op->flags_to_set, 2369 &extent_op->key, 2370 ref->level, &ins); 2371 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2372 ret = __btrfs_inc_extent_ref(trans, fs_info, node, 2373 parent, ref_root, 2374 ref->level, 0, 1, 2375 extent_op); 2376 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2377 ret = __btrfs_free_extent(trans, fs_info, node, 2378 parent, ref_root, 2379 ref->level, 0, 1, extent_op); 2380 } else { 2381 BUG(); 2382 } 2383 return ret; 2384 } 2385 2386 /* helper function to actually process a single delayed ref entry */ 2387 static int 
run_one_delayed_ref(struct btrfs_trans_handle *trans, 2388 struct btrfs_fs_info *fs_info, 2389 struct btrfs_delayed_ref_node *node, 2390 struct btrfs_delayed_extent_op *extent_op, 2391 int insert_reserved) 2392 { 2393 int ret = 0; 2394 2395 if (trans->aborted) { 2396 if (insert_reserved) 2397 btrfs_pin_extent(fs_info, node->bytenr, 2398 node->num_bytes, 1); 2399 return 0; 2400 } 2401 2402 if (btrfs_delayed_ref_is_head(node)) { 2403 struct btrfs_delayed_ref_head *head; 2404 /* 2405 * we've hit the end of the chain and we were supposed 2406 * to insert this extent into the tree. But, it got 2407 * deleted before we ever needed to insert it, so all 2408 * we have to do is clean up the accounting 2409 */ 2410 BUG_ON(extent_op); 2411 head = btrfs_delayed_node_to_head(node); 2412 trace_run_delayed_ref_head(fs_info, node, head, node->action); 2413 2414 if (insert_reserved) { 2415 btrfs_pin_extent(fs_info, node->bytenr, 2416 node->num_bytes, 1); 2417 if (head->is_data) { 2418 ret = btrfs_del_csums(trans, fs_info, 2419 node->bytenr, 2420 node->num_bytes); 2421 } 2422 } 2423 2424 /* Also free its reserved qgroup space */ 2425 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2426 head->qgroup_reserved); 2427 return ret; 2428 } 2429 2430 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2431 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2432 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, 2433 insert_reserved); 2434 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2435 node->type == BTRFS_SHARED_DATA_REF_KEY) 2436 ret = run_delayed_data_ref(trans, fs_info, node, extent_op, 2437 insert_reserved); 2438 else 2439 BUG(); 2440 return ret; 2441 } 2442 2443 static inline struct btrfs_delayed_ref_node * 2444 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2445 { 2446 struct btrfs_delayed_ref_node *ref; 2447 2448 if (list_empty(&head->ref_list)) 2449 return NULL; 2450 2451 /* 2452 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2453 * This is to prevent a ref count from going down to zero, which deletes 2454 * the extent item from the extent tree, when there still are references 2455 * to add, which would fail because they would not find the extent item. 2456 */ 2457 if (!list_empty(&head->ref_add_list)) 2458 return list_first_entry(&head->ref_add_list, 2459 struct btrfs_delayed_ref_node, add_list); 2460 2461 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 2462 list); 2463 ASSERT(list_empty(&ref->add_list)); 2464 return ref; 2465 } 2466 2467 /* 2468 * Returns 0 on success or if called with an already aborted transaction. 2469 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
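* 'nr' bounds how many delayed ref heads are processed before returning; the loop below stops early once no more heads are ready.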
2470 */ 2471 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2472 struct btrfs_fs_info *fs_info, 2473 unsigned long nr) 2474 { 2475 struct btrfs_delayed_ref_root *delayed_refs; 2476 struct btrfs_delayed_ref_node *ref; 2477 struct btrfs_delayed_ref_head *locked_ref = NULL; 2478 struct btrfs_delayed_extent_op *extent_op; 2479 ktime_t start = ktime_get(); 2480 int ret; 2481 unsigned long count = 0; 2482 unsigned long actual_count = 0; 2483 int must_insert_reserved = 0; 2484 2485 delayed_refs = &trans->transaction->delayed_refs; 2486 while (1) { 2487 if (!locked_ref) { 2488 if (count >= nr) 2489 break; 2490 2491 spin_lock(&delayed_refs->lock); 2492 locked_ref = btrfs_select_ref_head(trans); 2493 if (!locked_ref) { 2494 spin_unlock(&delayed_refs->lock); 2495 break; 2496 } 2497 2498 /* grab the lock that says we are going to process 2499 * all the refs for this head */ 2500 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2501 spin_unlock(&delayed_refs->lock); 2502 /* 2503 * we may have dropped the spin lock to get the head 2504 * mutex lock, and that might have given someone else 2505 * time to free the head. If that's true, it has been 2506 * removed from our list and we can move on. 2507 */ 2508 if (ret == -EAGAIN) { 2509 locked_ref = NULL; 2510 count++; 2511 continue; 2512 } 2513 } 2514 2515 /* 2516 * We need to try and merge add/drops of the same ref since we 2517 * can run into issues with relocate dropping the implicit ref 2518 * and then it being added back again before the drop can 2519 * finish. If we merged anything we need to re-loop so we can 2520 * get a good ref. 2521 * Or we can get node references of the same type that weren't 2522 * merged when created due to bumps in the tree mod seq, and 2523 * we need to merge them to prevent adding an inline extent 2524 * backref before dropping it (triggering a BUG_ON at 2525 * insert_inline_extent_backref()). 2526 */ 2527 spin_lock(&locked_ref->lock); 2528 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2529 locked_ref); 2530 2531 /* 2532 * locked_ref is the head node, so we have to go one 2533 * node back for any delayed ref updates 2534 */ 2535 ref = select_delayed_ref(locked_ref); 2536 2537 if (ref && ref->seq && 2538 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2539 spin_unlock(&locked_ref->lock); 2540 spin_lock(&delayed_refs->lock); 2541 locked_ref->processing = 0; 2542 delayed_refs->num_heads_ready++; 2543 spin_unlock(&delayed_refs->lock); 2544 btrfs_delayed_ref_unlock(locked_ref); 2545 locked_ref = NULL; 2546 cond_resched(); 2547 count++; 2548 continue; 2549 } 2550 2551 /* 2552 * record the must insert reserved flag before we 2553 * drop the spin lock. 
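* The flag is cleared on the head here and only restored if running the extent op fails, so the abort path can still clean up the reserved space.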
2554 */ 2555 must_insert_reserved = locked_ref->must_insert_reserved; 2556 locked_ref->must_insert_reserved = 0; 2557 2558 extent_op = locked_ref->extent_op; 2559 locked_ref->extent_op = NULL; 2560 2561 if (!ref) { 2562 2563 2564 /* All delayed refs have been processed. Go ahead 2565 * and send the head node to run_one_delayed_ref, 2566 * so that any accounting fixes can happen 2567 */ 2568 ref = &locked_ref->node; 2569 2570 if (extent_op && must_insert_reserved) { 2571 btrfs_free_delayed_extent_op(extent_op); 2572 extent_op = NULL; 2573 } 2574 2575 if (extent_op) { 2576 spin_unlock(&locked_ref->lock); 2577 ret = run_delayed_extent_op(trans, fs_info, 2578 ref, extent_op); 2579 btrfs_free_delayed_extent_op(extent_op); 2580 2581 if (ret) { 2582 /* 2583 * Need to reset must_insert_reserved if 2584 * there was an error so the abort stuff 2585 * can clean up the reserved space 2586 * properly. 2587 */ 2588 if (must_insert_reserved) 2589 locked_ref->must_insert_reserved = 1; 2590 spin_lock(&delayed_refs->lock); 2591 locked_ref->processing = 0; 2592 delayed_refs->num_heads_ready++; 2593 spin_unlock(&delayed_refs->lock); 2594 btrfs_debug(fs_info, 2595 "run_delayed_extent_op returned %d", 2596 ret); 2597 btrfs_delayed_ref_unlock(locked_ref); 2598 return ret; 2599 } 2600 continue; 2601 } 2602 2603 /* 2604 * Need to drop our head ref lock and re-acquire the 2605 * delayed ref lock and then re-check to make sure 2606 * nobody got added. 2607 */ 2608 spin_unlock(&locked_ref->lock); 2609 spin_lock(&delayed_refs->lock); 2610 spin_lock(&locked_ref->lock); 2611 if (!list_empty(&locked_ref->ref_list) || 2612 locked_ref->extent_op) { 2613 spin_unlock(&locked_ref->lock); 2614 spin_unlock(&delayed_refs->lock); 2615 continue; 2616 } 2617 ref->in_tree = 0; 2618 delayed_refs->num_heads--; 2619 rb_erase(&locked_ref->href_node, 2620 &delayed_refs->href_root); 2621 spin_unlock(&delayed_refs->lock); 2622 } else { 2623 actual_count++; 2624 ref->in_tree = 0; 2625 list_del(&ref->list); 2626 if (!list_empty(&ref->add_list)) 2627 list_del(&ref->add_list); 2628 } 2629 atomic_dec(&delayed_refs->num_entries); 2630 2631 if (!btrfs_delayed_ref_is_head(ref)) { 2632 /* 2633 * when we play the delayed ref, also correct the 2634 * ref_mod on head 2635 */ 2636 switch (ref->action) { 2637 case BTRFS_ADD_DELAYED_REF: 2638 case BTRFS_ADD_DELAYED_EXTENT: 2639 locked_ref->node.ref_mod -= ref->ref_mod; 2640 break; 2641 case BTRFS_DROP_DELAYED_REF: 2642 locked_ref->node.ref_mod += ref->ref_mod; 2643 break; 2644 default: 2645 WARN_ON(1); 2646 } 2647 } 2648 spin_unlock(&locked_ref->lock); 2649 2650 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, 2651 must_insert_reserved); 2652 2653 btrfs_free_delayed_extent_op(extent_op); 2654 if (ret) { 2655 spin_lock(&delayed_refs->lock); 2656 locked_ref->processing = 0; 2657 delayed_refs->num_heads_ready++; 2658 spin_unlock(&delayed_refs->lock); 2659 btrfs_delayed_ref_unlock(locked_ref); 2660 btrfs_put_delayed_ref(ref); 2661 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2662 ret); 2663 return ret; 2664 } 2665 2666 /* 2667 * If this node is a head, that means all the refs in this head 2668 * have been dealt with, and we will pick the next head to deal 2669 * with, so we must unlock the head and drop it from the cluster 2670 * list before we release it.
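* (the final put may free the head, so it must not still be locked or listed at that point)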
2671 */ 2672 if (btrfs_delayed_ref_is_head(ref)) { 2673 if (locked_ref->is_data && 2674 locked_ref->total_ref_mod < 0) { 2675 spin_lock(&delayed_refs->lock); 2676 delayed_refs->pending_csums -= ref->num_bytes; 2677 spin_unlock(&delayed_refs->lock); 2678 } 2679 btrfs_delayed_ref_unlock(locked_ref); 2680 locked_ref = NULL; 2681 } 2682 btrfs_put_delayed_ref(ref); 2683 count++; 2684 cond_resched(); 2685 } 2686 2687 /* 2688 * We don't want to include ref heads since we can have empty ref heads 2689 * and those will drastically skew our runtime down since we just do 2690 * accounting, no actual extent tree updates. 2691 */ 2692 if (actual_count > 0) { 2693 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2694 u64 avg; 2695 2696 /* 2697 * We weigh the current average higher than our current runtime 2698 * to avoid large swings in the average. 2699 */ 2700 spin_lock(&delayed_refs->lock); 2701 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2702 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2703 spin_unlock(&delayed_refs->lock); 2704 } 2705 return 0; 2706 } 2707 2708 #ifdef SCRAMBLE_DELAYED_REFS 2709 /* 2710 * Normally delayed refs get processed in ascending bytenr order. This 2711 * correlates in most cases to the order added. To expose dependencies on this 2712 * order, we start to process the tree in the middle instead of the beginning 2713 */ 2714 static u64 find_middle(struct rb_root *root) 2715 { 2716 struct rb_node *n = root->rb_node; 2717 struct btrfs_delayed_ref_node *entry; 2718 int alt = 1; 2719 u64 middle; 2720 u64 first = 0, last = 0; 2721 2722 n = rb_first(root); 2723 if (n) { 2724 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2725 first = entry->bytenr; 2726 } 2727 n = rb_last(root); 2728 if (n) { 2729 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2730 last = entry->bytenr; 2731 } 2732 n = root->rb_node; 2733 2734 while (n) { 2735 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2736 WARN_ON(!entry->in_tree); 2737 2738 middle = entry->bytenr; 2739 2740 if (alt) 2741 n = n->rb_left; 2742 else 2743 n = n->rb_right; 2744 2745 alt = 1 - alt; 2746 } 2747 return middle; 2748 } 2749 #endif 2750 2751 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2752 { 2753 u64 num_bytes; 2754 2755 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2756 sizeof(struct btrfs_extent_inline_ref)); 2757 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2758 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2759 2760 /* 2761 * We don't ever fill up leaves all the way so multiply by 2 just to be 2762 * closer to what we're really going to want to use. 2763 */ 2764 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2765 } 2766 2767 /* 2768 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2769 * would require to store the csums for that many bytes. 
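* E.g. with 4KiB sectors, 4 byte crc32c csums and a ~16KiB max item size, one leaf holds roughly 4000 csums and so covers about 16MiB of data; csumming 1GiB then needs on the order of 64 leaves. The real numbers come from the superblock and fs geometry at runtime.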
2770 */ 2771 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2772 { 2773 u64 csum_size; 2774 u64 num_csums_per_leaf; 2775 u64 num_csums; 2776 2777 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2778 num_csums_per_leaf = div64_u64(csum_size, 2779 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2780 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2781 num_csums += num_csums_per_leaf - 1; 2782 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2783 return num_csums; 2784 } 2785 2786 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2787 struct btrfs_fs_info *fs_info) 2788 { 2789 struct btrfs_block_rsv *global_rsv; 2790 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2791 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; 2792 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; 2793 u64 num_bytes, num_dirty_bgs_bytes; 2794 int ret = 0; 2795 2796 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 2797 num_heads = heads_to_leaves(fs_info, num_heads); 2798 if (num_heads > 1) 2799 num_bytes += (num_heads - 1) * fs_info->nodesize; 2800 num_bytes <<= 1; 2801 num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * 2802 fs_info->nodesize; 2803 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, 2804 num_dirty_bgs); 2805 global_rsv = &fs_info->global_block_rsv; 2806 2807 /* 2808 * If we can't allocate any more chunks lets make sure we have _lots_ of 2809 * wiggle room since running delayed refs can create more delayed refs. 2810 */ 2811 if (global_rsv->space_info->full) { 2812 num_dirty_bgs_bytes <<= 1; 2813 num_bytes <<= 1; 2814 } 2815 2816 spin_lock(&global_rsv->lock); 2817 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) 2818 ret = 1; 2819 spin_unlock(&global_rsv->lock); 2820 return ret; 2821 } 2822 2823 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2824 struct btrfs_fs_info *fs_info) 2825 { 2826 u64 num_entries = 2827 atomic_read(&trans->transaction->delayed_refs.num_entries); 2828 u64 avg_runtime; 2829 u64 val; 2830 2831 smp_mb(); 2832 avg_runtime = fs_info->avg_delayed_ref_runtime; 2833 val = num_entries * avg_runtime; 2834 if (val >= NSEC_PER_SEC) 2835 return 1; 2836 if (val >= NSEC_PER_SEC / 2) 2837 return 2; 2838 2839 return btrfs_check_space_for_delayed_refs(trans, fs_info); 2840 } 2841 2842 struct async_delayed_refs { 2843 struct btrfs_root *root; 2844 u64 transid; 2845 int count; 2846 int error; 2847 int sync; 2848 struct completion wait; 2849 struct btrfs_work work; 2850 }; 2851 2852 static inline struct async_delayed_refs * 2853 to_async_delayed_refs(struct btrfs_work *work) 2854 { 2855 return container_of(work, struct async_delayed_refs, work); 2856 } 2857 2858 static void delayed_ref_async_start(struct btrfs_work *work) 2859 { 2860 struct async_delayed_refs *async = to_async_delayed_refs(work); 2861 struct btrfs_trans_handle *trans; 2862 struct btrfs_fs_info *fs_info = async->root->fs_info; 2863 int ret; 2864 2865 /* if the commit is already started, we don't need to wait here */ 2866 if (btrfs_transaction_blocked(fs_info)) 2867 goto done; 2868 2869 trans = btrfs_join_transaction(async->root); 2870 if (IS_ERR(trans)) { 2871 async->error = PTR_ERR(trans); 2872 goto done; 2873 } 2874 2875 /* 2876 * trans->sync means that when we call end_transaction, we won't 2877 * wait on delayed refs 2878 */ 2879 trans->sync = true; 2880 2881 /* Don't bother flushing if we got into a different transaction */ 2882 if (trans->transid > 
async->transid) 2883 goto end; 2884 2885 ret = btrfs_run_delayed_refs(trans, fs_info, async->count); 2886 if (ret) 2887 async->error = ret; 2888 end: 2889 ret = btrfs_end_transaction(trans); 2890 if (ret && !async->error) 2891 async->error = ret; 2892 done: 2893 if (async->sync) 2894 complete(&async->wait); 2895 else 2896 kfree(async); 2897 } 2898 2899 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, 2900 unsigned long count, u64 transid, int wait) 2901 { 2902 struct async_delayed_refs *async; 2903 int ret; 2904 2905 async = kmalloc(sizeof(*async), GFP_NOFS); 2906 if (!async) 2907 return -ENOMEM; 2908 2909 async->root = fs_info->tree_root; 2910 async->count = count; 2911 async->error = 0; 2912 async->transid = transid; 2913 if (wait) 2914 async->sync = 1; 2915 else 2916 async->sync = 0; 2917 init_completion(&async->wait); 2918 2919 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2920 delayed_ref_async_start, NULL, NULL); 2921 2922 btrfs_queue_work(fs_info->extent_workers, &async->work); 2923 2924 if (wait) { 2925 wait_for_completion(&async->wait); 2926 ret = async->error; 2927 kfree(async); 2928 return ret; 2929 } 2930 return 0; 2931 } 2932 2933 /* 2934 * this starts processing the delayed reference count updates and 2935 * extent insertions we have queued up so far. count can be 2936 * 0, which means to process everything in the tree at the start 2937 * of the run (but not newly added entries), or it can be some target 2938 * number you'd like to process. 2939 * 2940 * Returns 0 on success or if called with an aborted transaction 2941 * Returns <0 on error and aborts the transaction 2942 */ 2943 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2944 struct btrfs_fs_info *fs_info, unsigned long count) 2945 { 2946 struct rb_node *node; 2947 struct btrfs_delayed_ref_root *delayed_refs; 2948 struct btrfs_delayed_ref_head *head; 2949 int ret; 2950 int run_all = count == (unsigned long)-1; 2951 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 2952 2953 /* We'll clean this up in btrfs_cleanup_transaction */ 2954 if (trans->aborted) 2955 return 0; 2956 2957 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2958 return 0; 2959 2960 delayed_refs = &trans->transaction->delayed_refs; 2961 if (count == 0) 2962 count = atomic_read(&delayed_refs->num_entries) * 2; 2963 2964 again: 2965 #ifdef SCRAMBLE_DELAYED_REFS 2966 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2967 #endif 2968 trans->can_flush_pending_bgs = false; 2969 ret = __btrfs_run_delayed_refs(trans, fs_info, count); 2970 if (ret < 0) { 2971 btrfs_abort_transaction(trans, ret); 2972 return ret; 2973 } 2974 2975 if (run_all) { 2976 if (!list_empty(&trans->new_bgs)) 2977 btrfs_create_pending_block_groups(trans, fs_info); 2978 2979 spin_lock(&delayed_refs->lock); 2980 node = rb_first(&delayed_refs->href_root); 2981 if (!node) { 2982 spin_unlock(&delayed_refs->lock); 2983 goto out; 2984 } 2985 2986 while (node) { 2987 head = rb_entry(node, struct btrfs_delayed_ref_head, 2988 href_node); 2989 if (btrfs_delayed_ref_is_head(&head->node)) { 2990 struct btrfs_delayed_ref_node *ref; 2991 2992 ref = &head->node; 2993 refcount_inc(&ref->refs); 2994 2995 spin_unlock(&delayed_refs->lock); 2996 /* 2997 * Mutex was contended, block until it's 2998 * released and try again 2999 */ 3000 mutex_lock(&head->mutex); 3001 mutex_unlock(&head->mutex); 3002 3003 btrfs_put_delayed_ref(ref); 3004 cond_resched(); 3005 goto again; 3006 } else { 3007 WARN_ON(1); 3008 } 3009 node = 
rb_next(node); 3010 } 3011 spin_unlock(&delayed_refs->lock); 3012 cond_resched(); 3013 goto again; 3014 } 3015 out: 3016 trans->can_flush_pending_bgs = can_flush_pending_bgs; 3017 return 0; 3018 } 3019 3020 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3021 struct btrfs_fs_info *fs_info, 3022 u64 bytenr, u64 num_bytes, u64 flags, 3023 int level, int is_data) 3024 { 3025 struct btrfs_delayed_extent_op *extent_op; 3026 int ret; 3027 3028 extent_op = btrfs_alloc_delayed_extent_op(); 3029 if (!extent_op) 3030 return -ENOMEM; 3031 3032 extent_op->flags_to_set = flags; 3033 extent_op->update_flags = true; 3034 extent_op->update_key = false; 3035 extent_op->is_data = is_data ? true : false; 3036 extent_op->level = level; 3037 3038 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, 3039 num_bytes, extent_op); 3040 if (ret) 3041 btrfs_free_delayed_extent_op(extent_op); 3042 return ret; 3043 } 3044 3045 static noinline int check_delayed_ref(struct btrfs_root *root, 3046 struct btrfs_path *path, 3047 u64 objectid, u64 offset, u64 bytenr) 3048 { 3049 struct btrfs_delayed_ref_head *head; 3050 struct btrfs_delayed_ref_node *ref; 3051 struct btrfs_delayed_data_ref *data_ref; 3052 struct btrfs_delayed_ref_root *delayed_refs; 3053 struct btrfs_transaction *cur_trans; 3054 int ret = 0; 3055 3056 cur_trans = root->fs_info->running_transaction; 3057 if (!cur_trans) 3058 return 0; 3059 3060 delayed_refs = &cur_trans->delayed_refs; 3061 spin_lock(&delayed_refs->lock); 3062 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3063 if (!head) { 3064 spin_unlock(&delayed_refs->lock); 3065 return 0; 3066 } 3067 3068 if (!mutex_trylock(&head->mutex)) { 3069 refcount_inc(&head->node.refs); 3070 spin_unlock(&delayed_refs->lock); 3071 3072 btrfs_release_path(path); 3073 3074 /* 3075 * Mutex was contended, block until it's released and let 3076 * caller try again 3077 */ 3078 mutex_lock(&head->mutex); 3079 mutex_unlock(&head->mutex); 3080 btrfs_put_delayed_ref(&head->node); 3081 return -EAGAIN; 3082 } 3083 spin_unlock(&delayed_refs->lock); 3084 3085 spin_lock(&head->lock); 3086 list_for_each_entry(ref, &head->ref_list, list) { 3087 /* If it's a shared ref we know a cross reference exists */ 3088 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3089 ret = 1; 3090 break; 3091 } 3092 3093 data_ref = btrfs_delayed_node_to_data_ref(ref); 3094 3095 /* 3096 * If our ref doesn't match the one we're currently looking at 3097 * then we have a cross reference. 
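* That is, some other root, inode or file offset also references this extent, e.g. through a snapshot or a reflink copy.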
3098 */ 3099 if (data_ref->root != root->root_key.objectid || 3100 data_ref->objectid != objectid || 3101 data_ref->offset != offset) { 3102 ret = 1; 3103 break; 3104 } 3105 } 3106 spin_unlock(&head->lock); 3107 mutex_unlock(&head->mutex); 3108 return ret; 3109 } 3110 3111 static noinline int check_committed_ref(struct btrfs_root *root, 3112 struct btrfs_path *path, 3113 u64 objectid, u64 offset, u64 bytenr) 3114 { 3115 struct btrfs_fs_info *fs_info = root->fs_info; 3116 struct btrfs_root *extent_root = fs_info->extent_root; 3117 struct extent_buffer *leaf; 3118 struct btrfs_extent_data_ref *ref; 3119 struct btrfs_extent_inline_ref *iref; 3120 struct btrfs_extent_item *ei; 3121 struct btrfs_key key; 3122 u32 item_size; 3123 int ret; 3124 3125 key.objectid = bytenr; 3126 key.offset = (u64)-1; 3127 key.type = BTRFS_EXTENT_ITEM_KEY; 3128 3129 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3130 if (ret < 0) 3131 goto out; 3132 BUG_ON(ret == 0); /* Corruption */ 3133 3134 ret = -ENOENT; 3135 if (path->slots[0] == 0) 3136 goto out; 3137 3138 path->slots[0]--; 3139 leaf = path->nodes[0]; 3140 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3141 3142 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3143 goto out; 3144 3145 ret = 1; 3146 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3147 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3148 if (item_size < sizeof(*ei)) { 3149 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3150 goto out; 3151 } 3152 #endif 3153 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3154 3155 if (item_size != sizeof(*ei) + 3156 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3157 goto out; 3158 3159 if (btrfs_extent_generation(leaf, ei) <= 3160 btrfs_root_last_snapshot(&root->root_item)) 3161 goto out; 3162 3163 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3164 if (btrfs_extent_inline_ref_type(leaf, iref) != 3165 BTRFS_EXTENT_DATA_REF_KEY) 3166 goto out; 3167 3168 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3169 if (btrfs_extent_refs(leaf, ei) != 3170 btrfs_extent_data_ref_count(leaf, ref) || 3171 btrfs_extent_data_ref_root(leaf, ref) != 3172 root->root_key.objectid || 3173 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3174 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3175 goto out; 3176 3177 ret = 0; 3178 out: 3179 return ret; 3180 } 3181 3182 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 3183 u64 bytenr) 3184 { 3185 struct btrfs_path *path; 3186 int ret; 3187 int ret2; 3188 3189 path = btrfs_alloc_path(); 3190 if (!path) 3191 return -ENOENT; 3192 3193 do { 3194 ret = check_committed_ref(root, path, objectid, 3195 offset, bytenr); 3196 if (ret && ret != -ENOENT) 3197 goto out; 3198 3199 ret2 = check_delayed_ref(root, path, objectid, 3200 offset, bytenr); 3201 } while (ret2 == -EAGAIN); 3202 3203 if (ret2 && ret2 != -ENOENT) { 3204 ret = ret2; 3205 goto out; 3206 } 3207 3208 if (ret != -ENOENT || ret2 != -ENOENT) 3209 ret = 0; 3210 out: 3211 btrfs_free_path(path); 3212 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3213 WARN_ON(ret > 0); 3214 return ret; 3215 } 3216 3217 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3218 struct btrfs_root *root, 3219 struct extent_buffer *buf, 3220 int full_backref, int inc) 3221 { 3222 struct btrfs_fs_info *fs_info = root->fs_info; 3223 u64 bytenr; 3224 u64 num_bytes; 3225 u64 parent; 3226 u64 ref_root; 3227 u32 nritems; 3228 struct btrfs_key key; 3229 struct 
btrfs_file_extent_item *fi; 3230 int i; 3231 int level; 3232 int ret = 0; 3233 int (*process_func)(struct btrfs_trans_handle *, 3234 struct btrfs_fs_info *, 3235 u64, u64, u64, u64, u64, u64); 3236 3237 3238 if (btrfs_is_testing(fs_info)) 3239 return 0; 3240 3241 ref_root = btrfs_header_owner(buf); 3242 nritems = btrfs_header_nritems(buf); 3243 level = btrfs_header_level(buf); 3244 3245 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3246 return 0; 3247 3248 if (inc) 3249 process_func = btrfs_inc_extent_ref; 3250 else 3251 process_func = btrfs_free_extent; 3252 3253 if (full_backref) 3254 parent = buf->start; 3255 else 3256 parent = 0; 3257 3258 for (i = 0; i < nritems; i++) { 3259 if (level == 0) { 3260 btrfs_item_key_to_cpu(buf, &key, i); 3261 if (key.type != BTRFS_EXTENT_DATA_KEY) 3262 continue; 3263 fi = btrfs_item_ptr(buf, i, 3264 struct btrfs_file_extent_item); 3265 if (btrfs_file_extent_type(buf, fi) == 3266 BTRFS_FILE_EXTENT_INLINE) 3267 continue; 3268 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3269 if (bytenr == 0) 3270 continue; 3271 3272 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3273 key.offset -= btrfs_file_extent_offset(buf, fi); 3274 ret = process_func(trans, fs_info, bytenr, num_bytes, 3275 parent, ref_root, key.objectid, 3276 key.offset); 3277 if (ret) 3278 goto fail; 3279 } else { 3280 bytenr = btrfs_node_blockptr(buf, i); 3281 num_bytes = fs_info->nodesize; 3282 ret = process_func(trans, fs_info, bytenr, num_bytes, 3283 parent, ref_root, level - 1, 0); 3284 if (ret) 3285 goto fail; 3286 } 3287 } 3288 return 0; 3289 fail: 3290 return ret; 3291 } 3292 3293 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3294 struct extent_buffer *buf, int full_backref) 3295 { 3296 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3297 } 3298 3299 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3300 struct extent_buffer *buf, int full_backref) 3301 { 3302 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3303 } 3304 3305 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3306 struct btrfs_fs_info *fs_info, 3307 struct btrfs_path *path, 3308 struct btrfs_block_group_cache *cache) 3309 { 3310 int ret; 3311 struct btrfs_root *extent_root = fs_info->extent_root; 3312 unsigned long bi; 3313 struct extent_buffer *leaf; 3314 3315 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3316 if (ret) { 3317 if (ret > 0) 3318 ret = -ENOENT; 3319 goto fail; 3320 } 3321 3322 leaf = path->nodes[0]; 3323 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3324 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3325 btrfs_mark_buffer_dirty(leaf); 3326 fail: 3327 btrfs_release_path(path); 3328 return ret; 3329 3330 } 3331 3332 static struct btrfs_block_group_cache * 3333 next_block_group(struct btrfs_fs_info *fs_info, 3334 struct btrfs_block_group_cache *cache) 3335 { 3336 struct rb_node *node; 3337 3338 spin_lock(&fs_info->block_group_cache_lock); 3339 3340 /* If our block group was removed, we need a full search. 
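* Removal unlinks cache_node from the rb tree (leaving the node empty), so rb_next() on it would be unsafe; restart from the following bytenr instead.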
*/ 3341 if (RB_EMPTY_NODE(&cache->cache_node)) { 3342 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3343 3344 spin_unlock(&fs_info->block_group_cache_lock); 3345 btrfs_put_block_group(cache); 3346 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3347 } 3348 node = rb_next(&cache->cache_node); 3349 btrfs_put_block_group(cache); 3350 if (node) { 3351 cache = rb_entry(node, struct btrfs_block_group_cache, 3352 cache_node); 3353 btrfs_get_block_group(cache); 3354 } else 3355 cache = NULL; 3356 spin_unlock(&fs_info->block_group_cache_lock); 3357 return cache; 3358 } 3359 3360 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3361 struct btrfs_trans_handle *trans, 3362 struct btrfs_path *path) 3363 { 3364 struct btrfs_fs_info *fs_info = block_group->fs_info; 3365 struct btrfs_root *root = fs_info->tree_root; 3366 struct inode *inode = NULL; 3367 u64 alloc_hint = 0; 3368 int dcs = BTRFS_DC_ERROR; 3369 u64 num_pages = 0; 3370 int retries = 0; 3371 int ret = 0; 3372 3373 /* 3374 * If this block group is smaller than 100 megs, don't bother caching the 3375 * block group. 3376 */ 3377 if (block_group->key.offset < (100 * SZ_1M)) { 3378 spin_lock(&block_group->lock); 3379 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3380 spin_unlock(&block_group->lock); 3381 return 0; 3382 } 3383 3384 if (trans->aborted) 3385 return 0; 3386 again: 3387 inode = lookup_free_space_inode(fs_info, block_group, path); 3388 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3389 ret = PTR_ERR(inode); 3390 btrfs_release_path(path); 3391 goto out; 3392 } 3393 3394 if (IS_ERR(inode)) { 3395 BUG_ON(retries); 3396 retries++; 3397 3398 if (block_group->ro) 3399 goto out_free; 3400 3401 ret = create_free_space_inode(fs_info, trans, block_group, 3402 path); 3403 if (ret) 3404 goto out_free; 3405 goto again; 3406 } 3407 3408 /* We've already set up this transaction, go ahead and exit */ 3409 if (block_group->cache_generation == trans->transid && 3410 i_size_read(inode)) { 3411 dcs = BTRFS_DC_SETUP; 3412 goto out_put; 3413 } 3414 3415 /* 3416 * We want to set the generation to 0, that way if anything goes wrong 3417 * from here on out we know not to trust this cache when we load up next 3418 * time. 3419 */ 3420 BTRFS_I(inode)->generation = 0; 3421 ret = btrfs_update_inode(trans, root, inode); 3422 if (ret) { 3423 /* 3424 * So theoretically we could recover from this, simply set the 3425 * super cache generation to 0 so we know to invalidate the 3426 * cache, but then we'd have to keep track of the block groups 3427 * that fail this way so we know we _have_ to reset this cache 3428 * before the next commit or risk reading stale cache. So to 3429 * limit our exposure to horrible edge cases let's just abort the 3430 * transaction; this only happens in really bad situations 3431 * anyway.
3432 */ 3433 btrfs_abort_transaction(trans, ret); 3434 goto out_put; 3435 } 3436 WARN_ON(ret); 3437 3438 if (i_size_read(inode) > 0) { 3439 ret = btrfs_check_trunc_cache_free_space(fs_info, 3440 &fs_info->global_block_rsv); 3441 if (ret) 3442 goto out_put; 3443 3444 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3445 if (ret) 3446 goto out_put; 3447 } 3448 3449 spin_lock(&block_group->lock); 3450 if (block_group->cached != BTRFS_CACHE_FINISHED || 3451 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3452 /* 3453 * don't bother trying to write stuff out _if_ 3454 * a) we're not cached, 3455 * b) we're with nospace_cache mount option, 3456 * c) we're with v2 space_cache (FREE_SPACE_TREE). 3457 */ 3458 dcs = BTRFS_DC_WRITTEN; 3459 spin_unlock(&block_group->lock); 3460 goto out_put; 3461 } 3462 spin_unlock(&block_group->lock); 3463 3464 /* 3465 * We hit an ENOSPC when setting up the cache in this transaction, just 3466 * skip doing the setup, we've already cleared the cache so we're safe. 3467 */ 3468 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3469 ret = -ENOSPC; 3470 goto out_put; 3471 } 3472 3473 /* 3474 * Try to preallocate enough space based on how big the block group is. 3475 * Keep in mind this has to include any pinned space which could end up 3476 * taking up quite a bit since it's not folded into the other space 3477 * cache. 3478 */ 3479 num_pages = div_u64(block_group->key.offset, SZ_256M); 3480 if (!num_pages) 3481 num_pages = 1; 3482 3483 num_pages *= 16; 3484 num_pages *= PAGE_SIZE; 3485 3486 ret = btrfs_check_data_free_space(inode, 0, num_pages); 3487 if (ret) 3488 goto out_put; 3489 3490 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3491 num_pages, num_pages, 3492 &alloc_hint); 3493 /* 3494 * Our cache requires contiguous chunks so that we don't modify a bunch 3495 * of metadata or split extents when writing the cache out, which means 3496 * we can enospc if we are heavily fragmented in addition to just normal 3497 * out of space conditions. So if we hit this just skip setting up any 3498 * other block groups for this transaction, maybe we'll unpin enough 3499 * space the next time around. 
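* The sizing above works out to 16 pages of cache per 256MiB of block group, e.g. a 1GiB block group preallocates 4 * 16 * 4KiB = 256KiB with 4KiB pages.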
3500 */ 3501 if (!ret) 3502 dcs = BTRFS_DC_SETUP; 3503 else if (ret == -ENOSPC) 3504 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3505 3506 out_put: 3507 iput(inode); 3508 out_free: 3509 btrfs_release_path(path); 3510 out: 3511 spin_lock(&block_group->lock); 3512 if (!ret && dcs == BTRFS_DC_SETUP) 3513 block_group->cache_generation = trans->transid; 3514 block_group->disk_cache_state = dcs; 3515 spin_unlock(&block_group->lock); 3516 3517 return ret; 3518 } 3519 3520 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3521 struct btrfs_fs_info *fs_info) 3522 { 3523 struct btrfs_block_group_cache *cache, *tmp; 3524 struct btrfs_transaction *cur_trans = trans->transaction; 3525 struct btrfs_path *path; 3526 3527 if (list_empty(&cur_trans->dirty_bgs) || 3528 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3529 return 0; 3530 3531 path = btrfs_alloc_path(); 3532 if (!path) 3533 return -ENOMEM; 3534 3535 /* Could add new block groups, use _safe just in case */ 3536 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3537 dirty_list) { 3538 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3539 cache_save_setup(cache, trans, path); 3540 } 3541 3542 btrfs_free_path(path); 3543 return 0; 3544 } 3545 3546 /* 3547 * transaction commit does final block group cache writeback during a 3548 * critical section where nothing is allowed to change the FS. This is 3549 * required in order for the cache to actually match the block group, 3550 * but can introduce a lot of latency into the commit. 3551 * 3552 * So, btrfs_start_dirty_block_groups is here to kick off block group 3553 * cache IO. There's a chance we'll have to redo some of it if the 3554 * block group changes again during the commit, but it greatly reduces 3555 * the commit latency by getting rid of the easy block groups while 3556 * we're still allowing others to join the commit. 3557 */ 3558 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3559 struct btrfs_fs_info *fs_info) 3560 { 3561 struct btrfs_block_group_cache *cache; 3562 struct btrfs_transaction *cur_trans = trans->transaction; 3563 int ret = 0; 3564 int should_put; 3565 struct btrfs_path *path = NULL; 3566 LIST_HEAD(dirty); 3567 struct list_head *io = &cur_trans->io_bgs; 3568 int num_started = 0; 3569 int loops = 0; 3570 3571 spin_lock(&cur_trans->dirty_bgs_lock); 3572 if (list_empty(&cur_trans->dirty_bgs)) { 3573 spin_unlock(&cur_trans->dirty_bgs_lock); 3574 return 0; 3575 } 3576 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3577 spin_unlock(&cur_trans->dirty_bgs_lock); 3578 3579 again: 3580 /* 3581 * make sure all the block groups on our dirty list actually 3582 * exist 3583 */ 3584 btrfs_create_pending_block_groups(trans, fs_info); 3585 3586 if (!path) { 3587 path = btrfs_alloc_path(); 3588 if (!path) 3589 return -ENOMEM; 3590 } 3591 3592 /* 3593 * cache_write_mutex is here only to save us from balance or automatic 3594 * removal of empty block groups deleting this block group while we are 3595 * writing out the cache 3596 */ 3597 mutex_lock(&trans->transaction->cache_write_mutex); 3598 while (!list_empty(&dirty)) { 3599 cache = list_first_entry(&dirty, 3600 struct btrfs_block_group_cache, 3601 dirty_list); 3602 /* 3603 * this can happen if something re-dirties a block 3604 * group that is already under IO. 
Just wait for it to 3605 * finish and then do it all again 3606 */ 3607 if (!list_empty(&cache->io_list)) { 3608 list_del_init(&cache->io_list); 3609 btrfs_wait_cache_io(trans, cache, path); 3610 btrfs_put_block_group(cache); 3611 } 3612 3613 3614 /* 3615 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3616 * if it should update the cache_state. Don't delete 3617 * until after we wait. 3618 * 3619 * Since we're not running in the commit critical section 3620 * we need the dirty_bgs_lock to protect from update_block_group 3621 */ 3622 spin_lock(&cur_trans->dirty_bgs_lock); 3623 list_del_init(&cache->dirty_list); 3624 spin_unlock(&cur_trans->dirty_bgs_lock); 3625 3626 should_put = 1; 3627 3628 cache_save_setup(cache, trans, path); 3629 3630 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3631 cache->io_ctl.inode = NULL; 3632 ret = btrfs_write_out_cache(fs_info, trans, 3633 cache, path); 3634 if (ret == 0 && cache->io_ctl.inode) { 3635 num_started++; 3636 should_put = 0; 3637 3638 /* 3639 * the cache_write_mutex is protecting 3640 * the io_list 3641 */ 3642 list_add_tail(&cache->io_list, io); 3643 } else { 3644 /* 3645 * if we failed to write the cache, the 3646 * generation will be bad and life goes on 3647 */ 3648 ret = 0; 3649 } 3650 } 3651 if (!ret) { 3652 ret = write_one_cache_group(trans, fs_info, 3653 path, cache); 3654 /* 3655 * Our block group might still be attached to the list 3656 * of new block groups in the transaction handle of some 3657 * other task (struct btrfs_trans_handle->new_bgs). This 3658 * means its block group item isn't yet in the extent 3659 * tree. If this happens ignore the error, as we will 3660 * try again later in the critical section of the 3661 * transaction commit. 3662 */ 3663 if (ret == -ENOENT) { 3664 ret = 0; 3665 spin_lock(&cur_trans->dirty_bgs_lock); 3666 if (list_empty(&cache->dirty_list)) { 3667 list_add_tail(&cache->dirty_list, 3668 &cur_trans->dirty_bgs); 3669 btrfs_get_block_group(cache); 3670 } 3671 spin_unlock(&cur_trans->dirty_bgs_lock); 3672 } else if (ret) { 3673 btrfs_abort_transaction(trans, ret); 3674 } 3675 } 3676 3677 /* if it's not on the io list, we need to put the block group */ 3678 if (should_put) 3679 btrfs_put_block_group(cache); 3680 3681 if (ret) 3682 break; 3683 3684 /* 3685 * Avoid blocking other tasks for too long. It might even save 3686 * us from writing caches for block groups that are going to be 3687 * removed. 3688 */ 3689 mutex_unlock(&trans->transaction->cache_write_mutex); 3690 mutex_lock(&trans->transaction->cache_write_mutex); 3691 } 3692 mutex_unlock(&trans->transaction->cache_write_mutex); 3693 3694 /* 3695 * go through delayed refs for all the stuff we've just kicked off 3696 * and then loop back (just once) 3697 */ 3698 ret = btrfs_run_delayed_refs(trans, fs_info, 0); 3699 if (!ret && loops == 0) { 3700 loops++; 3701 spin_lock(&cur_trans->dirty_bgs_lock); 3702 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3703 /* 3704 * dirty_bgs_lock protects us from concurrent block group 3705 * deletes too (not just cache_write_mutex).
3706 */ 3707 if (!list_empty(&dirty)) { 3708 spin_unlock(&cur_trans->dirty_bgs_lock); 3709 goto again; 3710 } 3711 spin_unlock(&cur_trans->dirty_bgs_lock); 3712 } else if (ret < 0) { 3713 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3714 } 3715 3716 btrfs_free_path(path); 3717 return ret; 3718 } 3719 3720 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3721 struct btrfs_fs_info *fs_info) 3722 { 3723 struct btrfs_block_group_cache *cache; 3724 struct btrfs_transaction *cur_trans = trans->transaction; 3725 int ret = 0; 3726 int should_put; 3727 struct btrfs_path *path; 3728 struct list_head *io = &cur_trans->io_bgs; 3729 int num_started = 0; 3730 3731 path = btrfs_alloc_path(); 3732 if (!path) 3733 return -ENOMEM; 3734 3735 /* 3736 * Even though we are in the critical section of the transaction commit, 3737 * we can still have concurrent tasks adding elements to this 3738 * transaction's list of dirty block groups. These tasks correspond to 3739 * endio free space workers started when writeback finishes for a 3740 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3741 * allocate new block groups as a result of COWing nodes of the root 3742 * tree when updating the free space inode. The writeback for the space 3743 * caches is triggered by an earlier call to 3744 * btrfs_start_dirty_block_groups() and iterations of the following 3745 * loop. 3746 * Also we want to do the cache_save_setup first and then run the 3747 * delayed refs to make sure we have the best chance at doing this all 3748 * in one shot. 3749 */ 3750 spin_lock(&cur_trans->dirty_bgs_lock); 3751 while (!list_empty(&cur_trans->dirty_bgs)) { 3752 cache = list_first_entry(&cur_trans->dirty_bgs, 3753 struct btrfs_block_group_cache, 3754 dirty_list); 3755 3756 /* 3757 * this can happen if cache_save_setup re-dirties a block 3758 * group that is already under IO. Just wait for it to 3759 * finish and then do it all again 3760 */ 3761 if (!list_empty(&cache->io_list)) { 3762 spin_unlock(&cur_trans->dirty_bgs_lock); 3763 list_del_init(&cache->io_list); 3764 btrfs_wait_cache_io(trans, cache, path); 3765 btrfs_put_block_group(cache); 3766 spin_lock(&cur_trans->dirty_bgs_lock); 3767 } 3768 3769 /* 3770 * don't remove from the dirty list until after we've waited 3771 * on any pending IO 3772 */ 3773 list_del_init(&cache->dirty_list); 3774 spin_unlock(&cur_trans->dirty_bgs_lock); 3775 should_put = 1; 3776 3777 cache_save_setup(cache, trans, path); 3778 3779 if (!ret) 3780 ret = btrfs_run_delayed_refs(trans, fs_info, 3781 (unsigned long) -1); 3782 3783 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { 3784 cache->io_ctl.inode = NULL; 3785 ret = btrfs_write_out_cache(fs_info, trans, 3786 cache, path); 3787 if (ret == 0 && cache->io_ctl.inode) { 3788 num_started++; 3789 should_put = 0; 3790 list_add_tail(&cache->io_list, io); 3791 } else { 3792 /* 3793 * if we failed to write the cache, the 3794 * generation will be bad and life goes on 3795 */ 3796 ret = 0; 3797 } 3798 } 3799 if (!ret) { 3800 ret = write_one_cache_group(trans, fs_info, 3801 path, cache); 3802 /* 3803 * One of the free space endio workers might have 3804 * created a new block group while updating a free space 3805 * cache's inode (at inode.c:btrfs_finish_ordered_io()) 3806 * and hasn't released its transaction handle yet, in 3807 * which case the new block group is still attached to 3808 * its transaction handle and its creation has not 3809 * finished yet (no block group item in the extent tree 3810 * yet, etc). 
If this is the case, wait for all free 3811 * space endio workers to finish and retry. This is a 3812 * very rare case so no need for a more efficient and 3813 * complex approach. 3814 */ 3815 if (ret == -ENOENT) { 3816 wait_event(cur_trans->writer_wait, 3817 atomic_read(&cur_trans->num_writers) == 1); 3818 ret = write_one_cache_group(trans, fs_info, 3819 path, cache); 3820 } 3821 if (ret) 3822 btrfs_abort_transaction(trans, ret); 3823 } 3824 3825 /* if it's not on the io list, we need to put the block group */ 3826 if (should_put) 3827 btrfs_put_block_group(cache); 3828 spin_lock(&cur_trans->dirty_bgs_lock); 3829 } 3830 spin_unlock(&cur_trans->dirty_bgs_lock); 3831 3832 while (!list_empty(io)) { 3833 cache = list_first_entry(io, struct btrfs_block_group_cache, 3834 io_list); 3835 list_del_init(&cache->io_list); 3836 btrfs_wait_cache_io(trans, cache, path); 3837 btrfs_put_block_group(cache); 3838 } 3839 3840 btrfs_free_path(path); 3841 return ret; 3842 } 3843 3844 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3845 { 3846 struct btrfs_block_group_cache *block_group; 3847 int readonly = 0; 3848 3849 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3850 if (!block_group || block_group->ro) 3851 readonly = 1; 3852 if (block_group) 3853 btrfs_put_block_group(block_group); 3854 return readonly; 3855 } 3856 3857 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3858 { 3859 struct btrfs_block_group_cache *bg; 3860 bool ret = true; 3861 3862 bg = btrfs_lookup_block_group(fs_info, bytenr); 3863 if (!bg) 3864 return false; 3865 3866 spin_lock(&bg->lock); 3867 if (bg->ro) 3868 ret = false; 3869 else 3870 atomic_inc(&bg->nocow_writers); 3871 spin_unlock(&bg->lock); 3872 3873 /* no put on block group, done by btrfs_dec_nocow_writers */ 3874 if (!ret) 3875 btrfs_put_block_group(bg); 3876 3877 return ret; 3878 3879 } 3880 3881 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3882 { 3883 struct btrfs_block_group_cache *bg; 3884 3885 bg = btrfs_lookup_block_group(fs_info, bytenr); 3886 ASSERT(bg); 3887 if (atomic_dec_and_test(&bg->nocow_writers)) 3888 wake_up_atomic_t(&bg->nocow_writers); 3889 /* 3890 * Once for our lookup and once for the lookup done by a previous call 3891 * to btrfs_inc_nocow_writers() 3892 */ 3893 btrfs_put_block_group(bg); 3894 btrfs_put_block_group(bg); 3895 } 3896 3897 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) 3898 { 3899 schedule(); 3900 return 0; 3901 } 3902 3903 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3904 { 3905 wait_on_atomic_t(&bg->nocow_writers, 3906 btrfs_wait_nocow_writers_atomic_t, 3907 TASK_UNINTERRUPTIBLE); 3908 } 3909 3910 static const char *alloc_name(u64 flags) 3911 { 3912 switch (flags) { 3913 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3914 return "mixed"; 3915 case BTRFS_BLOCK_GROUP_METADATA: 3916 return "metadata"; 3917 case BTRFS_BLOCK_GROUP_DATA: 3918 return "data"; 3919 case BTRFS_BLOCK_GROUP_SYSTEM: 3920 return "system"; 3921 default: 3922 WARN_ON(1); 3923 return "invalid-combination"; 3924 } 3925 } 3926 3927 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3928 u64 total_bytes, u64 bytes_used, 3929 u64 bytes_readonly, 3930 struct btrfs_space_info **space_info) 3931 { 3932 struct btrfs_space_info *found; 3933 int i; 3934 int factor; 3935 int ret; 3936 3937 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3938 BTRFS_BLOCK_GROUP_RAID10)) 3939 factor = 2; 3940 else 3941 factor = 1; 3942 3943 found =
__find_space_info(info, flags); 3944 if (found) { 3945 spin_lock(&found->lock); 3946 found->total_bytes += total_bytes; 3947 found->disk_total += total_bytes * factor; 3948 found->bytes_used += bytes_used; 3949 found->disk_used += bytes_used * factor; 3950 found->bytes_readonly += bytes_readonly; 3951 if (total_bytes > 0) 3952 found->full = 0; 3953 space_info_add_new_bytes(info, found, total_bytes - 3954 bytes_used - bytes_readonly); 3955 spin_unlock(&found->lock); 3956 *space_info = found; 3957 return 0; 3958 } 3959 found = kzalloc(sizeof(*found), GFP_NOFS); 3960 if (!found) 3961 return -ENOMEM; 3962 3963 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3964 if (ret) { 3965 kfree(found); 3966 return ret; 3967 } 3968 3969 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3970 INIT_LIST_HEAD(&found->block_groups[i]); 3971 init_rwsem(&found->groups_sem); 3972 spin_lock_init(&found->lock); 3973 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3974 found->total_bytes = total_bytes; 3975 found->disk_total = total_bytes * factor; 3976 found->bytes_used = bytes_used; 3977 found->disk_used = bytes_used * factor; 3978 found->bytes_pinned = 0; 3979 found->bytes_reserved = 0; 3980 found->bytes_readonly = bytes_readonly; 3981 found->bytes_may_use = 0; 3982 found->full = 0; 3983 found->max_extent_size = 0; 3984 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3985 found->chunk_alloc = 0; 3986 found->flush = 0; 3987 init_waitqueue_head(&found->wait); 3988 INIT_LIST_HEAD(&found->ro_bgs); 3989 INIT_LIST_HEAD(&found->tickets); 3990 INIT_LIST_HEAD(&found->priority_tickets); 3991 3992 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3993 info->space_info_kobj, "%s", 3994 alloc_name(found->flags)); 3995 if (ret) { 3996 percpu_counter_destroy(&found->total_bytes_pinned); 3997 kfree(found); 3998 return ret; 3999 } 4000 4001 *space_info = found; 4002 list_add_rcu(&found->list, &info->space_info); 4003 if (flags & BTRFS_BLOCK_GROUP_DATA) 4004 info->data_sinfo = found; 4005 4006 return ret; 4007 } 4008 4009 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 4010 { 4011 u64 extra_flags = chunk_to_extended(flags) & 4012 BTRFS_EXTENDED_PROFILE_MASK; 4013 4014 write_seqlock(&fs_info->profiles_lock); 4015 if (flags & BTRFS_BLOCK_GROUP_DATA) 4016 fs_info->avail_data_alloc_bits |= extra_flags; 4017 if (flags & BTRFS_BLOCK_GROUP_METADATA) 4018 fs_info->avail_metadata_alloc_bits |= extra_flags; 4019 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4020 fs_info->avail_system_alloc_bits |= extra_flags; 4021 write_sequnlock(&fs_info->profiles_lock); 4022 } 4023 4024 /* 4025 * returns target flags in extended format or 0 if restripe for this 4026 * chunk_type is not in progress 4027 * 4028 * should be called with either volume_mutex or balance_lock held 4029 */ 4030 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4031 { 4032 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4033 u64 target = 0; 4034 4035 if (!bctl) 4036 return 0; 4037 4038 if (flags & BTRFS_BLOCK_GROUP_DATA && 4039 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4040 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4041 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4042 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4043 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4044 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4045 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4046 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4047 } 4048 4049 return target; 4050 } 4051 
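/*
 * Illustrative example (not from the original source): if a balance is
 * converting data chunks to RAID1, then for flags == BTRFS_BLOCK_GROUP_DATA
 * get_restripe_target() above returns
 * BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1 in extended format,
 * which extended_to_chunk() maps back to the on-disk chunk format once a
 * caller decides to use it.
 */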
4052 /* 4053 * @flags: available profiles in extended format (see ctree.h) 4054 * 4055 * Returns reduced profile in chunk format. If profile changing is in 4056 * progress (either running or paused) picks the target profile (if it's 4057 * already available), otherwise falls back to plain reducing. 4058 */ 4059 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 4060 { 4061 u64 num_devices = fs_info->fs_devices->rw_devices; 4062 u64 target; 4063 u64 raid_type; 4064 u64 allowed = 0; 4065 4066 /* 4067 * see if restripe for this chunk_type is in progress, if so 4068 * try to reduce to the target profile 4069 */ 4070 spin_lock(&fs_info->balance_lock); 4071 target = get_restripe_target(fs_info, flags); 4072 if (target) { 4073 /* pick target profile only if it's already available */ 4074 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4075 spin_unlock(&fs_info->balance_lock); 4076 return extended_to_chunk(target); 4077 } 4078 } 4079 spin_unlock(&fs_info->balance_lock); 4080 4081 /* First, mask out the RAID levels which aren't possible */ 4082 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4083 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4084 allowed |= btrfs_raid_group[raid_type]; 4085 } 4086 allowed &= flags; 4087 4088 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4089 allowed = BTRFS_BLOCK_GROUP_RAID6; 4090 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4091 allowed = BTRFS_BLOCK_GROUP_RAID5; 4092 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4093 allowed = BTRFS_BLOCK_GROUP_RAID10; 4094 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4095 allowed = BTRFS_BLOCK_GROUP_RAID1; 4096 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4097 allowed = BTRFS_BLOCK_GROUP_RAID0; 4098 4099 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4100 4101 return extended_to_chunk(flags | allowed); 4102 } 4103 4104 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 4105 { 4106 unsigned seq; 4107 u64 flags; 4108 4109 do { 4110 flags = orig_flags; 4111 seq = read_seqbegin(&fs_info->profiles_lock); 4112 4113 if (flags & BTRFS_BLOCK_GROUP_DATA) 4114 flags |= fs_info->avail_data_alloc_bits; 4115 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4116 flags |= fs_info->avail_system_alloc_bits; 4117 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4118 flags |= fs_info->avail_metadata_alloc_bits; 4119 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4120 4121 return btrfs_reduce_alloc_profile(fs_info, flags); 4122 } 4123 4124 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 4125 { 4126 struct btrfs_fs_info *fs_info = root->fs_info; 4127 u64 flags; 4128 u64 ret; 4129 4130 if (data) 4131 flags = BTRFS_BLOCK_GROUP_DATA; 4132 else if (root == fs_info->chunk_root) 4133 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4134 else 4135 flags = BTRFS_BLOCK_GROUP_METADATA; 4136 4137 ret = get_alloc_profile(fs_info, flags); 4138 return ret; 4139 } 4140 4141 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4142 bool may_use_included) 4143 { 4144 ASSERT(s_info); 4145 return s_info->bytes_used + s_info->bytes_reserved + 4146 s_info->bytes_pinned + s_info->bytes_readonly + 4147 (may_use_included ? 
s_info->bytes_may_use : 0); 4148 } 4149 4150 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 4151 { 4152 struct btrfs_space_info *data_sinfo; 4153 struct btrfs_root *root = inode->root; 4154 struct btrfs_fs_info *fs_info = root->fs_info; 4155 u64 used; 4156 int ret = 0; 4157 int need_commit = 2; 4158 int have_pinned_space; 4159 4160 /* make sure bytes are sectorsize aligned */ 4161 bytes = ALIGN(bytes, fs_info->sectorsize); 4162 4163 if (btrfs_is_free_space_inode(inode)) { 4164 need_commit = 0; 4165 ASSERT(current->journal_info); 4166 } 4167 4168 data_sinfo = fs_info->data_sinfo; 4169 if (!data_sinfo) 4170 goto alloc; 4171 4172 again: 4173 /* make sure we have enough space to handle the data first */ 4174 spin_lock(&data_sinfo->lock); 4175 used = btrfs_space_info_used(data_sinfo, true); 4176 4177 if (used + bytes > data_sinfo->total_bytes) { 4178 struct btrfs_trans_handle *trans; 4179 4180 /* 4181 * if we don't have enough free bytes in this space then we need 4182 * to alloc a new chunk. 4183 */ 4184 if (!data_sinfo->full) { 4185 u64 alloc_target; 4186 4187 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4188 spin_unlock(&data_sinfo->lock); 4189 alloc: 4190 alloc_target = btrfs_get_alloc_profile(root, 1); 4191 /* 4192 * It is ugly that we don't call a nolock join 4193 * transaction for the free space inode case here. 4194 * But it is safe because we only do the data space 4195 * reservation for the free space cache in the 4196 * transaction context: the common join transaction 4197 * just increases the use count of the current 4198 * transaction handle and doesn't try to acquire the 4199 * trans_lock of the fs. 4200 */ 4201 trans = btrfs_join_transaction(root); 4202 if (IS_ERR(trans)) 4203 return PTR_ERR(trans); 4204 4205 ret = do_chunk_alloc(trans, fs_info, alloc_target, 4206 CHUNK_ALLOC_NO_FORCE); 4207 btrfs_end_transaction(trans); 4208 if (ret < 0) { 4209 if (ret != -ENOSPC) 4210 return ret; 4211 else { 4212 have_pinned_space = 1; 4213 goto commit_trans; 4214 } 4215 } 4216 4217 if (!data_sinfo) 4218 data_sinfo = fs_info->data_sinfo; 4219 4220 goto again; 4221 } 4222 4223 /* 4224 * If we don't have enough pinned space to deal with this 4225 * allocation and no chunk was removed in the current 4226 * transaction, don't bother committing the transaction. 4227 */ 4228 have_pinned_space = percpu_counter_compare( 4229 &data_sinfo->total_bytes_pinned, 4230 used + bytes - data_sinfo->total_bytes); 4231 spin_unlock(&data_sinfo->lock); 4232 4233 /* commit the current transaction and try again */ 4234 commit_trans: 4235 if (need_commit && 4236 !atomic_read(&fs_info->open_ioctl_trans)) { 4237 need_commit--; 4238 4239 if (need_commit > 0) { 4240 btrfs_start_delalloc_roots(fs_info, 0, -1); 4241 btrfs_wait_ordered_roots(fs_info, -1, 0, 4242 (u64)-1); 4243 } 4244 4245 trans = btrfs_join_transaction(root); 4246 if (IS_ERR(trans)) 4247 return PTR_ERR(trans); 4248 if (have_pinned_space >= 0 || 4249 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4250 &trans->transaction->flags) || 4251 need_commit > 0) { 4252 ret = btrfs_commit_transaction(trans); 4253 if (ret) 4254 return ret; 4255 /* 4256 * The cleaner kthread might still be doing iput 4257 * operations. Wait for it to finish so that 4258 * more space is released.
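 *
 * (Explanatory note, an assumption based on the locking pattern: the
 * empty lock/unlock pair below acts as a barrier. The cleaner kthread
 * holds cleaner_delayed_iput_mutex while it runs delayed iputs, so
 * acquiring the mutex means its current pass has completed.)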
4259 */ 4260 mutex_lock(&fs_info->cleaner_delayed_iput_mutex); 4261 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); 4262 goto again; 4263 } else { 4264 btrfs_end_transaction(trans); 4265 } 4266 } 4267 4268 trace_btrfs_space_reservation(fs_info, 4269 "space_info:enospc", 4270 data_sinfo->flags, bytes, 1); 4271 return -ENOSPC; 4272 } 4273 data_sinfo->bytes_may_use += bytes; 4274 trace_btrfs_space_reservation(fs_info, "space_info", 4275 data_sinfo->flags, bytes, 1); 4276 spin_unlock(&data_sinfo->lock); 4277 4278 return ret; 4279 } 4280 4281 /* 4282 * New check_data_free_space() with ability for precise data reservation 4283 * Will replace the old btrfs_check_data_free_space(), but to keep the 4284 * patches split, add the new function first and then replace the old one. 4285 */ 4286 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) 4287 { 4288 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4289 int ret; 4290 4291 /* align the range */ 4292 len = round_up(start + len, fs_info->sectorsize) - 4293 round_down(start, fs_info->sectorsize); 4294 start = round_down(start, fs_info->sectorsize); 4295 4296 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); 4297 if (ret < 0) 4298 return ret; 4299 4300 /* Use the new btrfs_qgroup_reserve_data() to reserve precise data space. */ 4301 ret = btrfs_qgroup_reserve_data(inode, start, len); 4302 if (ret) 4303 btrfs_free_reserved_data_space_noquota(inode, start, len); 4304 return ret; 4305 } 4306 4307 /* 4308 * Called if we need to clear a data reservation for this inode 4309 * Normally in an error case. 4310 * 4311 * This one will *NOT* use the accurate qgroup reserved space API, and is 4312 * only for cases where we can't sleep and are sure it won't affect the 4313 * qgroup reserved space. Like clear_bit_hook(). 4314 */ 4315 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4316 u64 len) 4317 { 4318 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4319 struct btrfs_space_info *data_sinfo; 4320 4321 /* Make sure the range is aligned to sectorsize */ 4322 len = round_up(start + len, fs_info->sectorsize) - 4323 round_down(start, fs_info->sectorsize); 4324 start = round_down(start, fs_info->sectorsize); 4325 4326 data_sinfo = fs_info->data_sinfo; 4327 spin_lock(&data_sinfo->lock); 4328 if (WARN_ON(data_sinfo->bytes_may_use < len)) 4329 data_sinfo->bytes_may_use = 0; 4330 else 4331 data_sinfo->bytes_may_use -= len; 4332 trace_btrfs_space_reservation(fs_info, "space_info", 4333 data_sinfo->flags, len, 0); 4334 spin_unlock(&data_sinfo->lock); 4335 } 4336 4337 /* 4338 * Called if we need to clear a data reservation for this inode 4339 * Normally in an error case. 4340 * 4341 * This one will handle the per-inode data rsv map for accurate reserved 4342 * space framework.
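 *
 * Usage sketch (illustrative; the call site and do_the_write() are
 * assumptions, not taken from this file):
 *
 *	ret = btrfs_check_data_free_space(inode, pos, write_bytes);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_the_write(inode, pos, write_bytes);
 *	if (ret < 0)
 *		btrfs_free_reserved_data_space(inode, pos, write_bytes);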
4343 */ 4344 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) 4345 { 4346 struct btrfs_root *root = BTRFS_I(inode)->root; 4347 4348 /* Make sure the range is aligned to sectorsize */ 4349 len = round_up(start + len, root->fs_info->sectorsize) - 4350 round_down(start, root->fs_info->sectorsize); 4351 start = round_down(start, root->fs_info->sectorsize); 4352 4353 btrfs_free_reserved_data_space_noquota(inode, start, len); 4354 btrfs_qgroup_free_data(inode, start, len); 4355 } 4356 4357 static void force_metadata_allocation(struct btrfs_fs_info *info) 4358 { 4359 struct list_head *head = &info->space_info; 4360 struct btrfs_space_info *found; 4361 4362 rcu_read_lock(); 4363 list_for_each_entry_rcu(found, head, list) { 4364 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4365 found->force_alloc = CHUNK_ALLOC_FORCE; 4366 } 4367 rcu_read_unlock(); 4368 } 4369 4370 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4371 { 4372 return (global->size << 1); 4373 } 4374 4375 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4376 struct btrfs_space_info *sinfo, int force) 4377 { 4378 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4379 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 4380 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 4381 u64 thresh; 4382 4383 if (force == CHUNK_ALLOC_FORCE) 4384 return 1; 4385 4386 /* 4387 * We need to take into account the global rsv because for all intents 4388 * and purposes it's used space. Don't worry about locking the 4389 * global_rsv, it doesn't change except when the transaction commits. 4390 */ 4391 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 4392 num_allocated += calc_global_rsv_need_space(global_rsv); 4393 4394 /* 4395 * in limited mode, we want to have some free space up to 4396 * about 1% of the FS size. 4397 */ 4398 if (force == CHUNK_ALLOC_LIMITED) { 4399 thresh = btrfs_super_total_bytes(fs_info->super_copy); 4400 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4401 4402 if (num_bytes - num_allocated < thresh) 4403 return 1; 4404 } 4405 4406 if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) 4407 return 0; 4408 return 1; 4409 } 4410 4411 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4412 { 4413 u64 num_dev; 4414 4415 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4416 BTRFS_BLOCK_GROUP_RAID0 | 4417 BTRFS_BLOCK_GROUP_RAID5 | 4418 BTRFS_BLOCK_GROUP_RAID6)) 4419 num_dev = fs_info->fs_devices->rw_devices; 4420 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4421 num_dev = 2; 4422 else 4423 num_dev = 1; /* DUP or single */ 4424 4425 return num_dev; 4426 } 4427 4428 /* 4429 * Reserve space in the SYSTEM space_info for the metadata updates (device 4430 * items and the chunk item) needed when allocating or removing a chunk of 4431 * the given @type. 4432 */ 4433 void check_system_chunk(struct btrfs_trans_handle *trans, 4434 struct btrfs_fs_info *fs_info, u64 type) 4435 { 4436 struct btrfs_space_info *info; 4437 u64 left; 4438 u64 thresh; 4439 int ret = 0; 4440 u64 num_devs; 4441 4442 /* 4443 * Needed because we can end up allocating a system chunk, and the 4444 * space reservation in the chunk block reserve must be atomic and race free.
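 *
 * Worked example (illustrative): with a RAID1 system profile,
 * get_profile_num_devs() returns 2, so the threshold computed below is
 *
 *	thresh = btrfs_calc_trunc_metadata_size(fs_info, 2) +
 *		 btrfs_calc_trans_metadata_size(fs_info, 1);
 *
 * i.e. room for two device item updates plus one chunk item.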
4445 */ 4446 ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); 4447 4448 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4449 spin_lock(&info->lock); 4450 left = info->total_bytes - btrfs_space_info_used(info, true); 4451 spin_unlock(&info->lock); 4452 4453 num_devs = get_profile_num_devs(fs_info, type); 4454 4455 /* num_devs device items to update and 1 chunk item to add or remove */ 4456 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 4457 btrfs_calc_trans_metadata_size(fs_info, 1); 4458 4459 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4460 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4461 left, thresh, type); 4462 dump_space_info(fs_info, info, 0, 0); 4463 } 4464 4465 if (left < thresh) { 4466 u64 flags; 4467 4468 flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4469 /* 4470 * Ignore failure to create system chunk. We might end up not 4471 * needing it, as we might not need to COW all nodes/leafs from 4472 * the paths we visit in the chunk tree (they were already COWed 4473 * or created in the current transaction for example). 4474 */ 4475 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4476 } 4477 4478 if (!ret) { 4479 ret = btrfs_block_rsv_add(fs_info->chunk_root, 4480 &fs_info->chunk_block_rsv, 4481 thresh, BTRFS_RESERVE_NO_FLUSH); 4482 if (!ret) 4483 trans->chunk_bytes_reserved += thresh; 4484 } 4485 } 4486 4487 /* 4488 * If force is CHUNK_ALLOC_FORCE: 4489 * - return 1 if it successfully allocates a chunk, 4490 * - return errors including -ENOSPC otherwise. 4491 * If force is NOT CHUNK_ALLOC_FORCE: 4492 * - return 0 if it doesn't need to allocate a new chunk, 4493 * - return 1 if it successfully allocates a chunk, 4494 * - return errors including -ENOSPC otherwise. 4495 */ 4496 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 4497 struct btrfs_fs_info *fs_info, u64 flags, int force) 4498 { 4499 struct btrfs_space_info *space_info; 4500 int wait_for_alloc = 0; 4501 int ret = 0; 4502 4503 /* Don't re-enter if we're already allocating a chunk */ 4504 if (trans->allocating_chunk) 4505 return -ENOSPC; 4506 4507 space_info = __find_space_info(fs_info, flags); 4508 if (!space_info) { 4509 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 4510 BUG_ON(ret); /* -ENOMEM */ 4511 } 4512 BUG_ON(!space_info); /* Logic error */ 4513 4514 again: 4515 spin_lock(&space_info->lock); 4516 if (force < space_info->force_alloc) 4517 force = space_info->force_alloc; 4518 if (space_info->full) { 4519 if (should_alloc_chunk(fs_info, space_info, force)) 4520 ret = -ENOSPC; 4521 else 4522 ret = 0; 4523 spin_unlock(&space_info->lock); 4524 return ret; 4525 } 4526 4527 if (!should_alloc_chunk(fs_info, space_info, force)) { 4528 spin_unlock(&space_info->lock); 4529 return 0; 4530 } else if (space_info->chunk_alloc) { 4531 wait_for_alloc = 1; 4532 } else { 4533 space_info->chunk_alloc = 1; 4534 } 4535 4536 spin_unlock(&space_info->lock); 4537 4538 mutex_lock(&fs_info->chunk_mutex); 4539 4540 /* 4541 * The chunk_mutex is held throughout the entirety of a chunk 4542 * allocation, so once we've acquired the chunk_mutex we know that the 4543 * other guy is done and we need to recheck and see if we should 4544 * allocate. 4545 */ 4546 if (wait_for_alloc) { 4547 mutex_unlock(&fs_info->chunk_mutex); 4548 wait_for_alloc = 0; 4549 goto again; 4550 } 4551 4552 trans->allocating_chunk = true; 4553 4554 /* 4555 * If we have mixed data/metadata chunks we want to make sure we keep 4556 * allocating mixed chunks instead of individual chunks. 
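 *
 * (Illustrative: on a filesystem created with mixed block groups, e.g.
 * mkfs.btrfs -M, a request for BTRFS_BLOCK_GROUP_DATA is widened below
 * to DATA | METADATA so both kinds of allocations keep sharing the same
 * block groups.)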
4557 */ 4558 if (btrfs_mixed_space_info(space_info)) 4559 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4560 4561 /* 4562 * if we're doing a data chunk, go ahead and make sure that 4563 * we keep a reasonable number of metadata chunks allocated in the 4564 * FS as well. 4565 */ 4566 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4567 fs_info->data_chunk_allocations++; 4568 if (!(fs_info->data_chunk_allocations % 4569 fs_info->metadata_ratio)) 4570 force_metadata_allocation(fs_info); 4571 } 4572 4573 /* 4574 * Check if we have enough space in SYSTEM chunk because we may need 4575 * to update devices. 4576 */ 4577 check_system_chunk(trans, fs_info, flags); 4578 4579 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4580 trans->allocating_chunk = false; 4581 4582 spin_lock(&space_info->lock); 4583 if (ret < 0 && ret != -ENOSPC) 4584 goto out; 4585 if (ret) 4586 space_info->full = 1; 4587 else 4588 ret = 1; 4589 4590 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4591 out: 4592 space_info->chunk_alloc = 0; 4593 spin_unlock(&space_info->lock); 4594 mutex_unlock(&fs_info->chunk_mutex); 4595 /* 4596 * When we allocate a new chunk we reserve space in the chunk block 4597 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4598 * add new nodes/leafs to it if we end up needing to do it when 4599 * inserting the chunk item and updating device items as part of the 4600 * second phase of chunk allocation, performed by 4601 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4602 * large number of new block groups to create in our transaction 4603 * handle's new_bgs list to avoid exhausting the chunk block reserve 4604 * in extreme cases - like having a single transaction create many new 4605 * block groups when starting to write out the free space caches of all 4606 * the block groups that were made dirty during the lifetime of the 4607 * transaction. 4608 */ 4609 if (trans->can_flush_pending_bgs && 4610 trans->chunk_bytes_reserved >= (u64)SZ_2M) { 4611 btrfs_create_pending_block_groups(trans, fs_info); 4612 btrfs_trans_release_chunk_metadata(trans); 4613 } 4614 return ret; 4615 } 4616 4617 static int can_overcommit(struct btrfs_root *root, 4618 struct btrfs_space_info *space_info, u64 bytes, 4619 enum btrfs_reserve_flush_enum flush) 4620 { 4621 struct btrfs_fs_info *fs_info = root->fs_info; 4622 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4623 u64 profile; 4624 u64 space_size; 4625 u64 avail; 4626 u64 used; 4627 4628 /* Don't overcommit when in mixed mode. */ 4629 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4630 return 0; 4631 4632 profile = btrfs_get_alloc_profile(root, 0); 4633 used = btrfs_space_info_used(space_info, false); 4634 4635 /* 4636 * We only want to allow over committing if we have lots of actual space 4637 * free, but if we don't have enough space to handle the global reserve 4638 * space then we could end up having a real enospc problem when trying 4639 * to allocate a chunk or some other such important allocation. 
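 *
 * Worked example (numbers are illustrative): with 10GiB of free chunk
 * space and a RAID1 metadata profile, avail is halved to 5GiB below;
 * a BTRFS_RESERVE_FLUSH_ALL caller may then overcommit by avail >> 3
 * (~640MiB), while any other caller may use avail >> 1 (~2.5GiB).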
4640 */ 4641 spin_lock(&global_rsv->lock); 4642 space_size = calc_global_rsv_need_space(global_rsv); 4643 spin_unlock(&global_rsv->lock); 4644 if (used + space_size >= space_info->total_bytes) 4645 return 0; 4646 4647 used += space_info->bytes_may_use; 4648 4649 spin_lock(&fs_info->free_chunk_lock); 4650 avail = fs_info->free_chunk_space; 4651 spin_unlock(&fs_info->free_chunk_lock); 4652 4653 /* 4654 * If we have dup, raid1 or raid10 then only half of the free 4655 * space is actually usable. For raid56, the space info used 4656 * doesn't include the parity drive, so we don't have to 4657 * change the math 4658 */ 4659 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4660 BTRFS_BLOCK_GROUP_RAID1 | 4661 BTRFS_BLOCK_GROUP_RAID10)) 4662 avail >>= 1; 4663 4664 /* 4665 * If we aren't flushing all things, let us overcommit up to 4666 * half of the space. If we can flush, don't let us overcommit 4667 * too much; let it overcommit up to 1/8 of the space. 4668 */ 4669 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4670 avail >>= 3; 4671 else 4672 avail >>= 1; 4673 4674 if (used + bytes < space_info->total_bytes + avail) 4675 return 1; 4676 return 0; 4677 } 4678 4679 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4680 unsigned long nr_pages, int nr_items) 4681 { 4682 struct super_block *sb = fs_info->sb; 4683 4684 if (down_read_trylock(&sb->s_umount)) { 4685 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4686 up_read(&sb->s_umount); 4687 } else { 4688 /* 4689 * We needn't worry about the filesystem going from r/w to r/o 4690 * even though we don't acquire the ->s_umount mutex, because 4691 * the filesystem should guarantee that the delalloc inode list 4692 * is empty once it is read-only (all dirty pages have been 4693 * written to disk). 4694 */ 4695 btrfs_start_delalloc_roots(fs_info, 0, nr_items); 4696 if (!current->journal_info) 4697 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4698 } 4699 } 4700 4701 static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4702 u64 to_reclaim) 4703 { 4704 u64 bytes; 4705 int nr; 4706 4707 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4708 nr = (int)div64_u64(to_reclaim, bytes); 4709 if (!nr) 4710 nr = 1; 4711 return nr; 4712 } 4713 4714 #define EXTENT_SIZE_PER_ITEM SZ_256K 4715 4716 /* 4717 * shrink metadata reservation for delalloc 4718 */ 4719 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4720 bool wait_ordered) 4721 { 4722 struct btrfs_fs_info *fs_info = root->fs_info; 4723 struct btrfs_block_rsv *block_rsv; 4724 struct btrfs_space_info *space_info; 4725 struct btrfs_trans_handle *trans; 4726 u64 delalloc_bytes; 4727 u64 max_reclaim; 4728 long time_left; 4729 unsigned long nr_pages; 4730 int loops; 4731 int items; 4732 enum btrfs_reserve_flush_enum flush; 4733 4734 /* Calc the number of items we need to flush for the space reservation */ 4735 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4736 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; 4737 4738 trans = (struct btrfs_trans_handle *)current->journal_info; 4739 block_rsv = &fs_info->delalloc_block_rsv; 4740 space_info = block_rsv->space_info; 4741 4742 delalloc_bytes = percpu_counter_sum_positive( 4743 &fs_info->delalloc_bytes); 4744 if (delalloc_bytes == 0) { 4745 if (trans) 4746 return; 4747 if (wait_ordered) 4748 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4749 return; 4750 } 4751 4752 loops = 0; 4753 while (delalloc_bytes && loops < 3) { 4754 max_reclaim = min(delalloc_bytes, to_reclaim); 4755 nr_pages =
max_reclaim >> PAGE_SHIFT; 4756 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4757 /* 4758 * We need to wait for the async pages to actually start before 4759 * we do anything. 4760 */ 4761 max_reclaim = atomic_read(&fs_info->async_delalloc_pages); 4762 if (!max_reclaim) 4763 goto skip_async; 4764 4765 if (max_reclaim <= nr_pages) 4766 max_reclaim = 0; 4767 else 4768 max_reclaim -= nr_pages; 4769 4770 wait_event(fs_info->async_submit_wait, 4771 atomic_read(&fs_info->async_delalloc_pages) <= 4772 (int)max_reclaim); 4773 skip_async: 4774 if (!trans) 4775 flush = BTRFS_RESERVE_FLUSH_ALL; 4776 else 4777 flush = BTRFS_RESERVE_NO_FLUSH; 4778 spin_lock(&space_info->lock); 4779 if (can_overcommit(root, space_info, orig, flush)) { 4780 spin_unlock(&space_info->lock); 4781 break; 4782 } 4783 if (list_empty(&space_info->tickets) && 4784 list_empty(&space_info->priority_tickets)) { 4785 spin_unlock(&space_info->lock); 4786 break; 4787 } 4788 spin_unlock(&space_info->lock); 4789 4790 loops++; 4791 if (wait_ordered && !trans) { 4792 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4793 } else { 4794 time_left = schedule_timeout_killable(1); 4795 if (time_left) 4796 break; 4797 } 4798 delalloc_bytes = percpu_counter_sum_positive( 4799 &fs_info->delalloc_bytes); 4800 } 4801 } 4802 4803 /** 4804 * may_commit_transaction - possibly commit the transaction if it's ok to 4805 * @fs_info - the fs_info for the filesystem we're allocating in
 * @space_info - the space_info we're trying to reserve from 4806 * @bytes - the number of bytes we want to reserve 4807 * @force - force the commit 4808 * 4809 * This will check to make sure that committing the transaction will actually 4810 * get us somewhere and then commit the transaction if it does. Otherwise it 4811 * will return -ENOSPC. 4812 */ 4813 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4814 struct btrfs_space_info *space_info, 4815 u64 bytes, int force) 4816 { 4817 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4818 struct btrfs_trans_handle *trans; 4819 4820 trans = (struct btrfs_trans_handle *)current->journal_info; 4821 if (trans) 4822 return -EAGAIN; 4823 4824 if (force) 4825 goto commit; 4826 4827 /* See if there is enough pinned space to make this reservation */ 4828 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4829 bytes) >= 0) 4830 goto commit; 4831 4832 /* 4833 * See if there is some space in the delayed insertion reservation for 4834 * this reservation.
4835 */ 4836 if (space_info != delayed_rsv->space_info) 4837 return -ENOSPC; 4838 4839 spin_lock(&delayed_rsv->lock); 4840 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4841 bytes - delayed_rsv->size) >= 0) { 4842 spin_unlock(&delayed_rsv->lock); 4843 return -ENOSPC; 4844 } 4845 spin_unlock(&delayed_rsv->lock); 4846 4847 commit: 4848 trans = btrfs_join_transaction(fs_info->extent_root); 4849 if (IS_ERR(trans)) 4850 return -ENOSPC; 4851 4852 return btrfs_commit_transaction(trans); 4853 } 4854 4855 struct reserve_ticket { 4856 u64 bytes; 4857 int error; 4858 struct list_head list; 4859 wait_queue_head_t wait; 4860 }; 4861 4862 static int flush_space(struct btrfs_fs_info *fs_info, 4863 struct btrfs_space_info *space_info, u64 num_bytes, 4864 u64 orig_bytes, int state) 4865 { 4866 struct btrfs_root *root = fs_info->extent_root; 4867 struct btrfs_trans_handle *trans; 4868 int nr; 4869 int ret = 0; 4870 4871 switch (state) { 4872 case FLUSH_DELAYED_ITEMS_NR: 4873 case FLUSH_DELAYED_ITEMS: 4874 if (state == FLUSH_DELAYED_ITEMS_NR) 4875 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 4876 else 4877 nr = -1; 4878 4879 trans = btrfs_join_transaction(root); 4880 if (IS_ERR(trans)) { 4881 ret = PTR_ERR(trans); 4882 break; 4883 } 4884 ret = btrfs_run_delayed_items_nr(trans, fs_info, nr); 4885 btrfs_end_transaction(trans); 4886 break; 4887 case FLUSH_DELALLOC: 4888 case FLUSH_DELALLOC_WAIT: 4889 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4890 state == FLUSH_DELALLOC_WAIT); 4891 break; 4892 case ALLOC_CHUNK: 4893 trans = btrfs_join_transaction(root); 4894 if (IS_ERR(trans)) { 4895 ret = PTR_ERR(trans); 4896 break; 4897 } 4898 ret = do_chunk_alloc(trans, fs_info, 4899 btrfs_get_alloc_profile(root, 0), 4900 CHUNK_ALLOC_NO_FORCE); 4901 btrfs_end_transaction(trans); 4902 if (ret > 0 || ret == -ENOSPC) 4903 ret = 0; 4904 break; 4905 case COMMIT_TRANS: 4906 ret = may_commit_transaction(fs_info, space_info, 4907 orig_bytes, 0); 4908 break; 4909 default: 4910 ret = -ENOSPC; 4911 break; 4912 } 4913 4914 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, 4915 orig_bytes, state, ret); 4916 return ret; 4917 } 4918 4919 static inline u64 4920 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4921 struct btrfs_space_info *space_info) 4922 { 4923 struct reserve_ticket *ticket; 4924 u64 used; 4925 u64 expected; 4926 u64 to_reclaim = 0; 4927 4928 list_for_each_entry(ticket, &space_info->tickets, list) 4929 to_reclaim += ticket->bytes; 4930 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4931 to_reclaim += ticket->bytes; 4932 if (to_reclaim) 4933 return to_reclaim; 4934 4935 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4936 if (can_overcommit(root, space_info, to_reclaim, 4937 BTRFS_RESERVE_FLUSH_ALL)) 4938 return 0; 4939 4940 used = space_info->bytes_used + space_info->bytes_reserved + 4941 space_info->bytes_pinned + space_info->bytes_readonly + 4942 space_info->bytes_may_use; 4943 if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 4944 expected = div_factor_fine(space_info->total_bytes, 95); 4945 else 4946 expected = div_factor_fine(space_info->total_bytes, 90); 4947 4948 if (used > expected) 4949 to_reclaim = used - expected; 4950 else 4951 to_reclaim = 0; 4952 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4953 space_info->bytes_reserved); 4954 return to_reclaim; 4955 } 4956 4957 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4958 struct btrfs_root *root, u64 used) 4959 { 4960 struct 
btrfs_fs_info *fs_info = root->fs_info; 4961 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4962 4963 /* If we're just plain full then async reclaim just slows us down. */ 4964 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4965 return 0; 4966 4967 if (!btrfs_calc_reclaim_metadata_size(root, space_info)) 4968 return 0; 4969 4970 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4971 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4972 } 4973 4974 static void wake_all_tickets(struct list_head *head) 4975 { 4976 struct reserve_ticket *ticket; 4977 4978 while (!list_empty(head)) { 4979 ticket = list_first_entry(head, struct reserve_ticket, list); 4980 list_del_init(&ticket->list); 4981 ticket->error = -ENOSPC; 4982 wake_up(&ticket->wait); 4983 } 4984 } 4985 4986 /* 4987 * This is for normal flushers, we can wait all goddamned day if we want to. We 4988 * will loop and continuously try to flush as long as we are making progress. 4989 * We count progress as clearing off tickets each time we have to loop. 4990 */ 4991 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4992 { 4993 struct btrfs_fs_info *fs_info; 4994 struct btrfs_space_info *space_info; 4995 u64 to_reclaim; 4996 int flush_state; 4997 int commit_cycles = 0; 4998 u64 last_tickets_id; 4999 5000 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 5001 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5002 5003 spin_lock(&space_info->lock); 5004 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5005 space_info); 5006 if (!to_reclaim) { 5007 space_info->flush = 0; 5008 spin_unlock(&space_info->lock); 5009 return; 5010 } 5011 last_tickets_id = space_info->tickets_id; 5012 spin_unlock(&space_info->lock); 5013 5014 flush_state = FLUSH_DELAYED_ITEMS_NR; 5015 do { 5016 struct reserve_ticket *ticket; 5017 int ret; 5018 5019 ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5020 flush_state); 5021 spin_lock(&space_info->lock); 5022 if (list_empty(&space_info->tickets)) { 5023 space_info->flush = 0; 5024 spin_unlock(&space_info->lock); 5025 return; 5026 } 5027 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5028 space_info); 5029 ticket = list_first_entry(&space_info->tickets, 5030 struct reserve_ticket, list); 5031 if (last_tickets_id == space_info->tickets_id) { 5032 flush_state++; 5033 } else { 5034 last_tickets_id = space_info->tickets_id; 5035 flush_state = FLUSH_DELAYED_ITEMS_NR; 5036 if (commit_cycles) 5037 commit_cycles--; 5038 } 5039 5040 if (flush_state > COMMIT_TRANS) { 5041 commit_cycles++; 5042 if (commit_cycles > 2) { 5043 wake_all_tickets(&space_info->tickets); 5044 space_info->flush = 0; 5045 } else { 5046 flush_state = FLUSH_DELAYED_ITEMS_NR; 5047 } 5048 } 5049 spin_unlock(&space_info->lock); 5050 } while (flush_state <= COMMIT_TRANS); 5051 } 5052 5053 void btrfs_init_async_reclaim_work(struct work_struct *work) 5054 { 5055 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5056 } 5057 5058 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5059 struct btrfs_space_info *space_info, 5060 struct reserve_ticket *ticket) 5061 { 5062 u64 to_reclaim; 5063 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5064 5065 spin_lock(&space_info->lock); 5066 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->extent_root, 5067 space_info); 5068 if (!to_reclaim) { 5069 spin_unlock(&space_info->lock); 5070 return; 5071 } 5072 spin_unlock(&space_info->lock); 5073 5074 do { 
5075 flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5076 flush_state); 5077 flush_state++; 5078 spin_lock(&space_info->lock); 5079 if (ticket->bytes == 0) { 5080 spin_unlock(&space_info->lock); 5081 return; 5082 } 5083 spin_unlock(&space_info->lock); 5084 5085 /* 5086 * Priority flushers can't wait on delalloc without 5087 * deadlocking. 5088 */ 5089 if (flush_state == FLUSH_DELALLOC || 5090 flush_state == FLUSH_DELALLOC_WAIT) 5091 flush_state = ALLOC_CHUNK; 5092 } while (flush_state < COMMIT_TRANS); 5093 } 5094 5095 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5096 struct btrfs_space_info *space_info, 5097 struct reserve_ticket *ticket, u64 orig_bytes) 5098 { 5100 DEFINE_WAIT(wait); 5101 int ret = 0; 5102 5103 spin_lock(&space_info->lock); 5104 while (ticket->bytes > 0 && ticket->error == 0) { 5105 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5106 if (ret) { 5107 ret = -EINTR; 5108 break; 5109 } 5110 spin_unlock(&space_info->lock); 5111 5112 schedule(); 5113 5114 finish_wait(&ticket->wait, &wait); 5115 spin_lock(&space_info->lock); 5116 } 5117 if (!ret) 5118 ret = ticket->error; 5119 if (!list_empty(&ticket->list)) 5120 list_del_init(&ticket->list); 5121 if (ticket->bytes && ticket->bytes < orig_bytes) { 5122 u64 num_bytes = orig_bytes - ticket->bytes; 5123 space_info->bytes_may_use -= num_bytes; 5124 trace_btrfs_space_reservation(fs_info, "space_info", 5125 space_info->flags, num_bytes, 0); 5126 } 5127 spin_unlock(&space_info->lock); 5128 5129 return ret; 5130 } 5131 5132 /** 5133 * __reserve_metadata_bytes - try to reserve bytes from a space_info 5134 * @root - the root we're allocating for 5135 * @space_info - the space info we want to allocate from 5136 * @orig_bytes - the number of bytes we want 5137 * @flush - whether or not we can flush to make our reservation 5138 * 5139 * This will reserve orig_bytes number of bytes from the given space info. 5140 * If there is not enough space it will make an attempt to 5141 * flush out space to make room. It will do this by flushing delalloc if 5142 * possible or committing the transaction. If flush is 0 then no attempts to 5143 * regain reservations will be made and this will fail if there is not enough 5144 * space already. 5145 */ 5146 static int __reserve_metadata_bytes(struct btrfs_root *root, 5147 struct btrfs_space_info *space_info, 5148 u64 orig_bytes, 5149 enum btrfs_reserve_flush_enum flush) 5150 { 5151 struct btrfs_fs_info *fs_info = root->fs_info; 5152 struct reserve_ticket ticket; 5153 u64 used; 5154 int ret = 0; 5155 5156 ASSERT(orig_bytes); 5157 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5158 5159 spin_lock(&space_info->lock); 5160 ret = -ENOSPC; 5161 used = btrfs_space_info_used(space_info, true); 5162 5163 /* 5164 * If we have enough space then hooray, make our reservation and carry 5165 * on. If not see if we can overcommit, and if we can, hooray carry on. 5166 * If not things get more complicated.
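 *
 * Flow sketch (descriptive, based on the code below): a FLUSH_ALL caller
 * that cannot reserve adds a reserve_ticket to space_info->tickets and
 * sleeps in wait_reserve_ticket(); the async reclaim worker then flushes
 * space, and space_info_add_old_bytes()/space_info_add_new_bytes() hand
 * freed bytes to the oldest tickets, waking each one once ticket->bytes
 * drops to zero.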
5167 */ 5168 if (used + orig_bytes <= space_info->total_bytes) { 5169 space_info->bytes_may_use += orig_bytes; 5170 trace_btrfs_space_reservation(fs_info, "space_info", 5171 space_info->flags, orig_bytes, 1); 5172 ret = 0; 5173 } else if (can_overcommit(root, space_info, orig_bytes, flush)) { 5174 space_info->bytes_may_use += orig_bytes; 5175 trace_btrfs_space_reservation(fs_info, "space_info", 5176 space_info->flags, orig_bytes, 1); 5177 ret = 0; 5178 } 5179 5180 /* 5181 * If we couldn't make a reservation then setup our reservation ticket 5182 * and kick the async worker if it's not already running. 5183 * 5184 * If we are a priority flusher then we just need to add our ticket to 5185 * the list and we will do our own flushing further down. 5186 */ 5187 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5188 ticket.bytes = orig_bytes; 5189 ticket.error = 0; 5190 init_waitqueue_head(&ticket.wait); 5191 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5192 list_add_tail(&ticket.list, &space_info->tickets); 5193 if (!space_info->flush) { 5194 space_info->flush = 1; 5195 trace_btrfs_trigger_flush(fs_info, 5196 space_info->flags, 5197 orig_bytes, flush, 5198 "enospc"); 5199 queue_work(system_unbound_wq, 5200 &root->fs_info->async_reclaim_work); 5201 } 5202 } else { 5203 list_add_tail(&ticket.list, 5204 &space_info->priority_tickets); 5205 } 5206 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5207 used += orig_bytes; 5208 /* 5209 * We will do the space reservation dance during log replay, 5210 * which means we won't have fs_info->fs_root set, so don't do 5211 * the async reclaim as we will panic. 5212 */ 5213 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5214 need_do_async_reclaim(space_info, root, used) && 5215 !work_busy(&fs_info->async_reclaim_work)) { 5216 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5217 orig_bytes, flush, "preempt"); 5218 queue_work(system_unbound_wq, 5219 &fs_info->async_reclaim_work); 5220 } 5221 } 5222 spin_unlock(&space_info->lock); 5223 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5224 return ret; 5225 5226 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5227 return wait_reserve_ticket(fs_info, space_info, &ticket, 5228 orig_bytes); 5229 5230 ret = 0; 5231 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5232 spin_lock(&space_info->lock); 5233 if (ticket.bytes) { 5234 if (ticket.bytes < orig_bytes) { 5235 u64 num_bytes = orig_bytes - ticket.bytes; 5236 space_info->bytes_may_use -= num_bytes; 5237 trace_btrfs_space_reservation(fs_info, "space_info", 5238 space_info->flags, 5239 num_bytes, 0); 5240 5241 } 5242 list_del_init(&ticket.list); 5243 ret = -ENOSPC; 5244 } 5245 spin_unlock(&space_info->lock); 5246 ASSERT(list_empty(&ticket.list)); 5247 return ret; 5248 } 5249 5250 /** 5251 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5252 * @root - the root we're allocating for 5253 * @block_rsv - the block_rsv we're allocating for 5254 * @orig_bytes - the number of bytes we want 5255 * @flush - whether or not we can flush to make our reservation 5256 * 5257 * This will reserve orig_bytes number of bytes from the space info associated 5258 * with the block_rsv. If there is not enough space it will make an attempt to 5259 * flush out space to make room. It will do this by flushing delalloc if 5260 * possible or committing the transaction. If flush is 0 then no attempts to 5261 * regain reservations will be made and this will fail if there is not enough 5262 * space already.
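 *
 * Usage sketch (illustrative, not a real call site):
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, SZ_1M,
 *				     BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret)
 *		block_rsv_add_bytes(block_rsv, SZ_1M, 1);
 *
 * which is essentially what btrfs_block_rsv_add() further below does.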
5263 */ 5264 static int reserve_metadata_bytes(struct btrfs_root *root, 5265 struct btrfs_block_rsv *block_rsv, 5266 u64 orig_bytes, 5267 enum btrfs_reserve_flush_enum flush) 5268 { 5269 struct btrfs_fs_info *fs_info = root->fs_info; 5270 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5271 int ret; 5272 5273 ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, 5274 flush); 5275 if (ret == -ENOSPC && 5276 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5277 if (block_rsv != global_rsv && 5278 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5279 ret = 0; 5280 } 5281 if (ret == -ENOSPC) 5282 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 5283 block_rsv->space_info->flags, 5284 orig_bytes, 1); 5285 return ret; 5286 } 5287 5288 static struct btrfs_block_rsv *get_block_rsv( 5289 const struct btrfs_trans_handle *trans, 5290 const struct btrfs_root *root) 5291 { 5292 struct btrfs_fs_info *fs_info = root->fs_info; 5293 struct btrfs_block_rsv *block_rsv = NULL; 5294 5295 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5296 (root == fs_info->csum_root && trans->adding_csums) || 5297 (root == fs_info->uuid_root)) 5298 block_rsv = trans->block_rsv; 5299 5300 if (!block_rsv) 5301 block_rsv = root->block_rsv; 5302 5303 if (!block_rsv) 5304 block_rsv = &fs_info->empty_block_rsv; 5305 5306 return block_rsv; 5307 } 5308 5309 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5310 u64 num_bytes) 5311 { 5312 int ret = -ENOSPC; 5313 spin_lock(&block_rsv->lock); 5314 if (block_rsv->reserved >= num_bytes) { 5315 block_rsv->reserved -= num_bytes; 5316 if (block_rsv->reserved < block_rsv->size) 5317 block_rsv->full = 0; 5318 ret = 0; 5319 } 5320 spin_unlock(&block_rsv->lock); 5321 return ret; 5322 } 5323 5324 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5325 u64 num_bytes, int update_size) 5326 { 5327 spin_lock(&block_rsv->lock); 5328 block_rsv->reserved += num_bytes; 5329 if (update_size) 5330 block_rsv->size += num_bytes; 5331 else if (block_rsv->reserved >= block_rsv->size) 5332 block_rsv->full = 1; 5333 spin_unlock(&block_rsv->lock); 5334 } 5335 5336 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5337 struct btrfs_block_rsv *dest, u64 num_bytes, 5338 int min_factor) 5339 { 5340 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5341 u64 min_bytes; 5342 5343 if (global_rsv->space_info != dest->space_info) 5344 return -ENOSPC; 5345 5346 spin_lock(&global_rsv->lock); 5347 min_bytes = div_factor(global_rsv->size, min_factor); 5348 if (global_rsv->reserved < min_bytes + num_bytes) { 5349 spin_unlock(&global_rsv->lock); 5350 return -ENOSPC; 5351 } 5352 global_rsv->reserved -= num_bytes; 5353 if (global_rsv->reserved < global_rsv->size) 5354 global_rsv->full = 0; 5355 spin_unlock(&global_rsv->lock); 5356 5357 block_rsv_add_bytes(dest, num_bytes, 1); 5358 return 0; 5359 } 5360 5361 /* 5362 * This is for space we already have accounted in space_info->bytes_may_use, so 5363 * basically when we're returning space from block_rsv's. 
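 *
 * Example (illustrative): releasing 512KiB of excess from a block rsv via
 * block_rsv_release_bytes() routes the bytes here; they are first handed
 * to waiting tickets (that space must stay accounted in bytes_may_use for
 * them), and only the remainder is subtracted from bytes_may_use.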
5364 */ 5365 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5366 struct btrfs_space_info *space_info, 5367 u64 num_bytes) 5368 { 5369 struct reserve_ticket *ticket; 5370 struct list_head *head; 5371 u64 used; 5372 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5373 bool check_overcommit = false; 5374 5375 spin_lock(&space_info->lock); 5376 head = &space_info->priority_tickets; 5377 5378 /* 5379 * If we are over our limit then we need to check and see if we can 5380 * overcommit, and if we can't then we just need to free up our space 5381 * and not satisfy any requests. 5382 */ 5383 used = space_info->bytes_used + space_info->bytes_reserved + 5384 space_info->bytes_pinned + space_info->bytes_readonly + 5385 space_info->bytes_may_use; 5386 if (used - num_bytes >= space_info->total_bytes) 5387 check_overcommit = true; 5388 again: 5389 while (!list_empty(head) && num_bytes) { 5390 ticket = list_first_entry(head, struct reserve_ticket, 5391 list); 5392 /* 5393 * We use 0 bytes because this space is already reserved, so 5394 * adding the ticket space would be a double count. 5395 */ 5396 if (check_overcommit && 5397 !can_overcommit(fs_info->extent_root, space_info, 0, 5398 flush)) 5399 break; 5400 if (num_bytes >= ticket->bytes) { 5401 list_del_init(&ticket->list); 5402 num_bytes -= ticket->bytes; 5403 ticket->bytes = 0; 5404 space_info->tickets_id++; 5405 wake_up(&ticket->wait); 5406 } else { 5407 ticket->bytes -= num_bytes; 5408 num_bytes = 0; 5409 } 5410 } 5411 5412 if (num_bytes && head == &space_info->priority_tickets) { 5413 head = &space_info->tickets; 5414 flush = BTRFS_RESERVE_FLUSH_ALL; 5415 goto again; 5416 } 5417 space_info->bytes_may_use -= num_bytes; 5418 trace_btrfs_space_reservation(fs_info, "space_info", 5419 space_info->flags, num_bytes, 0); 5420 spin_unlock(&space_info->lock); 5421 } 5422 5423 /* 5424 * This is for newly allocated space that isn't accounted in 5425 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5426 * we use this helper. 
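 *
 * Example (illustrative): unpinning a 1MiB extent calls this with
 * num_bytes == 1MiB; a priority ticket waiting on 256KiB is satisfied
 * and woken first, the remaining 768KiB flows to the next tickets, and
 * bytes_may_use grows by whatever the tickets consume.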
5427 */ 5428 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5429 struct btrfs_space_info *space_info, 5430 u64 num_bytes) 5431 { 5432 struct reserve_ticket *ticket; 5433 struct list_head *head = &space_info->priority_tickets; 5434 5435 again: 5436 while (!list_empty(head) && num_bytes) { 5437 ticket = list_first_entry(head, struct reserve_ticket, 5438 list); 5439 if (num_bytes >= ticket->bytes) { 5440 trace_btrfs_space_reservation(fs_info, "space_info", 5441 space_info->flags, 5442 ticket->bytes, 1); 5443 list_del_init(&ticket->list); 5444 num_bytes -= ticket->bytes; 5445 space_info->bytes_may_use += ticket->bytes; 5446 ticket->bytes = 0; 5447 space_info->tickets_id++; 5448 wake_up(&ticket->wait); 5449 } else { 5450 trace_btrfs_space_reservation(fs_info, "space_info", 5451 space_info->flags, 5452 num_bytes, 1); 5453 space_info->bytes_may_use += num_bytes; 5454 ticket->bytes -= num_bytes; 5455 num_bytes = 0; 5456 } 5457 } 5458 5459 if (num_bytes && head == &space_info->priority_tickets) { 5460 head = &space_info->tickets; 5461 goto again; 5462 } 5463 } 5464 5465 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5466 struct btrfs_block_rsv *block_rsv, 5467 struct btrfs_block_rsv *dest, u64 num_bytes) 5468 { 5469 struct btrfs_space_info *space_info = block_rsv->space_info; 5470 5471 spin_lock(&block_rsv->lock); 5472 if (num_bytes == (u64)-1) 5473 num_bytes = block_rsv->size; 5474 block_rsv->size -= num_bytes; 5475 if (block_rsv->reserved >= block_rsv->size) { 5476 num_bytes = block_rsv->reserved - block_rsv->size; 5477 block_rsv->reserved = block_rsv->size; 5478 block_rsv->full = 1; 5479 } else { 5480 num_bytes = 0; 5481 } 5482 spin_unlock(&block_rsv->lock); 5483 5484 if (num_bytes > 0) { 5485 if (dest) { 5486 spin_lock(&dest->lock); 5487 if (!dest->full) { 5488 u64 bytes_to_add; 5489 5490 bytes_to_add = dest->size - dest->reserved; 5491 bytes_to_add = min(num_bytes, bytes_to_add); 5492 dest->reserved += bytes_to_add; 5493 if (dest->reserved >= dest->size) 5494 dest->full = 1; 5495 num_bytes -= bytes_to_add; 5496 } 5497 spin_unlock(&dest->lock); 5498 } 5499 if (num_bytes) 5500 space_info_add_old_bytes(fs_info, space_info, 5501 num_bytes); 5502 } 5503 } 5504 5505 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5506 struct btrfs_block_rsv *dst, u64 num_bytes, 5507 int update_size) 5508 { 5509 int ret; 5510 5511 ret = block_rsv_use_bytes(src, num_bytes); 5512 if (ret) 5513 return ret; 5514 5515 block_rsv_add_bytes(dst, num_bytes, update_size); 5516 return 0; 5517 } 5518 5519 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5520 { 5521 memset(rsv, 0, sizeof(*rsv)); 5522 spin_lock_init(&rsv->lock); 5523 rsv->type = type; 5524 } 5525 5526 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5527 unsigned short type) 5528 { 5529 struct btrfs_block_rsv *block_rsv; 5530 5531 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5532 if (!block_rsv) 5533 return NULL; 5534 5535 btrfs_init_block_rsv(block_rsv, type); 5536 block_rsv->space_info = __find_space_info(fs_info, 5537 BTRFS_BLOCK_GROUP_METADATA); 5538 return block_rsv; 5539 } 5540 5541 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 5542 struct btrfs_block_rsv *rsv) 5543 { 5544 if (!rsv) 5545 return; 5546 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5547 kfree(rsv); 5548 } 5549 5550 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) 5551 { 5552 kfree(rsv); 5553 } 5554 5555 int btrfs_block_rsv_add(struct btrfs_root *root, 5556 struct 
btrfs_block_rsv *block_rsv, u64 num_bytes, 5557 enum btrfs_reserve_flush_enum flush) 5558 { 5559 int ret; 5560 5561 if (num_bytes == 0) 5562 return 0; 5563 5564 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5565 if (!ret) { 5566 block_rsv_add_bytes(block_rsv, num_bytes, 1); 5567 return 0; 5568 } 5569 5570 return ret; 5571 } 5572 5573 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 5574 { 5575 u64 num_bytes = 0; 5576 int ret = -ENOSPC; 5577 5578 if (!block_rsv) 5579 return 0; 5580 5581 spin_lock(&block_rsv->lock); 5582 num_bytes = div_factor(block_rsv->size, min_factor); 5583 if (block_rsv->reserved >= num_bytes) 5584 ret = 0; 5585 spin_unlock(&block_rsv->lock); 5586 5587 return ret; 5588 } 5589 5590 int btrfs_block_rsv_refill(struct btrfs_root *root, 5591 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5592 enum btrfs_reserve_flush_enum flush) 5593 { 5594 u64 num_bytes = 0; 5595 int ret = -ENOSPC; 5596 5597 if (!block_rsv) 5598 return 0; 5599 5600 spin_lock(&block_rsv->lock); 5601 num_bytes = min_reserved; 5602 if (block_rsv->reserved >= num_bytes) 5603 ret = 0; 5604 else 5605 num_bytes -= block_rsv->reserved; 5606 spin_unlock(&block_rsv->lock); 5607 5608 if (!ret) 5609 return 0; 5610 5611 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5612 if (!ret) { 5613 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5614 return 0; 5615 } 5616 5617 return ret; 5618 } 5619 5620 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5621 struct btrfs_block_rsv *block_rsv, 5622 u64 num_bytes) 5623 { 5624 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5625 5626 if (global_rsv == block_rsv || 5627 block_rsv->space_info != global_rsv->space_info) 5628 global_rsv = NULL; 5629 block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes); 5630 } 5631 5632 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5633 { 5634 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5635 struct btrfs_space_info *sinfo = block_rsv->space_info; 5636 u64 num_bytes; 5637 5638 /* 5639 * The global block rsv is based on the size of the extent tree, the 5640 * checksum tree and the root tree. If the fs is empty we want to set 5641 * it to a minimal amount for safety. 
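 *
 * Sizing sketch (illustrative): the computation below amounts to
 *
 *	size = min(max(used(extent_root) + used(csum_root) +
 *		       used(tree_root), 16MiB), 512MiB);
 *
 * so e.g. 100MiB of combined root usage yields a 100MiB global reserve.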
5642 */ 5643 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5644 btrfs_root_used(&fs_info->csum_root->root_item) + 5645 btrfs_root_used(&fs_info->tree_root->root_item); 5646 num_bytes = max_t(u64, num_bytes, SZ_16M); 5647 5648 spin_lock(&sinfo->lock); 5649 spin_lock(&block_rsv->lock); 5650 5651 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5652 5653 if (block_rsv->reserved < block_rsv->size) { 5654 num_bytes = btrfs_space_info_used(sinfo, true); 5655 if (sinfo->total_bytes > num_bytes) { 5656 num_bytes = sinfo->total_bytes - num_bytes; 5657 num_bytes = min(num_bytes, 5658 block_rsv->size - block_rsv->reserved); 5659 block_rsv->reserved += num_bytes; 5660 sinfo->bytes_may_use += num_bytes; 5661 trace_btrfs_space_reservation(fs_info, "space_info", 5662 sinfo->flags, num_bytes, 5663 1); 5664 } 5665 } else if (block_rsv->reserved > block_rsv->size) { 5666 num_bytes = block_rsv->reserved - block_rsv->size; 5667 sinfo->bytes_may_use -= num_bytes; 5668 trace_btrfs_space_reservation(fs_info, "space_info", 5669 sinfo->flags, num_bytes, 0); 5670 block_rsv->reserved = block_rsv->size; 5671 } 5672 5673 if (block_rsv->reserved == block_rsv->size) 5674 block_rsv->full = 1; 5675 else 5676 block_rsv->full = 0; 5677 5678 spin_unlock(&block_rsv->lock); 5679 spin_unlock(&sinfo->lock); 5680 } 5681 5682 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5683 { 5684 struct btrfs_space_info *space_info; 5685 5686 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5687 fs_info->chunk_block_rsv.space_info = space_info; 5688 5689 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5690 fs_info->global_block_rsv.space_info = space_info; 5691 fs_info->delalloc_block_rsv.space_info = space_info; 5692 fs_info->trans_block_rsv.space_info = space_info; 5693 fs_info->empty_block_rsv.space_info = space_info; 5694 fs_info->delayed_block_rsv.space_info = space_info; 5695 5696 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 5697 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 5698 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5699 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5700 if (fs_info->quota_root) 5701 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5702 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5703 5704 update_global_block_rsv(fs_info); 5705 } 5706 5707 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5708 { 5709 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5710 (u64)-1); 5711 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 5712 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 5713 WARN_ON(fs_info->trans_block_rsv.size > 0); 5714 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5715 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5716 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5717 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5718 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5719 } 5720 5721 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5722 struct btrfs_fs_info *fs_info) 5723 { 5724 if (!trans->block_rsv) 5725 return; 5726 5727 if (!trans->bytes_reserved) 5728 return; 5729 5730 trace_btrfs_space_reservation(fs_info, "transaction", 5731 trans->transid, trans->bytes_reserved, 0); 5732 btrfs_block_rsv_release(fs_info, trans->block_rsv, 5733 trans->bytes_reserved); 5734 trans->bytes_reserved = 0; 5735 } 5736 5737 /* 5738 * To be called after all the new block groups attached to the transaction 5739 * 
handle have been created (btrfs_create_pending_block_groups()). 5740 */ 5741 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5742 { 5743 struct btrfs_fs_info *fs_info = trans->fs_info; 5744 5745 if (!trans->chunk_bytes_reserved) 5746 return; 5747 5748 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5749 5750 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5751 trans->chunk_bytes_reserved); 5752 trans->chunk_bytes_reserved = 0; 5753 } 5754 5755 /* Can only return 0 or -ENOSPC */ 5756 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5757 struct btrfs_inode *inode) 5758 { 5759 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5760 struct btrfs_root *root = inode->root; 5761 /* 5762 * We always use trans->block_rsv here as we will have reserved space 5763 * for our orphan when starting the transaction; using get_block_rsv() 5764 * here would sometimes make us choose the wrong block rsv as we could 5765 * be doing a reloc inode for a non refcounted root. 5766 */ 5767 struct btrfs_block_rsv *src_rsv = trans->block_rsv; 5768 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 5769 5770 /* 5771 * We need to hold space in order to delete our orphan item once we've 5772 * added it, so this takes the reservation so we can release it later 5773 * when we are truly done with the orphan item. 5774 */ 5775 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5776 5777 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 5778 num_bytes, 1); 5779 return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 5780 } 5781 5782 void btrfs_orphan_release_metadata(struct btrfs_inode *inode) 5783 { 5784 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5785 struct btrfs_root *root = inode->root; 5786 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5787 5788 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 5789 num_bytes, 0); 5790 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); 5791 } 5792 5793 /* 5794 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5795 * root: the root of the parent directory 5796 * rsv: block reservation 5797 * items: the number of items that we need to reserve space for 5798 * qgroup_reserved: used to return the size reserved from the qgroup 5799 * 5800 * This function is used to reserve the space for snapshot/subvolume 5801 * creation and deletion. Those operations differ from common 5802 * file/directory operations: they change two fs/file trees 5803 * and the root tree, and the number of items the qgroup reserves 5804 * differs from the free space reservation. So we can not use 5805 * the space reservation mechanism in start_transaction().
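 *
 * Worked example (illustrative): creating a snapshot with quotas enabled
 * first reserves 3 * nodesize bytes of qgroup metadata (one parent inode,
 * two dir entries), then btrfs_calc_trans_metadata_size(fs_info, items)
 * bytes in @rsv, falling back to stealing from the global rsv on -ENOSPC
 * when @use_global_rsv is set.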
 */
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
				     struct btrfs_block_rsv *rsv,
				     int items,
				     u64 *qgroup_reserved,
				     bool use_global_rsv)
{
	u64 num_bytes;
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
		/* One for parent inode, two for dir entries */
		num_bytes = 3 * fs_info->nodesize;
		ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
		if (ret)
			return ret;
	} else {
		num_bytes = 0;
	}

	*qgroup_reserved = num_bytes;

	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
	rsv->space_info = __find_space_info(fs_info,
					    BTRFS_BLOCK_GROUP_METADATA);
	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
				  BTRFS_RESERVE_FLUSH_ALL);

	if (ret == -ENOSPC && use_global_rsv)
		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);

	if (ret && *qgroup_reserved)
		btrfs_qgroup_free_meta(root, *qgroup_reserved);

	return ret;
}

void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
				      struct btrfs_block_rsv *rsv)
{
	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
}

/**
 * drop_outstanding_extent - drop an outstanding extent
 * @inode: the inode we're dropping the extent for
 * @num_bytes: the number of bytes we're releasing.
 *
 * This is called when we are freeing up an outstanding extent, either after
 * an error or after the extent has been written. This will return the number
 * of reserved extents that need to be freed. This must be called with
 * BTRFS_I(inode)->lock held.
 */
static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
					u64 num_bytes)
{
	unsigned drop_inode_space = 0;
	unsigned dropped_extents = 0;
	unsigned num_extents;

	num_extents = count_max_extents(num_bytes);
	ASSERT(num_extents);
	ASSERT(inode->outstanding_extents >= num_extents);
	inode->outstanding_extents -= num_extents;

	if (inode->outstanding_extents == 0 &&
	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
			       &inode->runtime_flags))
		drop_inode_space = 1;

	/*
	 * If we still have at least as many outstanding extents as we have
	 * reserved extents, then we need to leave the reserved extents count
	 * alone.
	 */
	if (inode->outstanding_extents >= inode->reserved_extents)
		return drop_inode_space;

	dropped_extents = inode->reserved_extents - inode->outstanding_extents;
	inode->reserved_extents -= dropped_extents;
	return dropped_extents + drop_inode_space;
}

/**
 * calc_csum_metadata_size - return the amount of metadata space that must be
 *	reserved/freed for the given bytes.
 * @inode: the inode we're manipulating
 * @num_bytes: the number of bytes in question
 * @reserve: 1 if we are reserving space, 0 if we are freeing space
 *
 * This adjusts the number of csum_bytes in the inode and then returns the
 * correct amount of metadata that must either be reserved or freed. We
 * calculate how many checksums we can fit into one leaf and then divide the
 * number of bytes that will need to be checksummed by this value to figure out
 * how many checksums will be required.
If we are adding bytes then the number 5902 * may go up and we will return the number of additional bytes that must be 5903 * reserved. If it is going down we will return the number of bytes that must 5904 * be freed. 5905 * 5906 * This must be called with BTRFS_I(inode)->lock held. 5907 */ 5908 static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, 5909 int reserve) 5910 { 5911 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5912 u64 old_csums, num_csums; 5913 5914 if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) 5915 return 0; 5916 5917 old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 5918 if (reserve) 5919 inode->csum_bytes += num_bytes; 5920 else 5921 inode->csum_bytes -= num_bytes; 5922 num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 5923 5924 /* No change, no need to reserve more */ 5925 if (old_csums == num_csums) 5926 return 0; 5927 5928 if (reserve) 5929 return btrfs_calc_trans_metadata_size(fs_info, 5930 num_csums - old_csums); 5931 5932 return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); 5933 } 5934 5935 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 5936 { 5937 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5938 struct btrfs_root *root = inode->root; 5939 struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; 5940 u64 to_reserve = 0; 5941 u64 csum_bytes; 5942 unsigned nr_extents; 5943 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5944 int ret = 0; 5945 bool delalloc_lock = true; 5946 u64 to_free = 0; 5947 unsigned dropped; 5948 bool release_extra = false; 5949 5950 /* If we are a free space inode we need to not flush since we will be in 5951 * the middle of a transaction commit. We also don't need the delalloc 5952 * mutex since we won't race with anybody. We need this mostly to make 5953 * lockdep shut its filthy mouth. 5954 * 5955 * If we have a transaction open (can happen if we call truncate_block 5956 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 5957 */ 5958 if (btrfs_is_free_space_inode(inode)) { 5959 flush = BTRFS_RESERVE_NO_FLUSH; 5960 delalloc_lock = false; 5961 } else if (current->journal_info) { 5962 flush = BTRFS_RESERVE_FLUSH_LIMIT; 5963 } 5964 5965 if (flush != BTRFS_RESERVE_NO_FLUSH && 5966 btrfs_transaction_in_commit(fs_info)) 5967 schedule_timeout(1); 5968 5969 if (delalloc_lock) 5970 mutex_lock(&inode->delalloc_mutex); 5971 5972 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 5973 5974 spin_lock(&inode->lock); 5975 nr_extents = count_max_extents(num_bytes); 5976 inode->outstanding_extents += nr_extents; 5977 5978 nr_extents = 0; 5979 if (inode->outstanding_extents > inode->reserved_extents) 5980 nr_extents += inode->outstanding_extents - 5981 inode->reserved_extents; 5982 5983 /* We always want to reserve a slot for updating the inode. 
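	 * As a rough example (assuming the usual BTRFS_MAX_EXTENT_SIZE of
	 * 128M and no extents previously reserved): a 256M reservation
	 * yields nr_extents = 2, so we reserve metadata for three items
	 * here, the two extents plus the inode slot, on top of the csum
	 * space computed below.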
	 */
	to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
	csum_bytes = inode->csum_bytes;
	spin_unlock(&inode->lock);

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
		ret = btrfs_qgroup_reserve_meta(root,
				nr_extents * fs_info->nodesize, true);
		if (ret)
			goto out_fail;
	}

	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
	if (unlikely(ret)) {
		btrfs_qgroup_free_meta(root,
				       nr_extents * fs_info->nodesize);
		goto out_fail;
	}

	spin_lock(&inode->lock);
	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
			     &inode->runtime_flags)) {
		to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
		release_extra = true;
	}
	inode->reserved_extents += nr_extents;
	spin_unlock(&inode->lock);

	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);

	if (to_reserve)
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), to_reserve, 1);
	if (release_extra)
		btrfs_block_rsv_release(fs_info, block_rsv,
				btrfs_calc_trans_metadata_size(fs_info, 1));
	return 0;

out_fail:
	spin_lock(&inode->lock);
	dropped = drop_outstanding_extent(inode, num_bytes);
	/*
	 * If the inode's csum_bytes is the same as the original
	 * csum_bytes then we know we haven't raced with any free()ers
	 * so we can just reduce our inode's csum bytes and carry on.
	 */
	if (inode->csum_bytes == csum_bytes) {
		calc_csum_metadata_size(inode, num_bytes, 0);
	} else {
		u64 orig_csum_bytes = inode->csum_bytes;
		u64 bytes;

		/*
		 * This is tricky, but first we need to figure out how much we
		 * freed from any free-ers that occurred during this
		 * reservation, so we reset ->csum_bytes to the csum_bytes
		 * before we dropped our lock, and then call the free for the
		 * number of bytes that were freed while we were trying our
		 * reservation.
		 */
		bytes = csum_bytes - inode->csum_bytes;
		inode->csum_bytes = csum_bytes;
		to_free = calc_csum_metadata_size(inode, bytes, 0);

		/*
		 * Now we need to see how much we would have freed had we not
		 * been making this reservation and our ->csum_bytes were not
		 * artificially inflated.
		 */
		inode->csum_bytes = csum_bytes - num_bytes;
		bytes = csum_bytes - orig_csum_bytes;
		bytes = calc_csum_metadata_size(inode, bytes, 0);

		/*
		 * Now reset ->csum_bytes to what it should be. If bytes is
		 * more than to_free then we would have freed more space had we
		 * not had an artificially high ->csum_bytes, so we need to free
		 * the remainder. If bytes is the same or less then we don't
		 * need to do anything, the other free-ers did the correct
		 * thing.
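		 *
		 * In short: we free the racers' portion at the inflated
		 * baseline, recompute what they would have freed at the
		 * uninflated baseline, and if the inflation made their frees
		 * too small, release the shortfall ourselves.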
		 */
		inode->csum_bytes = orig_csum_bytes - num_bytes;
		if (bytes > to_free)
			to_free = bytes - to_free;
		else
			to_free = 0;
	}
	spin_unlock(&inode->lock);
	if (dropped)
		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);

	if (to_free) {
		btrfs_block_rsv_release(fs_info, block_rsv, to_free);
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), to_free, 0);
	}
	if (delalloc_lock)
		mutex_unlock(&inode->delalloc_mutex);
	return ret;
}

/**
 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 * @inode: the inode to release the reservation for
 * @num_bytes: the number of bytes we're releasing
 *
 * This will release the metadata reservation for an inode. This can be called
 * once we complete IO for a given set of bytes to release their metadata
 * reservations.
 */
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	u64 to_free = 0;
	unsigned dropped;

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
	spin_lock(&inode->lock);
	dropped = drop_outstanding_extent(inode, num_bytes);

	if (num_bytes)
		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
	spin_unlock(&inode->lock);
	if (dropped > 0)
		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);

	if (btrfs_is_testing(fs_info))
		return;

	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
				      to_free, 0);

	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
}

/**
 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 * delalloc
 * @inode: inode we're writing to
 * @start: start of the range we are writing to
 * @len: length of the range we are writing to
 *
 * This will do the following things
 *
 * o reserve space in the data space info for len bytes
 *   and reserve the corresponding qgroup space
 *   (Done in check_data_free_space)
 *
 * o reserve metadata space, based on the number of outstanding
 *   extents and how many csums will be needed;
 *   also reserve metadata space in a per-root over-reserve method.
 * o add to the inode's delalloc_bytes
 * o add it to the fs_info's delalloc inodes list.
 *   (Above 3 all done in delalloc_reserve_metadata)
 *
 * Return 0 for success
 * Return <0 for error (-ENOSPC or -EDQUOT)
 */
int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
	int ret;

	ret = btrfs_check_data_free_space(inode, start, len);
	if (ret < 0)
		return ret;
	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
	if (ret < 0)
		btrfs_free_reserved_data_space(inode, start, len);
	return ret;
}

/**
 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 * @inode: inode we're releasing space for
 * @start: start position of the space already reserved
 * @len: the length of the space already reserved
 *
 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
 * called in the case that we don't need the metadata AND data reservations
 * anymore, e.g. after an error or when we insert an inline extent.
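 * (For instance, a write path that reserved space with
 * btrfs_delalloc_reserve_space() but then fails before dirtying the pages
 * would unwind with this call; a hedged sketch, the exact callers vary.)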
6166 * 6167 * This function will release the metadata space that was not used and will 6168 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6169 * list if there are no delalloc bytes left. 6170 * Also it will handle the qgroup reserved space. 6171 */ 6172 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) 6173 { 6174 btrfs_delalloc_release_metadata(BTRFS_I(inode), len); 6175 btrfs_free_reserved_data_space(inode, start, len); 6176 } 6177 6178 static int update_block_group(struct btrfs_trans_handle *trans, 6179 struct btrfs_fs_info *info, u64 bytenr, 6180 u64 num_bytes, int alloc) 6181 { 6182 struct btrfs_block_group_cache *cache = NULL; 6183 u64 total = num_bytes; 6184 u64 old_val; 6185 u64 byte_in_group; 6186 int factor; 6187 6188 /* block accounting for super block */ 6189 spin_lock(&info->delalloc_root_lock); 6190 old_val = btrfs_super_bytes_used(info->super_copy); 6191 if (alloc) 6192 old_val += num_bytes; 6193 else 6194 old_val -= num_bytes; 6195 btrfs_set_super_bytes_used(info->super_copy, old_val); 6196 spin_unlock(&info->delalloc_root_lock); 6197 6198 while (total) { 6199 cache = btrfs_lookup_block_group(info, bytenr); 6200 if (!cache) 6201 return -ENOENT; 6202 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 6203 BTRFS_BLOCK_GROUP_RAID1 | 6204 BTRFS_BLOCK_GROUP_RAID10)) 6205 factor = 2; 6206 else 6207 factor = 1; 6208 /* 6209 * If this block group has free space cache written out, we 6210 * need to make sure to load it if we are removing space. This 6211 * is because we need the unpinning stage to actually add the 6212 * space back to the block group, otherwise we will leak space. 6213 */ 6214 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6215 cache_block_group(cache, 1); 6216 6217 byte_in_group = bytenr - cache->key.objectid; 6218 WARN_ON(byte_in_group > cache->key.offset); 6219 6220 spin_lock(&cache->space_info->lock); 6221 spin_lock(&cache->lock); 6222 6223 if (btrfs_test_opt(info, SPACE_CACHE) && 6224 cache->disk_cache_state < BTRFS_DC_CLEAR) 6225 cache->disk_cache_state = BTRFS_DC_CLEAR; 6226 6227 old_val = btrfs_block_group_used(&cache->item); 6228 num_bytes = min(total, cache->key.offset - byte_in_group); 6229 if (alloc) { 6230 old_val += num_bytes; 6231 btrfs_set_block_group_used(&cache->item, old_val); 6232 cache->reserved -= num_bytes; 6233 cache->space_info->bytes_reserved -= num_bytes; 6234 cache->space_info->bytes_used += num_bytes; 6235 cache->space_info->disk_used += num_bytes * factor; 6236 spin_unlock(&cache->lock); 6237 spin_unlock(&cache->space_info->lock); 6238 } else { 6239 old_val -= num_bytes; 6240 btrfs_set_block_group_used(&cache->item, old_val); 6241 cache->pinned += num_bytes; 6242 cache->space_info->bytes_pinned += num_bytes; 6243 cache->space_info->bytes_used -= num_bytes; 6244 cache->space_info->disk_used -= num_bytes * factor; 6245 spin_unlock(&cache->lock); 6246 spin_unlock(&cache->space_info->lock); 6247 6248 trace_btrfs_space_reservation(info, "pinned", 6249 cache->space_info->flags, 6250 num_bytes, 1); 6251 set_extent_dirty(info->pinned_extents, 6252 bytenr, bytenr + num_bytes - 1, 6253 GFP_NOFS | __GFP_NOFAIL); 6254 } 6255 6256 spin_lock(&trans->transaction->dirty_bgs_lock); 6257 if (list_empty(&cache->dirty_list)) { 6258 list_add_tail(&cache->dirty_list, 6259 &trans->transaction->dirty_bgs); 6260 trans->transaction->num_dirty_bgs++; 6261 btrfs_get_block_group(cache); 6262 } 6263 spin_unlock(&trans->transaction->dirty_bgs_lock); 6264 6265 /* 6266 * No longer have used bytes in this block group, 
queue it for 6267 * deletion. We do this after adding the block group to the 6268 * dirty list to avoid races between cleaner kthread and space 6269 * cache writeout. 6270 */ 6271 if (!alloc && old_val == 0) { 6272 spin_lock(&info->unused_bgs_lock); 6273 if (list_empty(&cache->bg_list)) { 6274 btrfs_get_block_group(cache); 6275 list_add_tail(&cache->bg_list, 6276 &info->unused_bgs); 6277 } 6278 spin_unlock(&info->unused_bgs_lock); 6279 } 6280 6281 btrfs_put_block_group(cache); 6282 total -= num_bytes; 6283 bytenr += num_bytes; 6284 } 6285 return 0; 6286 } 6287 6288 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6289 { 6290 struct btrfs_block_group_cache *cache; 6291 u64 bytenr; 6292 6293 spin_lock(&fs_info->block_group_cache_lock); 6294 bytenr = fs_info->first_logical_byte; 6295 spin_unlock(&fs_info->block_group_cache_lock); 6296 6297 if (bytenr < (u64)-1) 6298 return bytenr; 6299 6300 cache = btrfs_lookup_first_block_group(fs_info, search_start); 6301 if (!cache) 6302 return 0; 6303 6304 bytenr = cache->key.objectid; 6305 btrfs_put_block_group(cache); 6306 6307 return bytenr; 6308 } 6309 6310 static int pin_down_extent(struct btrfs_fs_info *fs_info, 6311 struct btrfs_block_group_cache *cache, 6312 u64 bytenr, u64 num_bytes, int reserved) 6313 { 6314 spin_lock(&cache->space_info->lock); 6315 spin_lock(&cache->lock); 6316 cache->pinned += num_bytes; 6317 cache->space_info->bytes_pinned += num_bytes; 6318 if (reserved) { 6319 cache->reserved -= num_bytes; 6320 cache->space_info->bytes_reserved -= num_bytes; 6321 } 6322 spin_unlock(&cache->lock); 6323 spin_unlock(&cache->space_info->lock); 6324 6325 trace_btrfs_space_reservation(fs_info, "pinned", 6326 cache->space_info->flags, num_bytes, 1); 6327 set_extent_dirty(fs_info->pinned_extents, bytenr, 6328 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6329 return 0; 6330 } 6331 6332 /* 6333 * this function must be called within transaction 6334 */ 6335 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 6336 u64 bytenr, u64 num_bytes, int reserved) 6337 { 6338 struct btrfs_block_group_cache *cache; 6339 6340 cache = btrfs_lookup_block_group(fs_info, bytenr); 6341 BUG_ON(!cache); /* Logic error */ 6342 6343 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); 6344 6345 btrfs_put_block_group(cache); 6346 return 0; 6347 } 6348 6349 /* 6350 * this function must be called within transaction 6351 */ 6352 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 6353 u64 bytenr, u64 num_bytes) 6354 { 6355 struct btrfs_block_group_cache *cache; 6356 int ret; 6357 6358 cache = btrfs_lookup_block_group(fs_info, bytenr); 6359 if (!cache) 6360 return -EINVAL; 6361 6362 /* 6363 * pull in the free space cache (if any) so that our pin 6364 * removes the free space from the cache. We have load_only set 6365 * to one because the slow code to read in the free extents does check 6366 * the pinned extents. 
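	 * (In other words, only this fast cache-load path needs the fix-up
	 * below; the slow caching path already accounts for pinned ranges
	 * on its own.)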
6367 */ 6368 cache_block_group(cache, 1); 6369 6370 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); 6371 6372 /* remove us from the free space cache (if we're there at all) */ 6373 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6374 btrfs_put_block_group(cache); 6375 return ret; 6376 } 6377 6378 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 6379 u64 start, u64 num_bytes) 6380 { 6381 int ret; 6382 struct btrfs_block_group_cache *block_group; 6383 struct btrfs_caching_control *caching_ctl; 6384 6385 block_group = btrfs_lookup_block_group(fs_info, start); 6386 if (!block_group) 6387 return -EINVAL; 6388 6389 cache_block_group(block_group, 0); 6390 caching_ctl = get_caching_control(block_group); 6391 6392 if (!caching_ctl) { 6393 /* Logic error */ 6394 BUG_ON(!block_group_cache_done(block_group)); 6395 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6396 } else { 6397 mutex_lock(&caching_ctl->mutex); 6398 6399 if (start >= caching_ctl->progress) { 6400 ret = add_excluded_extent(fs_info, start, num_bytes); 6401 } else if (start + num_bytes <= caching_ctl->progress) { 6402 ret = btrfs_remove_free_space(block_group, 6403 start, num_bytes); 6404 } else { 6405 num_bytes = caching_ctl->progress - start; 6406 ret = btrfs_remove_free_space(block_group, 6407 start, num_bytes); 6408 if (ret) 6409 goto out_lock; 6410 6411 num_bytes = (start + num_bytes) - 6412 caching_ctl->progress; 6413 start = caching_ctl->progress; 6414 ret = add_excluded_extent(fs_info, start, num_bytes); 6415 } 6416 out_lock: 6417 mutex_unlock(&caching_ctl->mutex); 6418 put_caching_control(caching_ctl); 6419 } 6420 btrfs_put_block_group(block_group); 6421 return ret; 6422 } 6423 6424 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, 6425 struct extent_buffer *eb) 6426 { 6427 struct btrfs_file_extent_item *item; 6428 struct btrfs_key key; 6429 int found_type; 6430 int i; 6431 6432 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 6433 return 0; 6434 6435 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6436 btrfs_item_key_to_cpu(eb, &key, i); 6437 if (key.type != BTRFS_EXTENT_DATA_KEY) 6438 continue; 6439 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6440 found_type = btrfs_file_extent_type(eb, item); 6441 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6442 continue; 6443 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6444 continue; 6445 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6446 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6447 __exclude_logged_extent(fs_info, key.objectid, key.offset); 6448 } 6449 6450 return 0; 6451 } 6452 6453 static void 6454 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6455 { 6456 atomic_inc(&bg->reservations); 6457 } 6458 6459 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6460 const u64 start) 6461 { 6462 struct btrfs_block_group_cache *bg; 6463 6464 bg = btrfs_lookup_block_group(fs_info, start); 6465 ASSERT(bg); 6466 if (atomic_dec_and_test(&bg->reservations)) 6467 wake_up_atomic_t(&bg->reservations); 6468 btrfs_put_block_group(bg); 6469 } 6470 6471 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) 6472 { 6473 schedule(); 6474 return 0; 6475 } 6476 6477 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6478 { 6479 struct btrfs_space_info *space_info = bg->space_info; 6480 6481 ASSERT(bg->ro); 6482 6483 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6484 return; 6485 6486 /* 6487 * Our block group is read only but before 
we set it to read only,
	 * some task might have allocated an extent from it already, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_on_atomic_t(&bg->reservations,
			 btrfs_wait_bg_reservations_atomic_t,
			 TASK_UNINTERRUPTIBLE);
}

/**
 * btrfs_add_reserved_bytes - update the block_group and space info counters
 * @cache: The cache we are manipulating
 * @ram_bytes: The number of bytes of file content; the same as @num_bytes
 *             except on the compression path.
 * @num_bytes: The number of bytes in question
 * @delalloc: The blocks are allocated for the delalloc write
 *
 * This is called by the allocator when it reserves space. If this is a
 * reservation and the block group has become read only we cannot make the
 * reservation and return -EAGAIN, otherwise this function always succeeds.
 */
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc)
{
	struct btrfs_space_info *space_info = cache->space_info;
	int ret = 0;

	spin_lock(&space_info->lock);
	spin_lock(&cache->lock);
	if (cache->ro) {
		ret = -EAGAIN;
	} else {
		cache->reserved += num_bytes;
		space_info->bytes_reserved += num_bytes;

		trace_btrfs_space_reservation(cache->fs_info,
				"space_info", space_info->flags,
				ram_bytes, 0);
		space_info->bytes_may_use -= ram_bytes;
		if (delalloc)
			cache->delalloc_bytes += num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&space_info->lock);
	return ret;
}

/**
 * btrfs_free_reserved_bytes - update the block_group and space info counters
 * @cache: The cache we are manipulating
 * @num_bytes: The number of bytes in question
 * @delalloc: The blocks are allocated for the delalloc write
 *
 * This is called by somebody who is freeing space that was never actually used
 * on disk. For example if you reserve some space for a new leaf in transaction
 * A and before transaction A commits you free that leaf, you call this to
 * clear the reservation.
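 *
 * A hedged example: reserve one nodesize for a new leaf via
 * btrfs_add_reserved_bytes(), then abandon the COW before the leaf is
 * written; calling this with that num_bytes drops it from cache->reserved
 * and space_info->bytes_reserved (crediting bytes_readonly instead if the
 * group has gone read only in the meantime).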
6552 */ 6553 6554 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 6555 u64 num_bytes, int delalloc) 6556 { 6557 struct btrfs_space_info *space_info = cache->space_info; 6558 int ret = 0; 6559 6560 spin_lock(&space_info->lock); 6561 spin_lock(&cache->lock); 6562 if (cache->ro) 6563 space_info->bytes_readonly += num_bytes; 6564 cache->reserved -= num_bytes; 6565 space_info->bytes_reserved -= num_bytes; 6566 6567 if (delalloc) 6568 cache->delalloc_bytes -= num_bytes; 6569 spin_unlock(&cache->lock); 6570 spin_unlock(&space_info->lock); 6571 return ret; 6572 } 6573 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 6574 { 6575 struct btrfs_caching_control *next; 6576 struct btrfs_caching_control *caching_ctl; 6577 struct btrfs_block_group_cache *cache; 6578 6579 down_write(&fs_info->commit_root_sem); 6580 6581 list_for_each_entry_safe(caching_ctl, next, 6582 &fs_info->caching_block_groups, list) { 6583 cache = caching_ctl->block_group; 6584 if (block_group_cache_done(cache)) { 6585 cache->last_byte_to_unpin = (u64)-1; 6586 list_del_init(&caching_ctl->list); 6587 put_caching_control(caching_ctl); 6588 } else { 6589 cache->last_byte_to_unpin = caching_ctl->progress; 6590 } 6591 } 6592 6593 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6594 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6595 else 6596 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6597 6598 up_write(&fs_info->commit_root_sem); 6599 6600 update_global_block_rsv(fs_info); 6601 } 6602 6603 /* 6604 * Returns the free cluster for the given space info and sets empty_cluster to 6605 * what it should be based on the mount options. 6606 */ 6607 static struct btrfs_free_cluster * 6608 fetch_cluster_info(struct btrfs_fs_info *fs_info, 6609 struct btrfs_space_info *space_info, u64 *empty_cluster) 6610 { 6611 struct btrfs_free_cluster *ret = NULL; 6612 bool ssd = btrfs_test_opt(fs_info, SSD); 6613 6614 *empty_cluster = 0; 6615 if (btrfs_mixed_space_info(space_info)) 6616 return ret; 6617 6618 if (ssd) 6619 *empty_cluster = SZ_2M; 6620 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6621 ret = &fs_info->meta_alloc_cluster; 6622 if (!ssd) 6623 *empty_cluster = SZ_64K; 6624 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { 6625 ret = &fs_info->data_alloc_cluster; 6626 } 6627 6628 return ret; 6629 } 6630 6631 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 6632 u64 start, u64 end, 6633 const bool return_free_space) 6634 { 6635 struct btrfs_block_group_cache *cache = NULL; 6636 struct btrfs_space_info *space_info; 6637 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6638 struct btrfs_free_cluster *cluster = NULL; 6639 u64 len; 6640 u64 total_unpinned = 0; 6641 u64 empty_cluster = 0; 6642 bool readonly; 6643 6644 while (start <= end) { 6645 readonly = false; 6646 if (!cache || 6647 start >= cache->key.objectid + cache->key.offset) { 6648 if (cache) 6649 btrfs_put_block_group(cache); 6650 total_unpinned = 0; 6651 cache = btrfs_lookup_block_group(fs_info, start); 6652 BUG_ON(!cache); /* Logic error */ 6653 6654 cluster = fetch_cluster_info(fs_info, 6655 cache->space_info, 6656 &empty_cluster); 6657 empty_cluster <<= 1; 6658 } 6659 6660 len = cache->key.objectid + cache->key.offset - start; 6661 len = min(len, end + 1 - start); 6662 6663 if (start < cache->last_byte_to_unpin) { 6664 len = min(len, cache->last_byte_to_unpin - start); 6665 if (return_free_space) 6666 btrfs_add_free_space(cache, start, len); 6667 } 6668 6669 start += 
len; 6670 total_unpinned += len; 6671 space_info = cache->space_info; 6672 6673 /* 6674 * If this space cluster has been marked as fragmented and we've 6675 * unpinned enough in this block group to potentially allow a 6676 * cluster to be created inside of it go ahead and clear the 6677 * fragmented check. 6678 */ 6679 if (cluster && cluster->fragmented && 6680 total_unpinned > empty_cluster) { 6681 spin_lock(&cluster->lock); 6682 cluster->fragmented = 0; 6683 spin_unlock(&cluster->lock); 6684 } 6685 6686 spin_lock(&space_info->lock); 6687 spin_lock(&cache->lock); 6688 cache->pinned -= len; 6689 space_info->bytes_pinned -= len; 6690 6691 trace_btrfs_space_reservation(fs_info, "pinned", 6692 space_info->flags, len, 0); 6693 space_info->max_extent_size = 0; 6694 percpu_counter_add(&space_info->total_bytes_pinned, -len); 6695 if (cache->ro) { 6696 space_info->bytes_readonly += len; 6697 readonly = true; 6698 } 6699 spin_unlock(&cache->lock); 6700 if (!readonly && return_free_space && 6701 global_rsv->space_info == space_info) { 6702 u64 to_add = len; 6703 WARN_ON(!return_free_space); 6704 spin_lock(&global_rsv->lock); 6705 if (!global_rsv->full) { 6706 to_add = min(len, global_rsv->size - 6707 global_rsv->reserved); 6708 global_rsv->reserved += to_add; 6709 space_info->bytes_may_use += to_add; 6710 if (global_rsv->reserved >= global_rsv->size) 6711 global_rsv->full = 1; 6712 trace_btrfs_space_reservation(fs_info, 6713 "space_info", 6714 space_info->flags, 6715 to_add, 1); 6716 len -= to_add; 6717 } 6718 spin_unlock(&global_rsv->lock); 6719 /* Add to any tickets we may have */ 6720 if (len) 6721 space_info_add_new_bytes(fs_info, space_info, 6722 len); 6723 } 6724 spin_unlock(&space_info->lock); 6725 } 6726 6727 if (cache) 6728 btrfs_put_block_group(cache); 6729 return 0; 6730 } 6731 6732 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 6733 struct btrfs_fs_info *fs_info) 6734 { 6735 struct btrfs_block_group_cache *block_group, *tmp; 6736 struct list_head *deleted_bgs; 6737 struct extent_io_tree *unpin; 6738 u64 start; 6739 u64 end; 6740 int ret; 6741 6742 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6743 unpin = &fs_info->freed_extents[1]; 6744 else 6745 unpin = &fs_info->freed_extents[0]; 6746 6747 while (!trans->aborted) { 6748 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6749 ret = find_first_extent_bit(unpin, 0, &start, &end, 6750 EXTENT_DIRTY, NULL); 6751 if (ret) { 6752 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6753 break; 6754 } 6755 6756 if (btrfs_test_opt(fs_info, DISCARD)) 6757 ret = btrfs_discard_extent(fs_info, start, 6758 end + 1 - start, NULL); 6759 6760 clear_extent_dirty(unpin, start, end); 6761 unpin_extent_range(fs_info, start, end, true); 6762 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6763 cond_resched(); 6764 } 6765 6766 /* 6767 * Transaction is finished. We don't need the lock anymore. We 6768 * do need to clean up the block groups in case of a transaction 6769 * abort. 
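	 *
	 * Block groups deleted in this transaction are still queued on the
	 * transaction's deleted_bgs list; walk it below, optionally discard
	 * their ranges, and drop the references the list held.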
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
		if (!trans->aborted)
			ret = btrfs_discard_extent(fs_info,
						   block_group->key.objectid,
						   block_group->key.offset,
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_put_block_group_trimming(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
				   "discard failed while removing block group: errno=%d %s",
				   ret, errstr);
		}
	}

	return 0;
}

static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
			     u64 owner, u64 root_objectid)
{
	struct btrfs_space_info *space_info;
	u64 flags;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
			flags = BTRFS_BLOCK_GROUP_SYSTEM;
		else
			flags = BTRFS_BLOCK_GROUP_METADATA;
	} else {
		flags = BTRFS_BLOCK_GROUP_DATA;
	}

	space_info = __find_space_info(fs_info, flags);
	BUG_ON(!space_info); /* Logic bug */
	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
}

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *info,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_root *extent_root = info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	int ret;
	int is_data;
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
	u32 item_size;
	u64 refs;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	int last_ref = 0;
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

	if (is_data)
		skinny_metadata = 0;

	ret = lookup_extent_backref(trans, info, path, &iref,
				    bytenr, num_bytes, parent,
				    root_objectid, owner_objectid,
				    owner_offset);
	if (ret == 0) {
		extent_slot = path->slots[0];
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      extent_slot);
			if (key.objectid != bytenr)
				break;
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
				found_extent = 1;
				break;
			}
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
			if (path->slots[0] - extent_slot > 5)
				break;
			extent_slot--;
		}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
		if (found_extent && item_size < sizeof(*ei))
			found_extent = 0;
#endif
		if (!found_extent) {
			BUG_ON(iref);
			ret = remove_extent_backref(trans, info, path, NULL,
						    refs_to_drop,
						    is_data, &last_ref);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
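			/*
			 * The extent item itself was not next to the backref
			 * we just removed, so re-find it explicitly; the
			 * refcount update or deletion below must operate on
			 * the right slot.
			 */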
			btrfs_release_path(path);
			path->leave_spinning = 1;

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
				key.objectid = bytenr;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

			if (ret) {
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
				if (ret > 0)
					btrfs_print_leaf(info, path->nodes[0]);
			}
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
			extent_slot = path->slots[0];
		}
	} else if (WARN_ON(ret == -ENOENT)) {
		btrfs_print_leaf(info, path->nodes[0]);
		btrfs_err(info,
			  "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
			  bytenr, parent, root_objectid, owner_objectid,
			  owner_offset);
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, extent_slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		BUG_ON(found_extent || extent_slot != path->slots[0]);
		ret = convert_extent_item_v0(trans, info, path, owner_objectid,
					     0);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		btrfs_release_path(path);
		path->leave_spinning = 1;

		key.objectid = bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = num_bytes;

		ret = btrfs_search_slot(trans, extent_root, &key, path,
					-1, 1);
		if (ret) {
			btrfs_err(info,
				  "umm, got %d back from search, was looking for %llu",
				  ret, bytenr);
			btrfs_print_leaf(info, path->nodes[0]);
		}
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		extent_slot = path->slots[0];
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, extent_slot);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}

	refs = btrfs_extent_refs(leaf, ei);
	if (refs < refs_to_drop) {
		btrfs_err(info,
			  "trying to drop %d refs but we only have %llu for bytenr %llu",
			  refs_to_drop, refs, bytenr);
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	refs -= refs_to_drop;

	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
		 */
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
			ret = remove_extent_backref(trans, info, path,
						    iref, refs_to_drop,
						    is_data, &last_ref);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}
		add_pinned_bytes(info, -num_bytes, owner_objectid,
				 root_objectid);
	} else {
		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
			       extent_data_ref_count(path, iref));
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}

		last_ref = 1;
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
		btrfs_release_path(path);

		if (is_data) {
			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
		}

		ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = update_block_group(trans, info, bytenr, num_bytes, 0);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}
	btrfs_release_path(path);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well. This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes the head from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				      u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head)
		goto out_delayed_unlock;

	spin_lock(&head->lock);
	if (!list_empty(&head->ref_list))
		goto out;

	if (head->extent_op) {
		if (!head->must_insert_reserved)
			goto out;
		btrfs_free_delayed_extent_op(head->extent_op);
		head->extent_op = NULL;
	}

	/*
	 * waiting for the lock here would deadlock. If someone else has it
	 * locked, they are already in the process of dropping it anyway.
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

	/*
	 * at this point we have a head with no other entries. Go
	 * ahead and process it.
	 */
	head->node.in_tree = 0;
	rb_erase(&head->href_node, &delayed_refs->href_root);

	atomic_dec(&delayed_refs->num_entries);

	/*
	 * we don't take a ref on the node because we're removing it from the
	 * tree, so we just steal the ref the tree was holding.
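	 *
	 * Returning 1 below tells the caller (see btrfs_free_tree_block())
	 * that the head had must_insert_reserved set, i.e. the reserved
	 * extent was never inserted and the caller must clean it up itself.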
	 */
	delayed_refs->num_heads--;
	if (head->processing == 0)
		delayed_refs->num_heads_ready--;
	head->processing = 0;
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

	mutex_unlock(&head->mutex);
	btrfs_put_delayed_ref(&head->node);
	return ret;
out:
	spin_unlock(&head->lock);

out_delayed_unlock:
	spin_unlock(&delayed_refs->lock);
	return 0;
}

void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   u64 parent, int last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int pin = 1;
	int ret;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(fs_info, trans,
						 buf->start, buf->len,
						 parent,
						 root->root_key.objectid,
						 btrfs_header_level(buf),
						 BTRFS_DROP_DELAYED_REF, NULL);
		BUG_ON(ret); /* -ENOMEM */
	}

	if (!last_ref)
		return;

	if (btrfs_header_generation(buf) == trans->transid) {
		struct btrfs_block_group_cache *cache;

		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
			ret = check_ref_cleanup(trans, buf->start);
			if (!ret)
				goto out;
		}

		cache = btrfs_lookup_block_group(fs_info, buf->start);

		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
			pin_down_extent(fs_info, cache, buf->start,
					buf->len, 1);
			btrfs_put_block_group(cache);
			goto out;
		}

		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
		btrfs_free_reserved_bytes(cache, buf->len, 0);
		btrfs_put_block_group(cache);
		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
		pin = 0;
	}
out:
	if (pin)
		add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
				 root->root_key.objectid);

	/*
	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
	 * anymore.
	 */
	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}

/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info,
		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
		      u64 owner, u64 offset)
{
	int ret;

	if (btrfs_is_testing(fs_info))
		return 0;

	add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);

	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
		/* unlocks the pinned mutex */
		btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
		ret = 0;
	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
						 num_bytes,
						 parent, root_objectid, (int)owner,
						 BTRFS_DROP_DELAYED_REF, NULL);
	} else {
		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
						 num_bytes,
						 parent, root_objectid, owner,
						 offset, 0,
						 BTRFS_DROP_DELAYED_REF);
	}
	return ret;
}

/*
 * when we wait for progress in the block group caching, it's because
 * our allocation attempt failed at least once.
So, we must sleep 7263 * and let some progress happen before we try again. 7264 * 7265 * This function will sleep at least once waiting for new free space to 7266 * show up, and then it will check the block group free space numbers 7267 * for our min num_bytes. Another option is to have it go ahead 7268 * and look in the rbtree for a free extent of a given size, but this 7269 * is a good start. 7270 * 7271 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7272 * any of the information in this block group. 7273 */ 7274 static noinline void 7275 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7276 u64 num_bytes) 7277 { 7278 struct btrfs_caching_control *caching_ctl; 7279 7280 caching_ctl = get_caching_control(cache); 7281 if (!caching_ctl) 7282 return; 7283 7284 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7285 (cache->free_space_ctl->free_space >= num_bytes)); 7286 7287 put_caching_control(caching_ctl); 7288 } 7289 7290 static noinline int 7291 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7292 { 7293 struct btrfs_caching_control *caching_ctl; 7294 int ret = 0; 7295 7296 caching_ctl = get_caching_control(cache); 7297 if (!caching_ctl) 7298 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; 7299 7300 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7301 if (cache->cached == BTRFS_CACHE_ERROR) 7302 ret = -EIO; 7303 put_caching_control(caching_ctl); 7304 return ret; 7305 } 7306 7307 int __get_raid_index(u64 flags) 7308 { 7309 if (flags & BTRFS_BLOCK_GROUP_RAID10) 7310 return BTRFS_RAID_RAID10; 7311 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 7312 return BTRFS_RAID_RAID1; 7313 else if (flags & BTRFS_BLOCK_GROUP_DUP) 7314 return BTRFS_RAID_DUP; 7315 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 7316 return BTRFS_RAID_RAID0; 7317 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 7318 return BTRFS_RAID_RAID5; 7319 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 7320 return BTRFS_RAID_RAID6; 7321 7322 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 7323 } 7324 7325 int get_block_group_index(struct btrfs_block_group_cache *cache) 7326 { 7327 return __get_raid_index(cache->flags); 7328 } 7329 7330 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 7331 [BTRFS_RAID_RAID10] = "raid10", 7332 [BTRFS_RAID_RAID1] = "raid1", 7333 [BTRFS_RAID_DUP] = "dup", 7334 [BTRFS_RAID_RAID0] = "raid0", 7335 [BTRFS_RAID_SINGLE] = "single", 7336 [BTRFS_RAID_RAID5] = "raid5", 7337 [BTRFS_RAID_RAID6] = "raid6", 7338 }; 7339 7340 static const char *get_raid_name(enum btrfs_raid_types type) 7341 { 7342 if (type >= BTRFS_NR_RAID_TYPES) 7343 return NULL; 7344 7345 return btrfs_raid_type_names[type]; 7346 } 7347 7348 enum btrfs_loop_type { 7349 LOOP_CACHING_NOWAIT = 0, 7350 LOOP_CACHING_WAIT = 1, 7351 LOOP_ALLOC_CHUNK = 2, 7352 LOOP_NO_EMPTY_SIZE = 3, 7353 }; 7354 7355 static inline void 7356 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7357 int delalloc) 7358 { 7359 if (delalloc) 7360 down_read(&cache->data_rwsem); 7361 } 7362 7363 static inline void 7364 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7365 int delalloc) 7366 { 7367 btrfs_get_block_group(cache); 7368 if (delalloc) 7369 down_read(&cache->data_rwsem); 7370 } 7371 7372 static struct btrfs_block_group_cache * 7373 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7374 struct btrfs_free_cluster *cluster, 7375 int delalloc) 7376 { 7377 struct btrfs_block_group_cache *used_bg = NULL; 7378 7379 
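	/*
	 * Note the ordering here: data_rwsem must not be acquired under
	 * refill_lock, so if the trylock fails we drop refill_lock, take
	 * data_rwsem (nested read), retake refill_lock and re-check that
	 * the cluster still points at the same block group.
	 */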
	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
			return used_bg;

		btrfs_get_block_group(used_bg);

		if (!delalloc)
			return used_bg;

		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;

		spin_unlock(&cluster->refill_lock);

		/* We should only have one level of nesting. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}

static inline void
btrfs_release_block_group(struct btrfs_block_group_cache *cache,
			  int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->type == BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the currently available free space extent.
 */
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
				u64 hint_byte, struct btrfs_key *ins,
				u64 flags, int delalloc)
{
	int ret = 0;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_free_cluster *last_ptr = NULL;
	struct btrfs_block_group_cache *block_group = NULL;
	u64 search_start = 0;
	u64 max_extent_size = 0;
	u64 empty_cluster = 0;
	struct btrfs_space_info *space_info;
	int loop = 0;
	int index = __get_raid_index(flags);
	bool failed_cluster_refill = false;
	bool failed_alloc = false;
	bool use_cluster = true;
	bool have_caching_bg = false;
	bool orig_have_caching_bg = false;
	bool full_search = false;

	WARN_ON(num_bytes < fs_info->sectorsize);
	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);

	space_info = __find_space_info(fs_info, flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", flags);
		return -ENOSPC;
	}

	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
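	 *
	 * For example (hypothetical sizes): if an earlier search recorded a
	 * max_extent_size of 1M and we now ask for 4M, we return -ENOSPC
	 * with ins->offset set to 1M so the caller can retry with a smaller
	 * request.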
	 */
	if (unlikely(space_info->max_extent_size)) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
	if (last_ptr) {
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			hint_byte = last_ptr->window_start;
			use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	search_start = max(search_start, first_logical_byte(fs_info, 0));
	search_start = max(search_start, hint_byte);
	if (search_start == hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info, search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				index = get_block_group_index(block_group);
				btrfs_lock_block_group(block_group, delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	have_caching_bg = false;
	if (index == 0 || index == __get_raid_index(flags))
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group, &space_info->block_groups[index],
			    list) {
		u64 offset;
		int cached;

		btrfs_grab_block_group(block_group, delalloc);
		search_start = block_group->key.objectid;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1 |
				BTRFS_BLOCK_GROUP_RAID5 |
				BTRFS_BLOCK_GROUP_RAID6 |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail. This does allow us to
			 * fill raid0 from raid1.
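			 *
			 * (e.g. a request carrying the RAID1 bit must skip a
			 * plain RAID0 group, while a request with no
			 * redundancy bits may be satisfied from a RAID1
			 * group; hypothetical profiles for illustration.)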
			 */
			if ((flags & extra) && !(block_group->flags & extra))
				goto loop;
		}

have_block_group:
		cached = block_group_cache_done(block_group);
		if (unlikely(!cached)) {
			have_caching_bg = true;
			ret = cache_block_group(block_group, 0);
			BUG_ON(ret < 0);
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;
		if (unlikely(block_group->ro))
			goto loop;

		/*
		 * Ok we want to try and use the cluster allocator, so
		 * let's look there
		 */
		if (last_ptr && use_cluster) {
			struct btrfs_block_group_cache *used_block_group;
			unsigned long aligned_cluster;
			/*
			 * the refill lock keeps out other
			 * people trying to start a new cluster
			 */
			used_block_group = btrfs_lock_cluster(block_group,
							      last_ptr,
							      delalloc);
			if (!used_block_group)
				goto refill_cluster;

			if (used_block_group != block_group &&
			    (used_block_group->ro ||
			     !block_group_bits(used_block_group, flags)))
				goto release_cluster;

			offset = btrfs_alloc_from_cluster(used_block_group,
						last_ptr,
						num_bytes,
						used_block_group->key.objectid,
						&max_extent_size);
			if (offset) {
				/* we have a block, we're done */
				spin_unlock(&last_ptr->refill_lock);
				trace_btrfs_reserve_extent_cluster(fs_info,
						used_block_group,
						search_start, num_bytes);
				if (used_block_group != block_group) {
					btrfs_release_block_group(block_group,
								  delalloc);
					block_group = used_block_group;
				}
				goto checks;
			}

			WARN_ON(last_ptr->block_group != used_block_group);
release_cluster:
			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
			 * set up a new cluster, so let's just skip it
			 * and let the allocator find whatever block
			 * it can find. If we reach this point, we
			 * will have tried the cluster allocator
			 * plenty of times and not have found
			 * anything, so we are likely way too
			 * fragmented for the clustering stuff to find
			 * anything.
			 *
			 * However, if the cluster is taken from the
			 * current block group, release the cluster
			 * first, so that we stand a better chance of
			 * succeeding in the unclustered
			 * allocation.
			 */
			if (loop >= LOOP_NO_EMPTY_SIZE &&
			    used_block_group != block_group) {
				spin_unlock(&last_ptr->refill_lock);
				btrfs_release_block_group(used_block_group,
							  delalloc);
				goto unclustered_alloc;
			}

			/*
			 * this cluster didn't work out, free it and
			 * start over
			 */
			btrfs_return_cluster_to_free_space(NULL, last_ptr);

			if (used_block_group != block_group)
				btrfs_release_block_group(used_block_group,
							  delalloc);
refill_cluster:
			if (loop >= LOOP_NO_EMPTY_SIZE) {
				spin_unlock(&last_ptr->refill_lock);
				goto unclustered_alloc;
			}

			aligned_cluster = max_t(unsigned long,
						empty_cluster + empty_size,
						block_group->full_stripe_len);

			/* allocate a cluster in this block group */
			ret = btrfs_find_space_cluster(fs_info, block_group,
						       last_ptr, search_start,
						       num_bytes,
						       aligned_cluster);
			if (ret == 0) {
				/*
				 * now pull our allocation out of this
				 * cluster
				 */
				offset = btrfs_alloc_from_cluster(block_group,
							last_ptr,
							num_bytes,
							search_start,
							&max_extent_size);
				if (offset) {
					/* we found one, proceed */
					spin_unlock(&last_ptr->refill_lock);
					trace_btrfs_reserve_extent_cluster(fs_info,
						block_group, search_start,
						num_bytes);
					goto checks;
				}
			} else if (!cached && loop > LOOP_CACHING_NOWAIT
				   && !failed_cluster_refill) {
				spin_unlock(&last_ptr->refill_lock);

				failed_cluster_refill = true;
				wait_block_group_cache_progress(block_group,
				       num_bytes + empty_cluster + empty_size);
				goto have_block_group;
			}

			/*
			 * at this point we either didn't find a cluster
			 * or we weren't able to allocate a block from our
			 * cluster.  Free the cluster we've been trying
			 * to use, and go to the next block group
			 */
			btrfs_return_cluster_to_free_space(NULL, last_ptr);
			spin_unlock(&last_ptr->refill_lock);
			goto loop;
		}

unclustered_alloc:
		/*
		 * We are doing an unclustered alloc, set the fragmented flag so
		 * we don't bother trying to set up a cluster again until we get
		 * more space.
		 */
		if (unlikely(last_ptr)) {
			spin_lock(&last_ptr->lock);
			last_ptr->fragmented = 1;
			spin_unlock(&last_ptr->lock);
		}
		if (cached) {
			struct btrfs_free_space_ctl *ctl =
				block_group->free_space_ctl;

			spin_lock(&ctl->tree_lock);
			if (ctl->free_space <
			    num_bytes + empty_cluster + empty_size) {
				if (ctl->free_space > max_extent_size)
					max_extent_size = ctl->free_space;
				spin_unlock(&ctl->tree_lock);
				goto loop;
			}
			spin_unlock(&ctl->tree_lock);
		}

		offset = btrfs_find_space_for_alloc(block_group, search_start,
						    num_bytes, empty_size,
						    &max_extent_size);
		/*
		 * If we didn't find a chunk, and we haven't failed on this
		 * block group before, and this block group is in the middle of
		 * caching and we are ok with waiting, then go ahead and wait
		 * for progress to be made, and set failed_alloc to true.
		 *
		 * If failed_alloc is true then we've already waited on this
		 * block group once and should move on to the next block group.
		 */
		if (!offset && !failed_alloc && !cached &&
		    loop > LOOP_CACHING_NOWAIT) {
			wait_block_group_cache_progress(block_group,
						num_bytes + empty_size);
			failed_alloc = true;
			goto have_block_group;
		} else if (!offset) {
			goto loop;
		}
checks:
		search_start = ALIGN(offset, fs_info->stripesize);

		/* move on to the next group */
		if (search_start + num_bytes >
		    block_group->key.objectid + block_group->key.offset) {
			btrfs_add_free_space(block_group, offset, num_bytes);
			goto loop;
		}

		if (offset < search_start)
			btrfs_add_free_space(block_group, offset,
					     search_start - offset);
		BUG_ON(offset > search_start);

		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				num_bytes, delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space(block_group, offset, num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, let's return */
		ins->objectid = search_start;
		ins->offset = num_bytes;

		trace_btrfs_reserve_extent(fs_info, block_group,
					   search_start, num_bytes);
		btrfs_release_block_group(block_group, delalloc);
		break;
loop:
		failed_cluster_refill = false;
		failed_alloc = false;
		BUG_ON(index != get_block_group_index(block_group));
		btrfs_release_block_group(block_group, delalloc);
	}
	up_read(&space_info->groups_sem);

	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
		&& !orig_have_caching_bg)
		orig_have_caching_bg = true;

	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
		goto search;

	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
		goto search;

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
		index = 0;
		if (loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (orig_have_caching_bg || !full_search)
				loop = LOOP_CACHING_WAIT;
			else
				loop = LOOP_ALLOC_CHUNK;
		} else {
			loop++;
		}

		if (loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}

			ret = do_chunk_alloc(trans, fs_info, flags,
					     CHUNK_ALLOC_FORCE);

			/*
			 * If we can't allocate a new chunk we've already looped
			 * through at least once, move on to the NO_EMPTY_SIZE
			 * case.
			 */
			if (ret == -ENOSPC)
				loop = LOOP_NO_EMPTY_SIZE;

			/*
			 * Do not bail out on ENOSPC since we
			 * can do more things.
			 */
			if (ret < 0 && ret != -ENOSPC)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				goto out;
		}

		if (loop == LOOP_NO_EMPTY_SIZE) {
			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (empty_size == 0 &&
			    empty_cluster == 0) {
				ret = -ENOSPC;
				goto out;
			}
			empty_size = 0;
			empty_cluster = 0;
		}

		goto search;
	} else if (!ins->objectid) {
		ret = -ENOSPC;
	} else if (ins->objectid) {
		if (!use_cluster && last_ptr) {
			spin_lock(&last_ptr->lock);
			last_ptr->window_start = ins->objectid;
			spin_unlock(&last_ptr->lock);
		}
		ret = 0;
	}
out:
	if (ret == -ENOSPC) {
		spin_lock(&space_info->lock);
		space_info->max_extent_size = max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = max_extent_size;
	}
	return ret;
}

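/*
 * Print the usage counters of a space_info and, when dump_block_groups is
 * set, of every block group in it, one raid index at a time.  This is only
 * used for ENOSPC debugging; the counters are read under the respective
 * locks but may be stale by the time they hit the log.
 */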
"[readonly]" : ""); 7941 btrfs_dump_free_space(cache, bytes); 7942 spin_unlock(&cache->lock); 7943 } 7944 if (++index < BTRFS_NR_RAID_TYPES) 7945 goto again; 7946 up_read(&info->groups_sem); 7947 } 7948 7949 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 7950 u64 num_bytes, u64 min_alloc_size, 7951 u64 empty_size, u64 hint_byte, 7952 struct btrfs_key *ins, int is_data, int delalloc) 7953 { 7954 struct btrfs_fs_info *fs_info = root->fs_info; 7955 bool final_tried = num_bytes == min_alloc_size; 7956 u64 flags; 7957 int ret; 7958 7959 flags = btrfs_get_alloc_profile(root, is_data); 7960 again: 7961 WARN_ON(num_bytes < fs_info->sectorsize); 7962 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 7963 hint_byte, ins, flags, delalloc); 7964 if (!ret && !is_data) { 7965 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 7966 } else if (ret == -ENOSPC) { 7967 if (!final_tried && ins->offset) { 7968 num_bytes = min(num_bytes >> 1, ins->offset); 7969 num_bytes = round_down(num_bytes, 7970 fs_info->sectorsize); 7971 num_bytes = max(num_bytes, min_alloc_size); 7972 ram_bytes = num_bytes; 7973 if (num_bytes == min_alloc_size) 7974 final_tried = true; 7975 goto again; 7976 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 7977 struct btrfs_space_info *sinfo; 7978 7979 sinfo = __find_space_info(fs_info, flags); 7980 btrfs_err(fs_info, 7981 "allocation failed flags %llu, wanted %llu", 7982 flags, num_bytes); 7983 if (sinfo) 7984 dump_space_info(fs_info, sinfo, num_bytes, 1); 7985 } 7986 } 7987 7988 return ret; 7989 } 7990 7991 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 7992 u64 start, u64 len, 7993 int pin, int delalloc) 7994 { 7995 struct btrfs_block_group_cache *cache; 7996 int ret = 0; 7997 7998 cache = btrfs_lookup_block_group(fs_info, start); 7999 if (!cache) { 8000 btrfs_err(fs_info, "Unable to find block group for %llu", 8001 start); 8002 return -ENOSPC; 8003 } 8004 8005 if (pin) 8006 pin_down_extent(fs_info, cache, start, len, 1); 8007 else { 8008 if (btrfs_test_opt(fs_info, DISCARD)) 8009 ret = btrfs_discard_extent(fs_info, start, len, NULL); 8010 btrfs_add_free_space(cache, start, len); 8011 btrfs_free_reserved_bytes(cache, len, delalloc); 8012 trace_btrfs_reserved_extent_free(fs_info, start, len); 8013 } 8014 8015 btrfs_put_block_group(cache); 8016 return ret; 8017 } 8018 8019 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8020 u64 start, u64 len, int delalloc) 8021 { 8022 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 8023 } 8024 8025 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 8026 u64 start, u64 len) 8027 { 8028 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 8029 } 8030 8031 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8032 struct btrfs_fs_info *fs_info, 8033 u64 parent, u64 root_objectid, 8034 u64 flags, u64 owner, u64 offset, 8035 struct btrfs_key *ins, int ref_mod) 8036 { 8037 int ret; 8038 struct btrfs_extent_item *extent_item; 8039 struct btrfs_extent_inline_ref *iref; 8040 struct btrfs_path *path; 8041 struct extent_buffer *leaf; 8042 int type; 8043 u32 size; 8044 8045 if (parent > 0) 8046 type = BTRFS_SHARED_DATA_REF_KEY; 8047 else 8048 type = BTRFS_EXTENT_DATA_REF_KEY; 8049 8050 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8051 8052 path = btrfs_alloc_path(); 8053 if (!path) 8054 return -ENOMEM; 8055 8056 path->leave_spinning = 1; 8057 ret = btrfs_insert_empty_item(trans, 
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;

	flags = btrfs_get_alloc_profile(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);
	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = __find_space_info(fs_info, flags);
			btrfs_err(fs_info,
				  "allocation failed flags %llu, wanted %llu",
				  flags, num_bytes);
			if (sinfo)
				dump_space_info(fs_info, sinfo, num_bytes, 1);
		}
	}

	return ret;
}

static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
					u64 start, u64 len,
					int pin, int delalloc)
{
	struct btrfs_block_group_cache *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	if (pin)
		pin_down_extent(fs_info, cache, start, len, 1);
	else {
		if (btrfs_test_opt(fs_info, DISCARD))
			ret = btrfs_discard_extent(fs_info, start, len, NULL);
		btrfs_add_free_space(cache, start, len);
		btrfs_free_reserved_bytes(cache, len, delalloc);
		trace_btrfs_reserved_extent_free(fs_info, start, len);
	}

	btrfs_put_block_group(cache);
	return ret;
}

int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
}

int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
				       u64 start, u64 len)
{
	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
}

static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
					  ins->offset);
	if (ret)
		return ret;

	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			  ins->objectid, ins->offset);
		BUG();
	}
	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
	return ret;
}

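/*
 * Insert the extent item for a freshly allocated tree block.  With the
 * SKINNY_METADATA incompat feature the key already describes the block
 * (ins->offset carries the level rather than the byte size), so no
 * btrfs_tree_block_info is stored and the item is smaller; otherwise the
 * block_info (key + level) sits between the extent item and the inline ref.
 */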
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins)
{
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes = ins->offset;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	if (!skinny_metadata)
		size += sizeof(*block_info);

	path = btrfs_alloc_path();
	if (!path) {
		btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
						   fs_info->nodesize);
		return -ENOMEM;
	}

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
						   fs_info->nodesize);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
		num_bytes = fs_info->nodesize;
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, key);
		btrfs_set_tree_block_level(leaf, block_info, level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (parent > 0) {
		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = update_block_group(trans, fs_info, ins->objectid,
				 fs_info->nodesize, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			  ins->objectid, ins->offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
					  fs_info->nodesize);
	return ret;
}

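/*
 * Used when a new data extent has been reserved at write time: nothing is
 * inserted into the extent tree here, we only queue a delayed data ref and
 * let the delayed ref code add the extent item when the refs are run.
 */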
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     u64 root_objectid, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);

	ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
					 ins->offset, 0,
					 root_objectid, owner, offset,
					 ram_bytes, BTRFS_ADD_DELAYED_EXTENT);
	return ret;
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	int ret;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
					 0, owner, offset, ins, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return buf;

	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
	btrfs_tree_lock(buf);
	clean_tree_block(fs_info, buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);

	btrfs_set_lock_blocking(buf);
	set_extent_buffer_uptodate(buf);

	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bits to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					 buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
				       buf->start + buf->len - 1);
	} else {
		buf->log_index = -1;
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
				 buf->start + buf->len - 1, GFP_NOFS);
	}
	trans->dirty = true;
	/* this returns a buffer locked for blocking */
	return buf;
}

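/*
 * Pick the block reserve that should back a new tree block and charge one
 * blocksize to it.  The fallbacks are tried in order: the root's own
 * reserve, a refreshed global reserve (once), a direct NO_FLUSH
 * reservation, and finally the global reserve itself when it accounts
 * against the same space_info.
 */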
static struct btrfs_block_rsv *
use_block_rsv(struct btrfs_trans_handle *trans,
	      struct btrfs_root *root, u32 blocksize)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *block_rsv;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool global_updated = false;

	block_rsv = get_block_rsv(trans, root);

	if (unlikely(block_rsv->size == 0))
		goto try_reserve;
again:
	ret = block_rsv_use_bytes(block_rsv, blocksize);
	if (!ret)
		return block_rsv;

	if (block_rsv->failfast)
		return ERR_PTR(ret);

	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
		global_updated = true;
		update_global_block_rsv(fs_info);
		goto again;
	}

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		static DEFINE_RATELIMIT_STATE(_rs,
				DEFAULT_RATELIMIT_INTERVAL * 10,
				/*DEFAULT_RATELIMIT_BURST*/ 1);
		if (__ratelimit(&_rs))
			WARN(1, KERN_DEBUG
				"BTRFS: block rsv returned %d\n", ret);
	}
try_reserve:
	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
				     BTRFS_RESERVE_NO_FLUSH);
	if (!ret)
		return block_rsv;
	/*
	 * If we couldn't reserve metadata bytes try and use some from
	 * the global reserve if its space type is the same as the global
	 * reservation.
	 */
	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
	    block_rsv->space_info == global_rsv->space_info) {
		ret = block_rsv_use_bytes(global_rsv, blocksize);
		if (!ret)
			return global_rsv;
	}
	return ERR_PTR(ret);
}

static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
	block_rsv_add_bytes(block_rsv, blocksize, 0);
	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	struct btrfs_delayed_extent_op *extent_op;
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		ret = btrfs_add_delayed_tree_ref(fs_info, trans,
						 ins.objectid, ins.offset,
						 parent, root_objectid, level,
						 BTRFS_ADD_DELAYED_EXTENT,
						 extent_op);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}

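/*
 * State for walking down and freeing a subvolume tree.  The walk normally
 * runs in the DROP_REFERENCE stage, dropping one ref per block; when it
 * finds a shared subtree whose backrefs must be converted first, it
 * switches to UPDATE_BACKREF for that subtree (tracked via shared_level
 * and update_progress) and then falls back to dropping refs.
 */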
struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int for_reloc;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

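/*
 * Read ahead the blocks pointed to by the node we are walking down.  The
 * readahead window shrinks to 2/3 (at least 2) while the walk is still
 * behind the last window and grows to 3/2 (at most one full node worth of
 * pointers) once it has caught up.
 */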
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		readahead_tree_block(fs_info, bytenr);
		nread++;
	}
	wc->reada_slot = slot;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again.  once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
						  eb->len, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need to update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update backrefs, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	u32 blocksize;
	struct btrfs_key key;
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	blocksize = fs_info->nodesize;

	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr);
		if (IS_ERR(next))
			return PTR_ERR(next);

		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
					       level - 1);
		reada = 1;
	}
	btrfs_tree_lock(next);
	btrfs_set_lock_blocking(next);

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, generation);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
		btrfs_set_lock_blocking(next);
	}

	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
					  "mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		if (need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, root, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}
		ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
					parent, root->root_key.objectid,
					level - 1, 0);
		if (ret)
			goto out_unlock;
	}

	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}

/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "error %d accounting leaf items. Quota is out of sync, rescan required.",
					     ret);
			}
		}
		/* make block locked assertion in clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		}
		clean_tree_block(fs_info, eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(eb));
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(path->nodes[level + 1]));
	}

	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;
}

static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root,
			struct btrfs_block_rsv *block_rsv, int update_ref,
			int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	if (block_rsv)
		trans->block_rsv = block_rsv;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		btrfs_set_lock_blocking(path->nodes[level]);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			btrfs_set_lock_blocking(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == root_item->drop_level)
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->for_reloc = for_reloc;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			level = wc->level;
			btrfs_node_key(path->nodes[level],
				       &root_item->drop_progress,
				       path->slots[level]);
			root_item->drop_level = level;
		}

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
			if (block_rsv)
				trans->block_rsv = block_rsv;
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, tree_root, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
		btrfs_add_dropped_root(trans, root);
	} else {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		btrfs_put_fs_root(root);
	}
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && root_dropped == false)
		btrfs_add_dead_root(root);
	if (err && err != -EAGAIN)
		btrfs_handle_fs_error(fs_info, err, NULL);
	return err;
}

/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	extent_buffer_get(parent);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->for_reloc = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

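/*
 * Pick the profile a block group's chunk should be rewritten with.  On a
 * single writable device raid0 degrades to single and raid1/raid10 to dup;
 * with multiple devices dup is upgraded to raid1 and anything that already
 * has a raid profile is kept as is.
 */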
static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices;
	u64 stripped;

	/*
	 * if restripe for this chunk_type is on pick target profile and
	 * return, otherwise do the usual balance
	 */
	stripped = get_restripe_target(fs_info, flags);
	if (stripped)
		return extended_to_chunk(stripped);

	num_devices = fs_info->fs_devices->rw_devices;

	stripped = BTRFS_BLOCK_GROUP_RAID0 |
		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;

	if (num_devices == 1) {
		stripped |= BTRFS_BLOCK_GROUP_DUP;
		stripped = flags & ~stripped;

		/* turn raid0 into single device chunks */
		if (flags & BTRFS_BLOCK_GROUP_RAID0)
			return stripped;

		/* turn mirroring into duplication */
		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
			     BTRFS_BLOCK_GROUP_RAID10))
			return stripped | BTRFS_BLOCK_GROUP_DUP;
	} else {
		/* they already had raid on here, just return */
		if (flags & stripped)
			return flags;

		stripped |= BTRFS_BLOCK_GROUP_DUP;
		stripped = flags & ~stripped;

		/* switch duplicated blocks with raid1 */
		if (flags & BTRFS_BLOCK_GROUP_DUP)
			return stripped | BTRFS_BLOCK_GROUP_RAID1;

		/* this is drive concat, leave it alone */
	}

	return flags;
}

static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
	u64 min_allocable_bytes;
	int ret = -ENOSPC;

	/*
	 * We need some metadata space and system metadata space for
	 * allocating chunks in some corner cases, so keep some slack
	 * unless we are forced to set the block group read-only.
	 */
	if ((sinfo->flags &
	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
	    !force)
		min_allocable_bytes = SZ_1M;
	else
		min_allocable_bytes = 0;

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);

	if (cache->ro) {
		cache->ro++;
		ret = 0;
		goto out;
	}

	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
		    cache->bytes_super - btrfs_block_group_used(&cache->item);

	if (btrfs_space_info_used(sinfo, true) + num_bytes +
	    min_allocable_bytes <= sinfo->total_bytes) {
		sinfo->bytes_readonly += num_bytes;
		cache->ro++;
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
		ret = 0;
	}
out:
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	return ret;
}

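/*
 * Set a block group read-only (callers such as relocation and scrub rely
 * on this).  If flipping it directly would leave the space_info without
 * enough room, force-allocate a new chunk and retry once; system chunk
 * space is topped up separately at the end.
 */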
int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
			     struct btrfs_block_group_cache *cache)

{
	struct btrfs_trans_handle *trans;
	u64 alloc_flags;
	int ret;

again:
	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * we're not allowed to set block groups readonly after the dirty
	 * block groups cache has started writing.  If it already started,
	 * back off and let this transaction commit
	 */
	mutex_lock(&fs_info->ro_block_group_mutex);
	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
		u64 transid = trans->transid;

		mutex_unlock(&fs_info->ro_block_group_mutex);
		btrfs_end_transaction(trans);

		ret = btrfs_wait_for_commit(fs_info, transid);
		if (ret)
			return ret;
		goto again;
	}

	/*
	 * if we are changing raid levels, try to allocate a corresponding
	 * block group with the new raid level.
	 */
	alloc_flags = update_block_group_flags(fs_info, cache->flags);
	if (alloc_flags != cache->flags) {
		ret = do_chunk_alloc(trans, fs_info, alloc_flags,
				     CHUNK_ALLOC_FORCE);
		/*
		 * ENOSPC is allowed here, we may have enough space
		 * already allocated at the new raid level to
		 * carry on
		 */
		if (ret == -ENOSPC)
			ret = 0;
		if (ret < 0)
			goto out;
	}

	ret = inc_block_group_ro(cache, 0);
	if (!ret)
		goto out;
	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
	ret = do_chunk_alloc(trans, fs_info, alloc_flags,
			     CHUNK_ALLOC_FORCE);
	if (ret < 0)
		goto out;
	ret = inc_block_group_ro(cache, 0);
out:
	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
		alloc_flags = update_block_group_flags(fs_info, cache->flags);
		mutex_lock(&fs_info->chunk_mutex);
		check_system_chunk(trans, fs_info, alloc_flags);
		mutex_unlock(&fs_info->chunk_mutex);
	}
	mutex_unlock(&fs_info->ro_block_group_mutex);

	btrfs_end_transaction(trans);
	return ret;
}

int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
			    struct btrfs_fs_info *fs_info, u64 type)
{
	u64 alloc_flags = get_alloc_profile(fs_info, type);

	return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
}

/*
 * helper to account the unused space of all the readonly block groups in the
 * space_info. takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group_cache *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
					  BTRFS_BLOCK_GROUP_RAID10 |
					  BTRFS_BLOCK_GROUP_DUP))
			factor = 2;
		else
			factor = 1;

		free_bytes += (block_group->key.offset -
			       btrfs_block_group_used(&block_group->item)) *
			       factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
{
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;

	BUG_ON(!cache->ro);

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);
	if (!--cache->ro) {
		num_bytes = cache->key.offset - cache->reserved -
			    cache->pinned - cache->bytes_super -
			    btrfs_block_group_used(&cache->item);
		sinfo->bytes_readonly -= num_bytes;
		list_del_init(&cache->ro_list);
	}
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
}

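/*
 * Note for the per-device checks below: min_free is scaled by the profile
 * before being compared against free device space.  raid10 stripes across
 * mirror pairs, so half the used bytes must fit on each of at least 4
 * devices; dup needs twice the bytes on one device; raid0 spreads the
 * requirement evenly across all writable devices.
 */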

static int find_first_block_group(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  struct btrfs_key *key)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret = 0;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;

	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;
			break;
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid >= key->objectid &&
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			struct extent_map_tree *em_tree;
			struct extent_map *em;

			em_tree = &root->fs_info->mapping_tree.map_tree;
			read_lock(&em_tree->lock);
			em = lookup_extent_mapping(em_tree, found_key.objectid,
						   found_key.offset);
			read_unlock(&em_tree->lock);
			if (!em) {
				btrfs_err(fs_info,
			"logical %llu len %llu found bg but no related chunk",
					  found_key.objectid, found_key.offset);
				ret = -ENOENT;
			} else {
				ret = 0;
			}
			free_extent_map(em);
			goto out;
		}
		path->slots[0]++;
	}
out:
	return ret;
}

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
		while (block_group) {
			spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = next_block_group(info, block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}

		inode = block_group->inode;
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		ASSERT(block_group->io_ctl.inode == NULL);
		iput(inode);
		last = block_group->key.objectid + block_group->key.offset;
		btrfs_put_block_group(block_group);
	}
}

/*
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	down_write(&info->commit_root_sem);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		put_caching_control(caching_ctl);
	}
	up_write(&info->commit_root_sem);

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		block_group = list_first_entry(&info->unused_bgs,
					       struct btrfs_block_group_cache,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
		RB_CLEAR_NODE(&block_group->cache_node);
		spin_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO ||
		    block_group->cached == BTRFS_CACHE_ERROR)
			free_excluded_extents(info, block_group);

		btrfs_remove_free_space_cache(block_group);
		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
		ASSERT(list_empty(&block_group->dirty_list));
		ASSERT(list_empty(&block_group->io_list));
		ASSERT(list_empty(&block_group->bg_list));
		ASSERT(atomic_read(&block_group->count) == 1);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/* now that all the block groups are freed, go through and
	 * free all the space_info structs.  This is only called during
	 * the final stages of unmount, and so we know nobody is
	 * using them.  We call synchronize_rcu() once before we start,
	 * just to be on the safe side.
	 */
	synchronize_rcu();

	release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		int i;

		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);

		/*
		 * Do not hide this behind enospc_debug, this is actually
		 * important and indicates a real bug if this happens.
		 */
		if (WARN_ON(space_info->bytes_pinned > 0 ||
			    space_info->bytes_reserved > 0 ||
			    space_info->bytes_may_use > 0))
			dump_space_info(info, space_info, 0, 0);
		list_del(&space_info->list);
		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
			struct kobject *kobj;
			kobj = space_info->block_group_kobjs[i];
			space_info->block_group_kobjs[i] = NULL;
			if (kobj) {
				kobject_del(kobj);
				kobject_put(kobj);
			}
		}
		kobject_del(&space_info->kobj);
		kobject_put(&space_info->kobj);
	}
	return 0;
}
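
/*
 * Teardown order in btrfs_free_block_groups() above, summarized: drop any
 * leftover caching controls, drain the unused_bgs list, then walk the rbtree
 * releasing each block group (the ASSERT()s document the unmount-time
 * invariant that exactly one reference remains), and finally free every
 * space_info once all block groups are gone.
 */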

static void __link_block_group(struct btrfs_space_info *space_info,
			       struct btrfs_block_group_cache *cache)
{
	int index = get_block_group_index(cache);
	bool first = false;

	down_write(&space_info->groups_sem);
	if (list_empty(&space_info->block_groups[index]))
		first = true;
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);

	if (first) {
		struct raid_kobject *rkobj;
		int ret;

		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
		if (!rkobj)
			goto out_err;
		rkobj->raid_type = index;
		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
				  "%s", get_raid_name(index));
		if (ret) {
			kobject_put(&rkobj->kobj);
			goto out_err;
		}
		space_info->block_group_kobjs[index] = &rkobj->kobj;
	}

	return;
out_err:
	btrfs_warn(cache->fs_info,
		   "failed to add kobject for block cache, ignoring");
}
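
/*
 * Note on __link_block_group() above: the first block group of each raid
 * type hooks a raid_kobject under the space_info's kobject, which (assuming
 * the usual btrfs sysfs layout) appears as e.g.
 * /sys/fs/btrfs/<fsid>/allocation/data/raid1.  kobject_put() is the correct
 * cleanup after a failed kobject_add(), because kobject_init() has already
 * taken the initial reference.
 */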

static struct btrfs_block_group_cache *
btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
			       u64 start, u64 size)
{
	struct btrfs_block_group_cache *cache;

	cache = kzalloc(sizeof(*cache), GFP_NOFS);
	if (!cache)
		return NULL;

	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
					GFP_NOFS);
	if (!cache->free_space_ctl) {
		kfree(cache);
		return NULL;
	}

	cache->key.objectid = start;
	cache->key.offset = size;
	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;

	cache->sectorsize = fs_info->sectorsize;
	cache->fs_info = fs_info;
	cache->full_stripe_len = btrfs_full_stripe_len(fs_info,
						       &fs_info->mapping_tree,
						       start);
	set_free_space_tree_thresholds(cache);

	atomic_set(&cache->count, 1);
	spin_lock_init(&cache->lock);
	init_rwsem(&cache->data_rwsem);
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);
	INIT_LIST_HEAD(&cache->bg_list);
	INIT_LIST_HEAD(&cache->ro_list);
	INIT_LIST_HEAD(&cache->dirty_list);
	INIT_LIST_HEAD(&cache->io_list);
	btrfs_init_free_space_ctl(cache);
	atomic_set(&cache->trimming, 0);
	mutex_init(&cache->free_space_lock);
	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);

	return cache;
}
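
/*
 * Usage sketch for btrfs_create_block_group_cache() above (a minimal
 * illustration mirroring the callers later in this file; some_setup_fails
 * is a placeholder): the returned cache starts with count == 1, so every
 * error path must drop that reference with btrfs_put_block_group():
 *
 *	cache = btrfs_create_block_group_cache(fs_info, start, size);
 *	if (!cache)
 *		return -ENOMEM;
 *	if (some_setup_fails) {
 *		btrfs_put_block_group(cache); // frees cache + free_space_ctl
 *		return ret;
 *	}
 */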

int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_block_group_cache *cache;
	struct btrfs_space_info *space_info;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int need_clear = 0;
	u64 cache_gen;
	u64 feature;
	int mixed;

	feature = btrfs_super_incompat_flags(info->super_copy);
	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);

	key.objectid = 0;
	key.offset = 0;
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	cache_gen = btrfs_super_cache_generation(info->super_copy);
	if (btrfs_test_opt(info, SPACE_CACHE) &&
	    btrfs_super_generation(info->super_copy) != cache_gen)
		need_clear = 1;
	if (btrfs_test_opt(info, CLEAR_CACHE))
		need_clear = 1;

	while (1) {
		ret = find_first_block_group(info, path, &key);
		if (ret > 0)
			break;
		if (ret != 0)
			goto error;

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		cache = btrfs_create_block_group_cache(info, found_key.objectid,
						       found_key.offset);
		if (!cache) {
			ret = -ENOMEM;
			goto error;
		}

		if (need_clear) {
			/*
			 * When we mount with old space cache, we need to
			 * set BTRFS_DC_CLEAR and set dirty flag.
			 *
			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
			 *    truncate the old free space cache inode and
			 *    setup a new one.
			 * b) Setting 'dirty flag' makes sure that we flush
			 *    the new space cache info onto disk.
			 */
			if (btrfs_test_opt(info, SPACE_CACHE))
				cache->disk_cache_state = BTRFS_DC_CLEAR;
		}

		read_extent_buffer(leaf, &cache->item,
				   btrfs_item_ptr_offset(leaf, path->slots[0]),
				   sizeof(cache->item));
		cache->flags = btrfs_block_group_flags(&cache->item);
		if (!mixed &&
		    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
		     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
			btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
				  cache->key.objectid);
			ret = -EINVAL;
			goto error;
		}

		key.objectid = found_key.objectid + found_key.offset;
		btrfs_release_path(path);

		/*
		 * We need to exclude the super stripes now so that the space
		 * info has super bytes accounted for, otherwise we'll think
		 * we have more space than we actually do.
		 */
		ret = exclude_super_stripes(info, cache);
		if (ret) {
			/*
			 * We may have excluded something, so call this just in
			 * case.
			 */
			free_excluded_extents(info, cache);
			btrfs_put_block_group(cache);
			goto error;
		}

		/*
		 * check for two cases, either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space, or we are empty, and we can just add all
		 * the space in and be done with it.  This saves us a lot of
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			free_excluded_extents(info, cache);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			cache->last_byte_to_unpin = (u64)-1;
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, info,
					   found_key.objectid,
					   found_key.objectid +
					   found_key.offset);
			free_excluded_extents(info, cache);
		}

		ret = btrfs_add_block_group_cache(info, cache);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			btrfs_put_block_group(cache);
			goto error;
		}

		trace_btrfs_add_block_group(info, cache, 0);
		ret = update_space_info(info, cache->flags, found_key.offset,
					btrfs_block_group_used(&cache->item),
					cache->bytes_super, &space_info);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			spin_lock(&info->block_group_cache_lock);
			rb_erase(&cache->cache_node,
				 &info->block_group_cache_tree);
			RB_CLEAR_NODE(&cache->cache_node);
			spin_unlock(&info->block_group_cache_lock);
			btrfs_put_block_group(cache);
			goto error;
		}

		cache->space_info = space_info;

		__link_block_group(space_info, cache);

		set_avail_alloc_bits(info, cache->flags);
		if (btrfs_chunk_readonly(info, cache->key.objectid)) {
			inc_block_group_ro(cache, 1);
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			spin_lock(&info->unused_bgs_lock);
			/* Should always be true but just in case. */
			if (list_empty(&cache->bg_list)) {
				btrfs_get_block_group(cache);
				list_add_tail(&cache->bg_list,
					      &info->unused_bgs);
			}
			spin_unlock(&info->unused_bgs_lock);
		}
	}

	list_for_each_entry_rcu(space_info, &info->space_info, list) {
		if (!(get_alloc_profile(info, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_RAID5 |
		       BTRFS_BLOCK_GROUP_RAID6 |
		       BTRFS_BLOCK_GROUP_DUP)))
			continue;
		/*
		 * avoid allocating from un-mirrored block group if there are
		 * mirrored block groups.
		 */
		list_for_each_entry(cache,
				&space_info->block_groups[BTRFS_RAID_RAID0],
				list)
			inc_block_group_ro(cache, 1);
		list_for_each_entry(cache,
				&space_info->block_groups[BTRFS_RAID_SINGLE],
				list)
			inc_block_group_ro(cache, 1);
	}

	init_global_block_rsv(info);
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				       struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group_cache *block_group, *tmp;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_block_group_item item;
	struct btrfs_key key;
	int ret = 0;
	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;

	trans->can_flush_pending_bgs = false;
	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
		if (ret)
			goto next;

		spin_lock(&block_group->lock);
		memcpy(&item, &block_group->item, sizeof(item));
		memcpy(&key, &block_group->key, sizeof(key));
		spin_unlock(&block_group->lock);

		ret = btrfs_insert_item(trans, extent_root, &key, &item,
					sizeof(item));
		if (ret)
			btrfs_abort_transaction(trans, ret);
		ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
					       key.offset);
		if (ret)
			btrfs_abort_transaction(trans, ret);
		add_block_group_free_space(trans, fs_info, block_group);
		/* already aborted the transaction if it failed. */
next:
		list_del_init(&block_group->bg_list);
	}
	trans->can_flush_pending_bgs = can_flush_pending_bgs;
}

int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   struct btrfs_fs_info *fs_info, u64 bytes_used,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size)
{
	struct btrfs_block_group_cache *cache;
	int ret;

	btrfs_set_log_full_commit(fs_info, trans);

	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
	if (!cache)
		return -ENOMEM;

	btrfs_set_block_group_used(&cache->item, bytes_used);
	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
	btrfs_set_block_group_flags(&cache->item, type);

	cache->flags = type;
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->needs_free_space = 1;
	ret = exclude_super_stripes(fs_info, cache);
	if (ret) {
		/*
		 * We may have excluded something, so call this just in
		 * case.
		 */
		free_excluded_extents(fs_info, cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);

	free_excluded_extents(fs_info, cache);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(cache)) {
		u64 new_bytes_used = size - bytes_used;

		bytes_used += new_bytes_used >> 1;
		fragment_free_space(cache);
	}
#endif
	/*
	 * Call to ensure the corresponding space_info object is created and
	 * assigned to our block group, but don't update its counters just yet.
	 * We want our bg to be added to the rbtree with its ->space_info set.
	 */
	ret = update_space_info(fs_info, cache->flags, 0, 0, 0,
				&cache->space_info);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	ret = btrfs_add_block_group_cache(fs_info, cache);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	/*
	 * Now that our block group has its ->space_info set and is inserted in
	 * the rbtree, update the space info's counters.
	 */
	trace_btrfs_add_block_group(fs_info, cache, 1);
	ret = update_space_info(fs_info, cache->flags, size, bytes_used,
				cache->bytes_super, &cache->space_info);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		spin_lock(&fs_info->block_group_cache_lock);
		rb_erase(&cache->cache_node,
			 &fs_info->block_group_cache_tree);
		RB_CLEAR_NODE(&cache->cache_node);
		spin_unlock(&fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return ret;
	}
	update_global_block_rsv(fs_info);

	__link_block_group(cache->space_info, cache);

	list_add_tail(&cache->bg_list, &trans->new_bgs);

	set_avail_alloc_bits(fs_info, type);
	return 0;
}

static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}
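
/*
 * The avail_*_alloc_bits words cleared above are published under
 * fs_info->profiles_lock, a seqlock.  A minimal read-side sketch (the
 * standard seqlock retry pattern; get_alloc_profile() elsewhere in this
 * file reads these fields the same way):
 *
 *	unsigned int seq;
 *	u64 avail;
 *
 *	do {
 *		seq = read_seqbegin(&fs_info->profiles_lock);
 *		avail = fs_info->avail_data_alloc_bits;
 *	} while (read_seqretry(&fs_info->profiles_lock, seq));
 */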

int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 group_start,
			     struct extent_map *em)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_key key;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;

	block_group = btrfs_lookup_block_group(fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	free_excluded_extents(fs_info, block_group);

	memcpy(&key, &block_group->key, sizeof(key));
	index = get_block_group_index(block_group);
	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
				  BTRFS_BLOCK_GROUP_RAID1 |
				  BTRFS_BLOCK_GROUP_RAID10))
		factor = 2;
	else
		factor = 1;

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(fs_info, block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(trans, block_group, path);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	if (!IS_ERR(inode)) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
		if (ret) {
			btrfs_add_delayed_iput(inode);
			goto out;
		}
		clear_nlink(inode);
		/* One for the block groups ref */
		spin_lock(&block_group->lock);
		if (block_group->iref) {
			block_group->iref = 0;
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		/* One for our lookup ref */
		btrfs_add_delayed_iput(inode);
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0)
		btrfs_release_path(path);
	if (ret == 0) {
		ret = btrfs_del_item(trans, tree_root, path);
		if (ret)
			goto out;
		btrfs_release_path(path);
	}

	spin_lock(&fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	if (fs_info->first_logical_byte == block_group->key.objectid)
		fs_info->first_logical_byte = (u64)-1;
	spin_unlock(&fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		down_write(&fs_info->commit_root_sem);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
				    &fs_info->caching_block_groups, list)
				if (ctl->block_group == block_group) {
					caching_ctl = ctl;
					refcount_inc(&caching_ctl->count);
					break;
				}
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		up_write(&fs_info->commit_root_sem);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			put_caching_control(caching_ctl);
			put_caching_control(caching_ctl);
		}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	WARN_ON(!list_empty(&block_group->dirty_list));
	WARN_ON(!list_empty(&block_group->io_list));
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->key.offset);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->key.offset);
		WARN_ON(block_group->space_info->disk_total
			< block_group->key.offset * factor);
	}
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	block_group->space_info->disk_total -= block_group->key.offset * factor;

	spin_unlock(&block_group->space_info->lock);

	memcpy(&key, &block_group->key, sizeof(key));

	mutex_lock(&fs_info->chunk_mutex);
	if (!list_empty(&em->list)) {
		/* We're in the transaction->pending_chunks list. */
		free_extent_map(em);
	}
	spin_lock(&block_group->lock);
	block_group->removed = 1;
	/*
	 * At this point trimming can't start on this block group, because we
	 * removed the block group from the tree fs_info->block_group_cache_tree
	 * so no one can find it anymore and even if someone already got this
	 * block group before we removed it from the rbtree, they have already
	 * incremented block_group->trimming - if they didn't, they won't find
	 * any free space entries because we already removed them all when we
	 * called btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group.  This is because our
	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard.  The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_em = (atomic_read(&block_group->trimming) == 0);
	/*
	 * Make sure a trimmer task always sees the em in the pinned_chunks list
	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
	 * before checking block_group->removed).
	 */
	if (!remove_em) {
		/*
		 * Our em might be in trans->transaction->pending_chunks which
		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
		 * and so is the fs_info->pinned_chunks list.
		 *
		 * So at this point we must be holding the chunk_mutex to avoid
		 * any races with chunk allocation (more specifically at
		 * volumes.c:contains_pending_extent()), to ensure it always
		 * sees the em, either in the pending_chunks list or in the
		 * pinned_chunks list.
		 */
		list_move_tail(&em->list, &fs_info->pinned_chunks);
	}
	spin_unlock(&block_group->lock);

	if (remove_em) {
		struct extent_map_tree *em_tree;

		em_tree = &fs_info->mapping_tree.map_tree;
		write_lock(&em_tree->lock);
		/*
		 * The em might be in the pending_chunks list, so make sure the
		 * chunk mutex is locked, since remove_extent_mapping() will
		 * delete us from that list.
		 */
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		/* once for the tree */
		free_extent_map(em);
	}

	mutex_unlock(&fs_info->chunk_mutex);

	ret = remove_block_group_free_space(trans, fs_info, block_group);
	if (ret)
		goto out;

	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -EIO;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

struct btrfs_trans_handle *
btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
				     const u64 chunk_offset)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
	struct extent_map *em;
	struct map_lookup *map;
	unsigned int num_items;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	ASSERT(em && em->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	map = em->map_lookup;
	num_items = 3 + map->num_stripes;
	free_extent_map(em);

	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
							   num_items, 1);
}
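
/*
 * Worked example for the 3 + N reservation above (illustrative): removing a
 * RAID1 chunk, whose extent map has map->num_stripes == 2, reserves
 * num_items = 3 + 2 = 5 metadata units: the orphan item, the block group
 * item, the free space item, plus one device extent item per stripe.
 */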

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	int ret = 0;

	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		u64 start, end;
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group_cache,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		mutex_lock(&fs_info->delete_unused_bgs_mutex);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);
		spin_lock(&block_group->lock);
		if (block_group->reserved ||
		    btrfs_block_group_used(&block_group->item) ||
		    block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group.  We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
						     block_group->key.objectid);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		start = block_group->key.objectid;
		end = start + block_group->key.offset - 1;
		/*
		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
		 * btrfs_finish_extent_commit().  If we are at transaction N,
		 * another task might be running finish_extent_commit() for the
		 * previous transaction N - 1, and have seen a range belonging
		 * to the block group in freed_extents[] before we were able to
		 * clear the whole block group range from freed_extents[].  This
		 * means that task can lookup for the block group after we
		 * unpinned it from freed_extents[] and removed it, leading to
		 * a BUG_ON() at btrfs_unpin_extent_range().
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
					EXTENT_DIRTY);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}
		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
					EXTENT_DIRTY);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_dec_block_group_ro(block_group);
			goto end_trans;
		}
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		space_info->bytes_pinned -= block_group->pinned;
		space_info->bytes_readonly += block_group->pinned;
		percpu_counter_add(&space_info->total_bytes_pinned,
				   -block_group->pinned);
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/* DISCARD can flip during remount */
		trimming = btrfs_test_opt(fs_info, DISCARD);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_get_block_group_trimming(block_group);

		/*
		 * Btrfs_remove_chunk will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, fs_info,
					 block_group->key.objectid);

		if (ret) {
			if (trimming)
				btrfs_put_block_group_trimming(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group.  Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans);
next:
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
	}
out:
	return ret;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}
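
/*
 * Example of what btrfs_init_space_info() above produces: without mixed
 * block groups there are three space_infos (SYSTEM, METADATA, DATA); with
 * the MIXED_GROUPS incompat feature there are two (SYSTEM plus a combined
 * METADATA|DATA space_info).
 */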

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released in
 * the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device,
				   u64 minlen, u64 *trimmed)
{
	u64 start = 0, len = 0;
	int ret;

	*trimmed = 0;

	/* Not writeable = nothing to do. */
	if (!device->writeable)
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		struct btrfs_transaction *trans;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			return ret;

		down_read(&fs_info->commit_root_sem);

		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ret = find_free_dev_extent_start(trans, device, minlen, start,
						 &start, &len);
		if (trans)
			btrfs_put_transaction(trans);

		if (ret) {
			up_read(&fs_info->commit_root_sem);
			mutex_unlock(&fs_info->chunk_mutex);
			if (ret == -ENOSPC)
				ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
		up_read(&fs_info->commit_root_sem);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}
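
/*
 * btrfs_trim_fs() below is the backend for the FITRIM ioctl
 * (btrfs_ioctl_fitrim() hands the user's fstrim_range to it): it trims the
 * free space inside every block group that overlaps the range and then, via
 * btrfs_trim_free_extents() above, the unallocated space on each device.
 * On return range->len is rewritten to the number of bytes actually trimmed.
 */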
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret = 0;

	/*
	 * try to trim all FS space, our block group may start from non-zero.
	 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
			  cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
				ret = wait_block_group_cache_done(cache);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info, cache);
	}

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->alloc_list;
	list_for_each_entry(device, devices, dev_alloc_list) {
		ret = btrfs_trim_free_extents(device, range->minlen,
					      &group_trimmed);
		if (ret)
			break;

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	range->len = trimmed;
	return ret;
}

/*
 * btrfs_{start,end}_write_no_snapshoting() are similar to
 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted,
 * but flush the data into disk after the snapshot creation, or to prevent
 * operations while snapshotting is ongoing that would cause the snapshot to
 * be inconsistent (writes followed by expanding truncates for example).
 */
void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we wake up waiters.
	 */
	smp_mb();
	if (waitqueue_active(&root->subv_writers->wait))
		wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshoted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshoted)) {
		btrfs_end_write_no_snapshoting(root);
		return 0;
	}
	return 1;
}

static int wait_snapshoting_atomic_t(atomic_t *a)
{
	schedule();
	return 0;
}

void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
	while (true) {
		int ret;

		ret = btrfs_start_write_no_snapshoting(root);
		if (ret)
			break;
		wait_on_atomic_t(&root->will_be_snapshoted,
				 wait_snapshoting_atomic_t,
				 TASK_UNINTERRUPTIBLE);
	}
}
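
/*
 * Usage sketch for the write-blocking helpers above (a minimal illustration
 * of the intended pairing, not a verbatim caller; do_nocow_write is a
 * placeholder): a nocow writer brackets its work so snapshot creation can
 * flush and exclude it:
 *
 *	if (!btrfs_start_write_no_snapshoting(root)) {
 *		// a snapshot is pending; fall back to the CoW write path
 *		return 0;
 *	}
 *	do_nocow_write(...);
 *	btrfs_end_write_no_snapshoting(root);
 */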