/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try to allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};
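
/*
 * For example (illustrative only, not a new call site): an allocation path
 * that merely wants a chunk when space is getting tight would pass
 * CHUNK_ALLOC_NO_FORCE (or CHUNK_ALLOC_LIMITED from the clustering code),
 * while a caller that must end up with a brand new chunk would use
 *
 *	ret = do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
 *
 * The real call sites are further down in this file.
 */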

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_fs_info *fs_info,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				     u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
static int __reserve_metadata_bytes(struct btrfs_root *root,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

static int add_excluded_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

static void free_excluded_extents(struct btrfs_fs_info *fs_info,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(fs_info, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
				       bytenr, 0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(fs_info, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->key.objectid;
	u64 len = block_group->key.offset;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * this is only called by cache_block_group; since we could have freed extents,
 * we need to check the pinned_extents for any extents that can't be used yet,
 * since their free space will be released as soon as the transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
		       struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(fs_info, block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group whose cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
		    btrfs_should_fragment_free_space(cache)) {
			u64 bytes_used;

			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			bytes_used = cache->key.offset -
				btrfs_block_group_used(&cache->item);
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			fragment_free_space(cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);

		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info, cache);
			return 0;
		}
	} else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wakeup any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->commit_root_sem);
	atomic_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	return block_group_cache_tree_search(info, bytenr, 1);
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are run.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			atomic_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of the key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full back
 * refs is the first byte of the parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */
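
/*
 * A concrete (illustrative) example of the key composing rules above, for a
 * data extent at logical address B that is referenced from inode I at file
 * offset O in subvolume tree R, where the referencing leaf sits at bytenr P:
 *
 *   implicit back ref (separate item, or inline in the extent item):
 *     key (B, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(R, I, O))
 *     body: btrfs_extent_data_ref { root = R, objectid = I, offset = O,
 *                                   count = number of bookend extents }
 *
 *   full back ref (used once the leaf is no longer owned by R alone):
 *     key (B, BTRFS_SHARED_DATA_REF_KEY, P)
 *     body: btrfs_shared_data_ref { count }
 *
 * Tree block back refs carry no body at all:
 *   (B, BTRFS_TREE_BLOCK_REF_KEY, owner tree objectid) for the implicit form
 *   (B, BTRFS_SHARED_BLOCK_REF_KEY, parent block bytenr) for the full form
 *
 * This only restates the rules above; see insert_extent_data_ref() and
 * insert_tree_block_ref() below for the authoritative item layout.
 */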

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(fs_info, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_fs_info *fs_info,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_fs_info *fs_info,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_fs_info *fs_info,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_fs_info *fs_info,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_fs_info *fs_info,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}
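
/*
 * Roughly, the on-disk layout that the inline back ref helpers below walk
 * looks like this (a sketch, not a new format definition):
 *
 *   struct btrfs_extent_item { refs, generation, flags }
 *   [ struct btrfs_tree_block_info ]    <- tree blocks only, and only when
 *                                          the skinny metadata feature is
 *                                          not in use
 *   struct btrfs_extent_inline_ref ...  <- zero or more, sorted by ref type
 *
 * lookup_inline_extent_backref() relies on that ordering when it scans from
 * (ei + 1) to the end of the item.
 */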

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, fs_info, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(fs_info, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
					     parent, root_objectid, owner,
					     offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size = btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(fs_info, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(fs_info, path, iref,
					     refs_to_add, extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
					    parent, root_objectid);
insert_tree_block_ref(trans, fs_info, path, bytenr, 1919 parent, root_objectid); 1920 } else { 1921 ret = insert_extent_data_ref(trans, fs_info, path, bytenr, 1922 parent, root_objectid, 1923 owner, offset, refs_to_add); 1924 } 1925 return ret; 1926 } 1927 1928 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1929 struct btrfs_fs_info *fs_info, 1930 struct btrfs_path *path, 1931 struct btrfs_extent_inline_ref *iref, 1932 int refs_to_drop, int is_data, int *last_ref) 1933 { 1934 int ret = 0; 1935 1936 BUG_ON(!is_data && refs_to_drop != 1); 1937 if (iref) { 1938 update_inline_extent_backref(fs_info, path, iref, 1939 -refs_to_drop, NULL, last_ref); 1940 } else if (is_data) { 1941 ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop, 1942 last_ref); 1943 } else { 1944 *last_ref = 1; 1945 ret = btrfs_del_item(trans, fs_info->extent_root, path); 1946 } 1947 return ret; 1948 } 1949 1950 #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) 1951 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1952 u64 *discarded_bytes) 1953 { 1954 int j, ret = 0; 1955 u64 bytes_left, end; 1956 u64 aligned_start = ALIGN(start, 1 << 9); 1957 1958 if (WARN_ON(start != aligned_start)) { 1959 len -= aligned_start - start; 1960 len = round_down(len, 1 << 9); 1961 start = aligned_start; 1962 } 1963 1964 *discarded_bytes = 0; 1965 1966 if (!len) 1967 return 0; 1968 1969 end = start + len; 1970 bytes_left = len; 1971 1972 /* Skip any superblocks on this device. */ 1973 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1974 u64 sb_start = btrfs_sb_offset(j); 1975 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1976 u64 size = sb_start - start; 1977 1978 if (!in_range(sb_start, start, bytes_left) && 1979 !in_range(sb_end, start, bytes_left) && 1980 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1981 continue; 1982 1983 /* 1984 * Superblock spans beginning of range. Adjust start and 1985 * try again. 1986 */ 1987 if (sb_start <= start) { 1988 start += sb_end - start; 1989 if (start > end) { 1990 bytes_left = 0; 1991 break; 1992 } 1993 bytes_left = end - start; 1994 continue; 1995 } 1996 1997 if (size) { 1998 ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, 1999 GFP_NOFS, 0); 2000 if (!ret) 2001 *discarded_bytes += size; 2002 else if (ret != -EOPNOTSUPP) 2003 return ret; 2004 } 2005 2006 start = sb_end; 2007 if (start > end) { 2008 bytes_left = 0; 2009 break; 2010 } 2011 bytes_left = end - start; 2012 } 2013 2014 if (bytes_left) { 2015 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, 2016 GFP_NOFS, 0); 2017 if (!ret) 2018 *discarded_bytes += bytes_left; 2019 } 2020 return ret; 2021 } 2022 2023 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 2024 u64 num_bytes, u64 *actual_bytes) 2025 { 2026 int ret; 2027 u64 discarded_bytes = 0; 2028 struct btrfs_bio *bbio = NULL; 2029 2030 2031 /* 2032 * Avoid races with device replace and make sure our bbio has devices 2033 * associated to its stripes that don't go away while we are discarding. 
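 *
 * btrfs_bio_counter_inc_blocked() below takes the fs-wide bio counter that
 * device replace waits on before it swaps devices out; it is paired with the
 * btrfs_bio_counter_dec() at the end of this function, once discards for all
 * mapped stripes have been issued.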
2034 */ 2035 btrfs_bio_counter_inc_blocked(fs_info); 2036 /* Tell the block device(s) that the sectors can be discarded */ 2037 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, 2038 &bbio, 0); 2039 /* Error condition is -ENOMEM */ 2040 if (!ret) { 2041 struct btrfs_bio_stripe *stripe = bbio->stripes; 2042 int i; 2043 2044 2045 for (i = 0; i < bbio->num_stripes; i++, stripe++) { 2046 u64 bytes; 2047 if (!stripe->dev->can_discard) 2048 continue; 2049 2050 ret = btrfs_issue_discard(stripe->dev->bdev, 2051 stripe->physical, 2052 stripe->length, 2053 &bytes); 2054 if (!ret) 2055 discarded_bytes += bytes; 2056 else if (ret != -EOPNOTSUPP) 2057 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */ 2058 2059 /* 2060 * Just in case we get back EOPNOTSUPP for some reason, 2061 * just ignore the return value so we don't screw up 2062 * people calling discard_extent. 2063 */ 2064 ret = 0; 2065 } 2066 btrfs_put_bbio(bbio); 2067 } 2068 btrfs_bio_counter_dec(fs_info); 2069 2070 if (actual_bytes) 2071 *actual_bytes = discarded_bytes; 2072 2073 2074 if (ret == -EOPNOTSUPP) 2075 ret = 0; 2076 return ret; 2077 } 2078 2079 /* Can return -ENOMEM */ 2080 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2081 struct btrfs_fs_info *fs_info, 2082 u64 bytenr, u64 num_bytes, u64 parent, 2083 u64 root_objectid, u64 owner, u64 offset) 2084 { 2085 int ret; 2086 2087 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 2088 root_objectid == BTRFS_TREE_LOG_OBJECTID); 2089 2090 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 2091 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2092 num_bytes, 2093 parent, root_objectid, (int)owner, 2094 BTRFS_ADD_DELAYED_REF, NULL); 2095 } else { 2096 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 2097 num_bytes, parent, root_objectid, 2098 owner, offset, 0, 2099 BTRFS_ADD_DELAYED_REF); 2100 } 2101 return ret; 2102 } 2103 2104 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2105 struct btrfs_fs_info *fs_info, 2106 struct btrfs_delayed_ref_node *node, 2107 u64 parent, u64 root_objectid, 2108 u64 owner, u64 offset, int refs_to_add, 2109 struct btrfs_delayed_extent_op *extent_op) 2110 { 2111 struct btrfs_path *path; 2112 struct extent_buffer *leaf; 2113 struct btrfs_extent_item *item; 2114 struct btrfs_key key; 2115 u64 bytenr = node->bytenr; 2116 u64 num_bytes = node->num_bytes; 2117 u64 refs; 2118 int ret; 2119 2120 path = btrfs_alloc_path(); 2121 if (!path) 2122 return -ENOMEM; 2123 2124 path->reada = READA_FORWARD; 2125 path->leave_spinning = 1; 2126 /* this will setup the path even if it fails to insert the back ref */ 2127 ret = insert_inline_extent_backref(trans, fs_info, path, bytenr, 2128 num_bytes, parent, root_objectid, 2129 owner, offset, 2130 refs_to_add, extent_op); 2131 if ((ret < 0 && ret != -EAGAIN) || !ret) 2132 goto out; 2133 2134 /* 2135 * Ok we had -EAGAIN which means we didn't have space to insert and 2136 * inline extent ref, so just update the reference count and add a 2137 * normal backref. 
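 *
 * The path left by insert_inline_extent_backref() still points at the
 * extent item, so bump its reference count in place here, release the path
 * and then insert a separate keyed backref item via insert_extent_backref().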
2138 */ 2139 leaf = path->nodes[0]; 2140 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2141 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2142 refs = btrfs_extent_refs(leaf, item); 2143 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 2144 if (extent_op) 2145 __run_delayed_extent_op(extent_op, leaf, item); 2146 2147 btrfs_mark_buffer_dirty(leaf); 2148 btrfs_release_path(path); 2149 2150 path->reada = READA_FORWARD; 2151 path->leave_spinning = 1; 2152 /* now insert the actual backref */ 2153 ret = insert_extent_backref(trans, fs_info, path, bytenr, parent, 2154 root_objectid, owner, offset, refs_to_add); 2155 if (ret) 2156 btrfs_abort_transaction(trans, ret); 2157 out: 2158 btrfs_free_path(path); 2159 return ret; 2160 } 2161 2162 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 2163 struct btrfs_fs_info *fs_info, 2164 struct btrfs_delayed_ref_node *node, 2165 struct btrfs_delayed_extent_op *extent_op, 2166 int insert_reserved) 2167 { 2168 int ret = 0; 2169 struct btrfs_delayed_data_ref *ref; 2170 struct btrfs_key ins; 2171 u64 parent = 0; 2172 u64 ref_root = 0; 2173 u64 flags = 0; 2174 2175 ins.objectid = node->bytenr; 2176 ins.offset = node->num_bytes; 2177 ins.type = BTRFS_EXTENT_ITEM_KEY; 2178 2179 ref = btrfs_delayed_node_to_data_ref(node); 2180 trace_run_delayed_data_ref(fs_info, node, ref, node->action); 2181 2182 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 2183 parent = ref->parent; 2184 ref_root = ref->root; 2185 2186 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2187 if (extent_op) 2188 flags |= extent_op->flags_to_set; 2189 ret = alloc_reserved_file_extent(trans, fs_info, 2190 parent, ref_root, flags, 2191 ref->objectid, ref->offset, 2192 &ins, node->ref_mod); 2193 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2194 ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, 2195 ref_root, ref->objectid, 2196 ref->offset, node->ref_mod, 2197 extent_op); 2198 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2199 ret = __btrfs_free_extent(trans, fs_info, node, parent, 2200 ref_root, ref->objectid, 2201 ref->offset, node->ref_mod, 2202 extent_op); 2203 } else { 2204 BUG(); 2205 } 2206 return ret; 2207 } 2208 2209 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 2210 struct extent_buffer *leaf, 2211 struct btrfs_extent_item *ei) 2212 { 2213 u64 flags = btrfs_extent_flags(leaf, ei); 2214 if (extent_op->update_flags) { 2215 flags |= extent_op->flags_to_set; 2216 btrfs_set_extent_flags(leaf, ei, flags); 2217 } 2218 2219 if (extent_op->update_key) { 2220 struct btrfs_tree_block_info *bi; 2221 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 2222 bi = (struct btrfs_tree_block_info *)(ei + 1); 2223 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 2224 } 2225 } 2226 2227 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 2228 struct btrfs_fs_info *fs_info, 2229 struct btrfs_delayed_ref_node *node, 2230 struct btrfs_delayed_extent_op *extent_op) 2231 { 2232 struct btrfs_key key; 2233 struct btrfs_path *path; 2234 struct btrfs_extent_item *ei; 2235 struct extent_buffer *leaf; 2236 u32 item_size; 2237 int ret; 2238 int err = 0; 2239 int metadata = !extent_op->is_data; 2240 2241 if (trans->aborted) 2242 return 0; 2243 2244 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2245 metadata = 0; 2246 2247 path = btrfs_alloc_path(); 2248 if (!path) 2249 return -ENOMEM; 2250 2251 key.objectid = node->bytenr; 2252 2253 if (metadata) { 2254 key.type = 
BTRFS_METADATA_ITEM_KEY; 2255 key.offset = extent_op->level; 2256 } else { 2257 key.type = BTRFS_EXTENT_ITEM_KEY; 2258 key.offset = node->num_bytes; 2259 } 2260 2261 again: 2262 path->reada = READA_FORWARD; 2263 path->leave_spinning = 1; 2264 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1); 2265 if (ret < 0) { 2266 err = ret; 2267 goto out; 2268 } 2269 if (ret > 0) { 2270 if (metadata) { 2271 if (path->slots[0] > 0) { 2272 path->slots[0]--; 2273 btrfs_item_key_to_cpu(path->nodes[0], &key, 2274 path->slots[0]); 2275 if (key.objectid == node->bytenr && 2276 key.type == BTRFS_EXTENT_ITEM_KEY && 2277 key.offset == node->num_bytes) 2278 ret = 0; 2279 } 2280 if (ret > 0) { 2281 btrfs_release_path(path); 2282 metadata = 0; 2283 2284 key.objectid = node->bytenr; 2285 key.offset = node->num_bytes; 2286 key.type = BTRFS_EXTENT_ITEM_KEY; 2287 goto again; 2288 } 2289 } else { 2290 err = -EIO; 2291 goto out; 2292 } 2293 } 2294 2295 leaf = path->nodes[0]; 2296 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2297 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 2298 if (item_size < sizeof(*ei)) { 2299 ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0); 2300 if (ret < 0) { 2301 err = ret; 2302 goto out; 2303 } 2304 leaf = path->nodes[0]; 2305 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 2306 } 2307 #endif 2308 BUG_ON(item_size < sizeof(*ei)); 2309 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2310 __run_delayed_extent_op(extent_op, leaf, ei); 2311 2312 btrfs_mark_buffer_dirty(leaf); 2313 out: 2314 btrfs_free_path(path); 2315 return err; 2316 } 2317 2318 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 2319 struct btrfs_fs_info *fs_info, 2320 struct btrfs_delayed_ref_node *node, 2321 struct btrfs_delayed_extent_op *extent_op, 2322 int insert_reserved) 2323 { 2324 int ret = 0; 2325 struct btrfs_delayed_tree_ref *ref; 2326 struct btrfs_key ins; 2327 u64 parent = 0; 2328 u64 ref_root = 0; 2329 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 2330 2331 ref = btrfs_delayed_node_to_tree_ref(node); 2332 trace_run_delayed_tree_ref(fs_info, node, ref, node->action); 2333 2334 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2335 parent = ref->parent; 2336 ref_root = ref->root; 2337 2338 ins.objectid = node->bytenr; 2339 if (skinny_metadata) { 2340 ins.offset = ref->level; 2341 ins.type = BTRFS_METADATA_ITEM_KEY; 2342 } else { 2343 ins.offset = node->num_bytes; 2344 ins.type = BTRFS_EXTENT_ITEM_KEY; 2345 } 2346 2347 if (node->ref_mod != 1) { 2348 btrfs_err(fs_info, 2349 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", 2350 node->bytenr, node->ref_mod, node->action, ref_root, 2351 parent); 2352 return -EIO; 2353 } 2354 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 2355 BUG_ON(!extent_op || !extent_op->update_flags); 2356 ret = alloc_reserved_tree_block(trans, fs_info, 2357 parent, ref_root, 2358 extent_op->flags_to_set, 2359 &extent_op->key, 2360 ref->level, &ins); 2361 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2362 ret = __btrfs_inc_extent_ref(trans, fs_info, node, 2363 parent, ref_root, 2364 ref->level, 0, 1, 2365 extent_op); 2366 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 2367 ret = __btrfs_free_extent(trans, fs_info, node, 2368 parent, ref_root, 2369 ref->level, 0, 1, extent_op); 2370 } else { 2371 BUG(); 2372 } 2373 return ret; 2374 } 2375 2376 /* helper function to actually process a single delayed ref entry */ 2377 static int 
run_one_delayed_ref(struct btrfs_trans_handle *trans, 2378 struct btrfs_fs_info *fs_info, 2379 struct btrfs_delayed_ref_node *node, 2380 struct btrfs_delayed_extent_op *extent_op, 2381 int insert_reserved) 2382 { 2383 int ret = 0; 2384 2385 if (trans->aborted) { 2386 if (insert_reserved) 2387 btrfs_pin_extent(fs_info, node->bytenr, 2388 node->num_bytes, 1); 2389 return 0; 2390 } 2391 2392 if (btrfs_delayed_ref_is_head(node)) { 2393 struct btrfs_delayed_ref_head *head; 2394 /* 2395 * we've hit the end of the chain and we were supposed 2396 * to insert this extent into the tree. But, it got 2397 * deleted before we ever needed to insert it, so all 2398 * we have to do is clean up the accounting 2399 */ 2400 BUG_ON(extent_op); 2401 head = btrfs_delayed_node_to_head(node); 2402 trace_run_delayed_ref_head(fs_info, node, head, node->action); 2403 2404 if (insert_reserved) { 2405 btrfs_pin_extent(fs_info, node->bytenr, 2406 node->num_bytes, 1); 2407 if (head->is_data) { 2408 ret = btrfs_del_csums(trans, fs_info, 2409 node->bytenr, 2410 node->num_bytes); 2411 } 2412 } 2413 2414 /* Also free its reserved qgroup space */ 2415 btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, 2416 head->qgroup_reserved); 2417 return ret; 2418 } 2419 2420 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 2421 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 2422 ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, 2423 insert_reserved); 2424 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 2425 node->type == BTRFS_SHARED_DATA_REF_KEY) 2426 ret = run_delayed_data_ref(trans, fs_info, node, extent_op, 2427 insert_reserved); 2428 else 2429 BUG(); 2430 return ret; 2431 } 2432 2433 static inline struct btrfs_delayed_ref_node * 2434 select_delayed_ref(struct btrfs_delayed_ref_head *head) 2435 { 2436 struct btrfs_delayed_ref_node *ref; 2437 2438 if (list_empty(&head->ref_list)) 2439 return NULL; 2440 2441 /* 2442 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 2443 * This is to prevent a ref count from going down to zero, which deletes 2444 * the extent item from the extent tree, when there still are references 2445 * to add, which would fail because they would not find the extent item. 2446 */ 2447 if (!list_empty(&head->ref_add_list)) 2448 return list_first_entry(&head->ref_add_list, 2449 struct btrfs_delayed_ref_node, add_list); 2450 2451 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, 2452 list); 2453 ASSERT(list_empty(&ref->add_list)); 2454 return ref; 2455 } 2456 2457 /* 2458 * Returns 0 on success or if called with an already aborted transaction. 2459 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
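 *
 * @nr bounds how much work a single call does: the loop below keeps selecting
 * ref heads via btrfs_select_ref_head() and processing their refs until the
 * running count reaches @nr or no unlocked head remains.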
2460 */ 2461 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2462 struct btrfs_fs_info *fs_info, 2463 unsigned long nr) 2464 { 2465 struct btrfs_delayed_ref_root *delayed_refs; 2466 struct btrfs_delayed_ref_node *ref; 2467 struct btrfs_delayed_ref_head *locked_ref = NULL; 2468 struct btrfs_delayed_extent_op *extent_op; 2469 ktime_t start = ktime_get(); 2470 int ret; 2471 unsigned long count = 0; 2472 unsigned long actual_count = 0; 2473 int must_insert_reserved = 0; 2474 2475 delayed_refs = &trans->transaction->delayed_refs; 2476 while (1) { 2477 if (!locked_ref) { 2478 if (count >= nr) 2479 break; 2480 2481 spin_lock(&delayed_refs->lock); 2482 locked_ref = btrfs_select_ref_head(trans); 2483 if (!locked_ref) { 2484 spin_unlock(&delayed_refs->lock); 2485 break; 2486 } 2487 2488 /* grab the lock that says we are going to process 2489 * all the refs for this head */ 2490 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2491 spin_unlock(&delayed_refs->lock); 2492 /* 2493 * we may have dropped the spin lock to get the head 2494 * mutex lock, and that might have given someone else 2495 * time to free the head. If that's true, it has been 2496 * removed from our list and we can move on. 2497 */ 2498 if (ret == -EAGAIN) { 2499 locked_ref = NULL; 2500 count++; 2501 continue; 2502 } 2503 } 2504 2505 /* 2506 * We need to try and merge add/drops of the same ref since we 2507 * can run into issues with relocate dropping the implicit ref 2508 * and then it being added back again before the drop can 2509 * finish. If we merged anything we need to re-loop so we can 2510 * get a good ref. 2511 * Or we can get node references of the same type that weren't 2512 * merged when created due to bumps in the tree mod seq, and 2513 * we need to merge them to prevent adding an inline extent 2514 * backref before dropping it (triggering a BUG_ON at 2515 * insert_inline_extent_backref()). 2516 */ 2517 spin_lock(&locked_ref->lock); 2518 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2519 locked_ref); 2520 2521 /* 2522 * locked_ref is the head node, so we have to go one 2523 * node back for any delayed ref updates 2524 */ 2525 ref = select_delayed_ref(locked_ref); 2526 2527 if (ref && ref->seq && 2528 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2529 spin_unlock(&locked_ref->lock); 2530 spin_lock(&delayed_refs->lock); 2531 locked_ref->processing = 0; 2532 delayed_refs->num_heads_ready++; 2533 spin_unlock(&delayed_refs->lock); 2534 btrfs_delayed_ref_unlock(locked_ref); 2535 locked_ref = NULL; 2536 cond_resched(); 2537 count++; 2538 continue; 2539 } 2540 2541 /* 2542 * record the must insert reserved flag before we 2543 * drop the spin lock. 
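 * must_insert_reserved means this head still covers a freshly allocated
 * extent: either the ADD ref inserts the reserved extent item, or
 * run_one_delayed_ref() pins the bytes again so the reserved space is not
 * leaked.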
2544 */ 2545 must_insert_reserved = locked_ref->must_insert_reserved; 2546 locked_ref->must_insert_reserved = 0; 2547 2548 extent_op = locked_ref->extent_op; 2549 locked_ref->extent_op = NULL; 2550 2551 if (!ref) { 2552 2553 2554 /* All delayed refs have been processed, Go ahead 2555 * and send the head node to run_one_delayed_ref, 2556 * so that any accounting fixes can happen 2557 */ 2558 ref = &locked_ref->node; 2559 2560 if (extent_op && must_insert_reserved) { 2561 btrfs_free_delayed_extent_op(extent_op); 2562 extent_op = NULL; 2563 } 2564 2565 if (extent_op) { 2566 spin_unlock(&locked_ref->lock); 2567 ret = run_delayed_extent_op(trans, fs_info, 2568 ref, extent_op); 2569 btrfs_free_delayed_extent_op(extent_op); 2570 2571 if (ret) { 2572 /* 2573 * Need to reset must_insert_reserved if 2574 * there was an error so the abort stuff 2575 * can cleanup the reserved space 2576 * properly. 2577 */ 2578 if (must_insert_reserved) 2579 locked_ref->must_insert_reserved = 1; 2580 spin_lock(&delayed_refs->lock); 2581 locked_ref->processing = 0; 2582 delayed_refs->num_heads_ready++; 2583 spin_unlock(&delayed_refs->lock); 2584 btrfs_debug(fs_info, 2585 "run_delayed_extent_op returned %d", 2586 ret); 2587 btrfs_delayed_ref_unlock(locked_ref); 2588 return ret; 2589 } 2590 continue; 2591 } 2592 2593 /* 2594 * Need to drop our head ref lock and re-acquire the 2595 * delayed ref lock and then re-check to make sure 2596 * nobody got added. 2597 */ 2598 spin_unlock(&locked_ref->lock); 2599 spin_lock(&delayed_refs->lock); 2600 spin_lock(&locked_ref->lock); 2601 if (!list_empty(&locked_ref->ref_list) || 2602 locked_ref->extent_op) { 2603 spin_unlock(&locked_ref->lock); 2604 spin_unlock(&delayed_refs->lock); 2605 continue; 2606 } 2607 ref->in_tree = 0; 2608 delayed_refs->num_heads--; 2609 rb_erase(&locked_ref->href_node, 2610 &delayed_refs->href_root); 2611 spin_unlock(&delayed_refs->lock); 2612 } else { 2613 actual_count++; 2614 ref->in_tree = 0; 2615 list_del(&ref->list); 2616 if (!list_empty(&ref->add_list)) 2617 list_del(&ref->add_list); 2618 } 2619 atomic_dec(&delayed_refs->num_entries); 2620 2621 if (!btrfs_delayed_ref_is_head(ref)) { 2622 /* 2623 * when we play the delayed ref, also correct the 2624 * ref_mod on head 2625 */ 2626 switch (ref->action) { 2627 case BTRFS_ADD_DELAYED_REF: 2628 case BTRFS_ADD_DELAYED_EXTENT: 2629 locked_ref->node.ref_mod -= ref->ref_mod; 2630 break; 2631 case BTRFS_DROP_DELAYED_REF: 2632 locked_ref->node.ref_mod += ref->ref_mod; 2633 break; 2634 default: 2635 WARN_ON(1); 2636 } 2637 } 2638 spin_unlock(&locked_ref->lock); 2639 2640 ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, 2641 must_insert_reserved); 2642 2643 btrfs_free_delayed_extent_op(extent_op); 2644 if (ret) { 2645 spin_lock(&delayed_refs->lock); 2646 locked_ref->processing = 0; 2647 delayed_refs->num_heads_ready++; 2648 spin_unlock(&delayed_refs->lock); 2649 btrfs_delayed_ref_unlock(locked_ref); 2650 btrfs_put_delayed_ref(ref); 2651 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", 2652 ret); 2653 return ret; 2654 } 2655 2656 /* 2657 * If this node is a head, that means all the refs in this head 2658 * have been dealt with, and we will pick the next head to deal 2659 * with, so we must unlock the head and drop it from the cluster 2660 * list before we release it. 
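 *
 * For data heads whose total_ref_mod went negative the extent is going away,
 * so the csum bytes it accounted in pending_csums are dropped below before
 * the head is unlocked.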
2661 */ 2662 if (btrfs_delayed_ref_is_head(ref)) { 2663 if (locked_ref->is_data && 2664 locked_ref->total_ref_mod < 0) { 2665 spin_lock(&delayed_refs->lock); 2666 delayed_refs->pending_csums -= ref->num_bytes; 2667 spin_unlock(&delayed_refs->lock); 2668 } 2669 btrfs_delayed_ref_unlock(locked_ref); 2670 locked_ref = NULL; 2671 } 2672 btrfs_put_delayed_ref(ref); 2673 count++; 2674 cond_resched(); 2675 } 2676 2677 /* 2678 * We don't want to include ref heads since we can have empty ref heads 2679 * and those will drastically skew our runtime down since we just do 2680 * accounting, no actual extent tree updates. 2681 */ 2682 if (actual_count > 0) { 2683 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); 2684 u64 avg; 2685 2686 /* 2687 * We weigh the current average higher than our current runtime 2688 * to avoid large swings in the average. 2689 */ 2690 spin_lock(&delayed_refs->lock); 2691 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2692 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ 2693 spin_unlock(&delayed_refs->lock); 2694 } 2695 return 0; 2696 } 2697 2698 #ifdef SCRAMBLE_DELAYED_REFS 2699 /* 2700 * Normally delayed refs get processed in ascending bytenr order. This 2701 * correlates in most cases to the order added. To expose dependencies on this 2702 * order, we start to process the tree in the middle instead of the beginning 2703 */ 2704 static u64 find_middle(struct rb_root *root) 2705 { 2706 struct rb_node *n = root->rb_node; 2707 struct btrfs_delayed_ref_node *entry; 2708 int alt = 1; 2709 u64 middle; 2710 u64 first = 0, last = 0; 2711 2712 n = rb_first(root); 2713 if (n) { 2714 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2715 first = entry->bytenr; 2716 } 2717 n = rb_last(root); 2718 if (n) { 2719 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2720 last = entry->bytenr; 2721 } 2722 n = root->rb_node; 2723 2724 while (n) { 2725 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2726 WARN_ON(!entry->in_tree); 2727 2728 middle = entry->bytenr; 2729 2730 if (alt) 2731 n = n->rb_left; 2732 else 2733 n = n->rb_right; 2734 2735 alt = 1 - alt; 2736 } 2737 return middle; 2738 } 2739 #endif 2740 2741 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) 2742 { 2743 u64 num_bytes; 2744 2745 num_bytes = heads * (sizeof(struct btrfs_extent_item) + 2746 sizeof(struct btrfs_extent_inline_ref)); 2747 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2748 num_bytes += heads * sizeof(struct btrfs_tree_block_info); 2749 2750 /* 2751 * We don't ever fill up leaves all the way so multiply by 2 just to be 2752 * closer to what we're really going to want to use. 2753 */ 2754 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); 2755 } 2756 2757 /* 2758 * Takes the number of bytes to be csumm'ed and figures out how many leaves it 2759 * would require to store the csums for that many bytes. 
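 *
 * Rough worked example (assuming the common 16K nodesize, 4K sectorsize and
 * 4 byte crc32c csums): one leaf holds roughly BTRFS_MAX_ITEM_SIZE / 4, i.e.
 * about 4000 csums, which covers about 16M of data, so checksumming 1G of
 * data costs on the order of 65 leaves.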
2760 */ 2761 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) 2762 { 2763 u64 csum_size; 2764 u64 num_csums_per_leaf; 2765 u64 num_csums; 2766 2767 csum_size = BTRFS_MAX_ITEM_SIZE(fs_info); 2768 num_csums_per_leaf = div64_u64(csum_size, 2769 (u64)btrfs_super_csum_size(fs_info->super_copy)); 2770 num_csums = div64_u64(csum_bytes, fs_info->sectorsize); 2771 num_csums += num_csums_per_leaf - 1; 2772 num_csums = div64_u64(num_csums, num_csums_per_leaf); 2773 return num_csums; 2774 } 2775 2776 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2777 struct btrfs_fs_info *fs_info) 2778 { 2779 struct btrfs_block_rsv *global_rsv; 2780 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2781 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; 2782 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; 2783 u64 num_bytes, num_dirty_bgs_bytes; 2784 int ret = 0; 2785 2786 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 2787 num_heads = heads_to_leaves(fs_info, num_heads); 2788 if (num_heads > 1) 2789 num_bytes += (num_heads - 1) * fs_info->nodesize; 2790 num_bytes <<= 1; 2791 num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * 2792 fs_info->nodesize; 2793 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, 2794 num_dirty_bgs); 2795 global_rsv = &fs_info->global_block_rsv; 2796 2797 /* 2798 * If we can't allocate any more chunks lets make sure we have _lots_ of 2799 * wiggle room since running delayed refs can create more delayed refs. 2800 */ 2801 if (global_rsv->space_info->full) { 2802 num_dirty_bgs_bytes <<= 1; 2803 num_bytes <<= 1; 2804 } 2805 2806 spin_lock(&global_rsv->lock); 2807 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) 2808 ret = 1; 2809 spin_unlock(&global_rsv->lock); 2810 return ret; 2811 } 2812 2813 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 2814 struct btrfs_fs_info *fs_info) 2815 { 2816 u64 num_entries = 2817 atomic_read(&trans->transaction->delayed_refs.num_entries); 2818 u64 avg_runtime; 2819 u64 val; 2820 2821 smp_mb(); 2822 avg_runtime = fs_info->avg_delayed_ref_runtime; 2823 val = num_entries * avg_runtime; 2824 if (val >= NSEC_PER_SEC) 2825 return 1; 2826 if (val >= NSEC_PER_SEC / 2) 2827 return 2; 2828 2829 return btrfs_check_space_for_delayed_refs(trans, fs_info); 2830 } 2831 2832 struct async_delayed_refs { 2833 struct btrfs_root *root; 2834 u64 transid; 2835 int count; 2836 int error; 2837 int sync; 2838 struct completion wait; 2839 struct btrfs_work work; 2840 }; 2841 2842 static inline struct async_delayed_refs * 2843 to_async_delayed_refs(struct btrfs_work *work) 2844 { 2845 return container_of(work, struct async_delayed_refs, work); 2846 } 2847 2848 static void delayed_ref_async_start(struct btrfs_work *work) 2849 { 2850 struct async_delayed_refs *async = to_async_delayed_refs(work); 2851 struct btrfs_trans_handle *trans; 2852 struct btrfs_fs_info *fs_info = async->root->fs_info; 2853 int ret; 2854 2855 /* if the commit is already started, we don't need to wait here */ 2856 if (btrfs_transaction_blocked(fs_info)) 2857 goto done; 2858 2859 trans = btrfs_join_transaction(async->root); 2860 if (IS_ERR(trans)) { 2861 async->error = PTR_ERR(trans); 2862 goto done; 2863 } 2864 2865 /* 2866 * trans->sync means that when we call end_transaction, we won't 2867 * wait on delayed refs 2868 */ 2869 trans->sync = true; 2870 2871 /* Don't bother flushing if we got into a different transaction */ 2872 if (trans->transid > 
async->transid) 2873 goto end; 2874 2875 ret = btrfs_run_delayed_refs(trans, fs_info, async->count); 2876 if (ret) 2877 async->error = ret; 2878 end: 2879 ret = btrfs_end_transaction(trans); 2880 if (ret && !async->error) 2881 async->error = ret; 2882 done: 2883 if (async->sync) 2884 complete(&async->wait); 2885 else 2886 kfree(async); 2887 } 2888 2889 int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, 2890 unsigned long count, u64 transid, int wait) 2891 { 2892 struct async_delayed_refs *async; 2893 int ret; 2894 2895 async = kmalloc(sizeof(*async), GFP_NOFS); 2896 if (!async) 2897 return -ENOMEM; 2898 2899 async->root = fs_info->tree_root; 2900 async->count = count; 2901 async->error = 0; 2902 async->transid = transid; 2903 if (wait) 2904 async->sync = 1; 2905 else 2906 async->sync = 0; 2907 init_completion(&async->wait); 2908 2909 btrfs_init_work(&async->work, btrfs_extent_refs_helper, 2910 delayed_ref_async_start, NULL, NULL); 2911 2912 btrfs_queue_work(fs_info->extent_workers, &async->work); 2913 2914 if (wait) { 2915 wait_for_completion(&async->wait); 2916 ret = async->error; 2917 kfree(async); 2918 return ret; 2919 } 2920 return 0; 2921 } 2922 2923 /* 2924 * this starts processing the delayed reference count updates and 2925 * extent insertions we have queued up so far. count can be 2926 * 0, which means to process everything in the tree at the start 2927 * of the run (but not newly added entries), or it can be some target 2928 * number you'd like to process. 2929 * 2930 * Returns 0 on success or if called with an aborted transaction 2931 * Returns <0 on error and aborts the transaction 2932 */ 2933 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2934 struct btrfs_fs_info *fs_info, unsigned long count) 2935 { 2936 struct rb_node *node; 2937 struct btrfs_delayed_ref_root *delayed_refs; 2938 struct btrfs_delayed_ref_head *head; 2939 int ret; 2940 int run_all = count == (unsigned long)-1; 2941 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 2942 2943 /* We'll clean this up in btrfs_cleanup_transaction */ 2944 if (trans->aborted) 2945 return 0; 2946 2947 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2948 return 0; 2949 2950 delayed_refs = &trans->transaction->delayed_refs; 2951 if (count == 0) 2952 count = atomic_read(&delayed_refs->num_entries) * 2; 2953 2954 again: 2955 #ifdef SCRAMBLE_DELAYED_REFS 2956 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2957 #endif 2958 trans->can_flush_pending_bgs = false; 2959 ret = __btrfs_run_delayed_refs(trans, fs_info, count); 2960 if (ret < 0) { 2961 btrfs_abort_transaction(trans, ret); 2962 return ret; 2963 } 2964 2965 if (run_all) { 2966 if (!list_empty(&trans->new_bgs)) 2967 btrfs_create_pending_block_groups(trans, fs_info); 2968 2969 spin_lock(&delayed_refs->lock); 2970 node = rb_first(&delayed_refs->href_root); 2971 if (!node) { 2972 spin_unlock(&delayed_refs->lock); 2973 goto out; 2974 } 2975 2976 while (node) { 2977 head = rb_entry(node, struct btrfs_delayed_ref_head, 2978 href_node); 2979 if (btrfs_delayed_ref_is_head(&head->node)) { 2980 struct btrfs_delayed_ref_node *ref; 2981 2982 ref = &head->node; 2983 atomic_inc(&ref->refs); 2984 2985 spin_unlock(&delayed_refs->lock); 2986 /* 2987 * Mutex was contended, block until it's 2988 * released and try again 2989 */ 2990 mutex_lock(&head->mutex); 2991 mutex_unlock(&head->mutex); 2992 2993 btrfs_put_delayed_ref(ref); 2994 cond_resched(); 2995 goto again; 2996 } else { 2997 WARN_ON(1); 2998 } 2999 node = rb_next(node); 
3000 } 3001 spin_unlock(&delayed_refs->lock); 3002 cond_resched(); 3003 goto again; 3004 } 3005 out: 3006 assert_qgroups_uptodate(trans); 3007 trans->can_flush_pending_bgs = can_flush_pending_bgs; 3008 return 0; 3009 } 3010 3011 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3012 struct btrfs_fs_info *fs_info, 3013 u64 bytenr, u64 num_bytes, u64 flags, 3014 int level, int is_data) 3015 { 3016 struct btrfs_delayed_extent_op *extent_op; 3017 int ret; 3018 3019 extent_op = btrfs_alloc_delayed_extent_op(); 3020 if (!extent_op) 3021 return -ENOMEM; 3022 3023 extent_op->flags_to_set = flags; 3024 extent_op->update_flags = true; 3025 extent_op->update_key = false; 3026 extent_op->is_data = is_data ? true : false; 3027 extent_op->level = level; 3028 3029 ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr, 3030 num_bytes, extent_op); 3031 if (ret) 3032 btrfs_free_delayed_extent_op(extent_op); 3033 return ret; 3034 } 3035 3036 static noinline int check_delayed_ref(struct btrfs_root *root, 3037 struct btrfs_path *path, 3038 u64 objectid, u64 offset, u64 bytenr) 3039 { 3040 struct btrfs_delayed_ref_head *head; 3041 struct btrfs_delayed_ref_node *ref; 3042 struct btrfs_delayed_data_ref *data_ref; 3043 struct btrfs_delayed_ref_root *delayed_refs; 3044 struct btrfs_transaction *cur_trans; 3045 int ret = 0; 3046 3047 cur_trans = root->fs_info->running_transaction; 3048 if (!cur_trans) 3049 return 0; 3050 3051 delayed_refs = &cur_trans->delayed_refs; 3052 spin_lock(&delayed_refs->lock); 3053 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3054 if (!head) { 3055 spin_unlock(&delayed_refs->lock); 3056 return 0; 3057 } 3058 3059 if (!mutex_trylock(&head->mutex)) { 3060 atomic_inc(&head->node.refs); 3061 spin_unlock(&delayed_refs->lock); 3062 3063 btrfs_release_path(path); 3064 3065 /* 3066 * Mutex was contended, block until it's released and let 3067 * caller try again 3068 */ 3069 mutex_lock(&head->mutex); 3070 mutex_unlock(&head->mutex); 3071 btrfs_put_delayed_ref(&head->node); 3072 return -EAGAIN; 3073 } 3074 spin_unlock(&delayed_refs->lock); 3075 3076 spin_lock(&head->lock); 3077 list_for_each_entry(ref, &head->ref_list, list) { 3078 /* If it's a shared ref we know a cross reference exists */ 3079 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 3080 ret = 1; 3081 break; 3082 } 3083 3084 data_ref = btrfs_delayed_node_to_data_ref(ref); 3085 3086 /* 3087 * If our ref doesn't match the one we're currently looking at 3088 * then we have a cross reference. 
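 * (a ref rooted elsewhere, or for a different inode or file offset, means
 * these bytes are shared and cannot be treated as exclusively owned by
 * @root).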
3089 */ 3090 if (data_ref->root != root->root_key.objectid || 3091 data_ref->objectid != objectid || 3092 data_ref->offset != offset) { 3093 ret = 1; 3094 break; 3095 } 3096 } 3097 spin_unlock(&head->lock); 3098 mutex_unlock(&head->mutex); 3099 return ret; 3100 } 3101 3102 static noinline int check_committed_ref(struct btrfs_root *root, 3103 struct btrfs_path *path, 3104 u64 objectid, u64 offset, u64 bytenr) 3105 { 3106 struct btrfs_fs_info *fs_info = root->fs_info; 3107 struct btrfs_root *extent_root = fs_info->extent_root; 3108 struct extent_buffer *leaf; 3109 struct btrfs_extent_data_ref *ref; 3110 struct btrfs_extent_inline_ref *iref; 3111 struct btrfs_extent_item *ei; 3112 struct btrfs_key key; 3113 u32 item_size; 3114 int ret; 3115 3116 key.objectid = bytenr; 3117 key.offset = (u64)-1; 3118 key.type = BTRFS_EXTENT_ITEM_KEY; 3119 3120 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 3121 if (ret < 0) 3122 goto out; 3123 BUG_ON(ret == 0); /* Corruption */ 3124 3125 ret = -ENOENT; 3126 if (path->slots[0] == 0) 3127 goto out; 3128 3129 path->slots[0]--; 3130 leaf = path->nodes[0]; 3131 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3132 3133 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 3134 goto out; 3135 3136 ret = 1; 3137 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 3138 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 3139 if (item_size < sizeof(*ei)) { 3140 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3141 goto out; 3142 } 3143 #endif 3144 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 3145 3146 if (item_size != sizeof(*ei) + 3147 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 3148 goto out; 3149 3150 if (btrfs_extent_generation(leaf, ei) <= 3151 btrfs_root_last_snapshot(&root->root_item)) 3152 goto out; 3153 3154 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 3155 if (btrfs_extent_inline_ref_type(leaf, iref) != 3156 BTRFS_EXTENT_DATA_REF_KEY) 3157 goto out; 3158 3159 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 3160 if (btrfs_extent_refs(leaf, ei) != 3161 btrfs_extent_data_ref_count(leaf, ref) || 3162 btrfs_extent_data_ref_root(leaf, ref) != 3163 root->root_key.objectid || 3164 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 3165 btrfs_extent_data_ref_offset(leaf, ref) != offset) 3166 goto out; 3167 3168 ret = 0; 3169 out: 3170 return ret; 3171 } 3172 3173 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 3174 u64 bytenr) 3175 { 3176 struct btrfs_path *path; 3177 int ret; 3178 int ret2; 3179 3180 path = btrfs_alloc_path(); 3181 if (!path) 3182 return -ENOENT; 3183 3184 do { 3185 ret = check_committed_ref(root, path, objectid, 3186 offset, bytenr); 3187 if (ret && ret != -ENOENT) 3188 goto out; 3189 3190 ret2 = check_delayed_ref(root, path, objectid, 3191 offset, bytenr); 3192 } while (ret2 == -EAGAIN); 3193 3194 if (ret2 && ret2 != -ENOENT) { 3195 ret = ret2; 3196 goto out; 3197 } 3198 3199 if (ret != -ENOENT || ret2 != -ENOENT) 3200 ret = 0; 3201 out: 3202 btrfs_free_path(path); 3203 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 3204 WARN_ON(ret > 0); 3205 return ret; 3206 } 3207 3208 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3209 struct btrfs_root *root, 3210 struct extent_buffer *buf, 3211 int full_backref, int inc) 3212 { 3213 struct btrfs_fs_info *fs_info = root->fs_info; 3214 u64 bytenr; 3215 u64 num_bytes; 3216 u64 parent; 3217 u64 ref_root; 3218 u32 nritems; 3219 struct btrfs_key key; 3220 struct 
btrfs_file_extent_item *fi; 3221 int i; 3222 int level; 3223 int ret = 0; 3224 int (*process_func)(struct btrfs_trans_handle *, 3225 struct btrfs_fs_info *, 3226 u64, u64, u64, u64, u64, u64); 3227 3228 3229 if (btrfs_is_testing(fs_info)) 3230 return 0; 3231 3232 ref_root = btrfs_header_owner(buf); 3233 nritems = btrfs_header_nritems(buf); 3234 level = btrfs_header_level(buf); 3235 3236 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) 3237 return 0; 3238 3239 if (inc) 3240 process_func = btrfs_inc_extent_ref; 3241 else 3242 process_func = btrfs_free_extent; 3243 3244 if (full_backref) 3245 parent = buf->start; 3246 else 3247 parent = 0; 3248 3249 for (i = 0; i < nritems; i++) { 3250 if (level == 0) { 3251 btrfs_item_key_to_cpu(buf, &key, i); 3252 if (key.type != BTRFS_EXTENT_DATA_KEY) 3253 continue; 3254 fi = btrfs_item_ptr(buf, i, 3255 struct btrfs_file_extent_item); 3256 if (btrfs_file_extent_type(buf, fi) == 3257 BTRFS_FILE_EXTENT_INLINE) 3258 continue; 3259 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 3260 if (bytenr == 0) 3261 continue; 3262 3263 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 3264 key.offset -= btrfs_file_extent_offset(buf, fi); 3265 ret = process_func(trans, fs_info, bytenr, num_bytes, 3266 parent, ref_root, key.objectid, 3267 key.offset); 3268 if (ret) 3269 goto fail; 3270 } else { 3271 bytenr = btrfs_node_blockptr(buf, i); 3272 num_bytes = fs_info->nodesize; 3273 ret = process_func(trans, fs_info, bytenr, num_bytes, 3274 parent, ref_root, level - 1, 0); 3275 if (ret) 3276 goto fail; 3277 } 3278 } 3279 return 0; 3280 fail: 3281 return ret; 3282 } 3283 3284 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3285 struct extent_buffer *buf, int full_backref) 3286 { 3287 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 3288 } 3289 3290 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3291 struct extent_buffer *buf, int full_backref) 3292 { 3293 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 3294 } 3295 3296 static int write_one_cache_group(struct btrfs_trans_handle *trans, 3297 struct btrfs_fs_info *fs_info, 3298 struct btrfs_path *path, 3299 struct btrfs_block_group_cache *cache) 3300 { 3301 int ret; 3302 struct btrfs_root *extent_root = fs_info->extent_root; 3303 unsigned long bi; 3304 struct extent_buffer *leaf; 3305 3306 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3307 if (ret) { 3308 if (ret > 0) 3309 ret = -ENOENT; 3310 goto fail; 3311 } 3312 3313 leaf = path->nodes[0]; 3314 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3315 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 3316 btrfs_mark_buffer_dirty(leaf); 3317 fail: 3318 btrfs_release_path(path); 3319 return ret; 3320 3321 } 3322 3323 static struct btrfs_block_group_cache * 3324 next_block_group(struct btrfs_fs_info *fs_info, 3325 struct btrfs_block_group_cache *cache) 3326 { 3327 struct rb_node *node; 3328 3329 spin_lock(&fs_info->block_group_cache_lock); 3330 3331 /* If our block group was removed, we need a full search. 
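 * The removal path clears cache_node, which is what RB_EMPTY_NODE() catches
 * below; in that case restart from the group's old end offset instead of
 * following stale rbtree pointers.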
*/ 3332 if (RB_EMPTY_NODE(&cache->cache_node)) { 3333 const u64 next_bytenr = cache->key.objectid + cache->key.offset; 3334 3335 spin_unlock(&fs_info->block_group_cache_lock); 3336 btrfs_put_block_group(cache); 3337 cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; 3338 } 3339 node = rb_next(&cache->cache_node); 3340 btrfs_put_block_group(cache); 3341 if (node) { 3342 cache = rb_entry(node, struct btrfs_block_group_cache, 3343 cache_node); 3344 btrfs_get_block_group(cache); 3345 } else 3346 cache = NULL; 3347 spin_unlock(&fs_info->block_group_cache_lock); 3348 return cache; 3349 } 3350 3351 static int cache_save_setup(struct btrfs_block_group_cache *block_group, 3352 struct btrfs_trans_handle *trans, 3353 struct btrfs_path *path) 3354 { 3355 struct btrfs_fs_info *fs_info = block_group->fs_info; 3356 struct btrfs_root *root = fs_info->tree_root; 3357 struct inode *inode = NULL; 3358 u64 alloc_hint = 0; 3359 int dcs = BTRFS_DC_ERROR; 3360 u64 num_pages = 0; 3361 int retries = 0; 3362 int ret = 0; 3363 3364 /* 3365 * If this block group is smaller than 100 megs don't bother caching the 3366 * block group. 3367 */ 3368 if (block_group->key.offset < (100 * SZ_1M)) { 3369 spin_lock(&block_group->lock); 3370 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3371 spin_unlock(&block_group->lock); 3372 return 0; 3373 } 3374 3375 if (trans->aborted) 3376 return 0; 3377 again: 3378 inode = lookup_free_space_inode(fs_info, block_group, path); 3379 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 3380 ret = PTR_ERR(inode); 3381 btrfs_release_path(path); 3382 goto out; 3383 } 3384 3385 if (IS_ERR(inode)) { 3386 BUG_ON(retries); 3387 retries++; 3388 3389 if (block_group->ro) 3390 goto out_free; 3391 3392 ret = create_free_space_inode(fs_info, trans, block_group, 3393 path); 3394 if (ret) 3395 goto out_free; 3396 goto again; 3397 } 3398 3399 /* We've already setup this transaction, go ahead and exit */ 3400 if (block_group->cache_generation == trans->transid && 3401 i_size_read(inode)) { 3402 dcs = BTRFS_DC_SETUP; 3403 goto out_put; 3404 } 3405 3406 /* 3407 * We want to set the generation to 0, that way if anything goes wrong 3408 * from here on out we know not to trust this cache when we load up next 3409 * time. 3410 */ 3411 BTRFS_I(inode)->generation = 0; 3412 ret = btrfs_update_inode(trans, root, inode); 3413 if (ret) { 3414 /* 3415 * So theoretically we could recover from this, simply set the 3416 * super cache generation to 0 so we know to invalidate the 3417 * cache, but then we'd have to keep track of the block groups 3418 * that fail this way so we know we _have_ to reset this cache 3419 * before the next commit or risk reading stale cache. So to 3420 * limit our exposure to horrible edge cases lets just abort the 3421 * transaction, this only happens in really bad situations 3422 * anyway. 3423 */ 3424 btrfs_abort_transaction(trans, ret); 3425 goto out_put; 3426 } 3427 WARN_ON(ret); 3428 3429 if (i_size_read(inode) > 0) { 3430 ret = btrfs_check_trunc_cache_free_space(fs_info, 3431 &fs_info->global_block_rsv); 3432 if (ret) 3433 goto out_put; 3434 3435 ret = btrfs_truncate_free_space_cache(trans, NULL, inode); 3436 if (ret) 3437 goto out_put; 3438 } 3439 3440 spin_lock(&block_group->lock); 3441 if (block_group->cached != BTRFS_CACHE_FINISHED || 3442 !btrfs_test_opt(fs_info, SPACE_CACHE)) { 3443 /* 3444 * don't bother trying to write stuff out _if_ 3445 * a) we're not cached, 3446 * b) we're with nospace_cache mount option. 
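 * In either case mark the group BTRFS_DC_WRITTEN so the commit paths skip
 * any further cache IO for it.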
3447 */ 3448 dcs = BTRFS_DC_WRITTEN; 3449 spin_unlock(&block_group->lock); 3450 goto out_put; 3451 } 3452 spin_unlock(&block_group->lock); 3453 3454 /* 3455 * We hit an ENOSPC when setting up the cache in this transaction, just 3456 * skip doing the setup, we've already cleared the cache so we're safe. 3457 */ 3458 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3459 ret = -ENOSPC; 3460 goto out_put; 3461 } 3462 3463 /* 3464 * Try to preallocate enough space based on how big the block group is. 3465 * Keep in mind this has to include any pinned space which could end up 3466 * taking up quite a bit since it's not folded into the other space 3467 * cache. 3468 */ 3469 num_pages = div_u64(block_group->key.offset, SZ_256M); 3470 if (!num_pages) 3471 num_pages = 1; 3472 3473 num_pages *= 16; 3474 num_pages *= PAGE_SIZE; 3475 3476 ret = btrfs_check_data_free_space(inode, 0, num_pages); 3477 if (ret) 3478 goto out_put; 3479 3480 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3481 num_pages, num_pages, 3482 &alloc_hint); 3483 /* 3484 * Our cache requires contiguous chunks so that we don't modify a bunch 3485 * of metadata or split extents when writing the cache out, which means 3486 * we can enospc if we are heavily fragmented in addition to just normal 3487 * out of space conditions. So if we hit this just skip setting up any 3488 * other block groups for this transaction, maybe we'll unpin enough 3489 * space the next time around. 3490 */ 3491 if (!ret) 3492 dcs = BTRFS_DC_SETUP; 3493 else if (ret == -ENOSPC) 3494 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3495 3496 out_put: 3497 iput(inode); 3498 out_free: 3499 btrfs_release_path(path); 3500 out: 3501 spin_lock(&block_group->lock); 3502 if (!ret && dcs == BTRFS_DC_SETUP) 3503 block_group->cache_generation = trans->transid; 3504 block_group->disk_cache_state = dcs; 3505 spin_unlock(&block_group->lock); 3506 3507 return ret; 3508 } 3509 3510 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, 3511 struct btrfs_fs_info *fs_info) 3512 { 3513 struct btrfs_block_group_cache *cache, *tmp; 3514 struct btrfs_transaction *cur_trans = trans->transaction; 3515 struct btrfs_path *path; 3516 3517 if (list_empty(&cur_trans->dirty_bgs) || 3518 !btrfs_test_opt(fs_info, SPACE_CACHE)) 3519 return 0; 3520 3521 path = btrfs_alloc_path(); 3522 if (!path) 3523 return -ENOMEM; 3524 3525 /* Could add new block groups, use _safe just in case */ 3526 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, 3527 dirty_list) { 3528 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3529 cache_save_setup(cache, trans, path); 3530 } 3531 3532 btrfs_free_path(path); 3533 return 0; 3534 } 3535 3536 /* 3537 * transaction commit does final block group cache writeback during a 3538 * critical section where nothing is allowed to change the FS. This is 3539 * required in order for the cache to actually match the block group, 3540 * but can introduce a lot of latency into the commit. 3541 * 3542 * So, btrfs_start_dirty_block_groups is here to kick off block group 3543 * cache IO. There's a chance we'll have to redo some of it if the 3544 * block group changes again during the commit, but it greatly reduces 3545 * the commit latency by getting rid of the easy block groups while 3546 * we're still allowing others to join the commit. 
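 *
 * Anything still dirty (or re-dirtied) after this pass is written again by
 * btrfs_write_dirty_block_groups() inside the commit critical section.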
3547 */ 3548 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, 3549 struct btrfs_fs_info *fs_info) 3550 { 3551 struct btrfs_block_group_cache *cache; 3552 struct btrfs_transaction *cur_trans = trans->transaction; 3553 int ret = 0; 3554 int should_put; 3555 struct btrfs_path *path = NULL; 3556 LIST_HEAD(dirty); 3557 struct list_head *io = &cur_trans->io_bgs; 3558 int num_started = 0; 3559 int loops = 0; 3560 3561 spin_lock(&cur_trans->dirty_bgs_lock); 3562 if (list_empty(&cur_trans->dirty_bgs)) { 3563 spin_unlock(&cur_trans->dirty_bgs_lock); 3564 return 0; 3565 } 3566 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3567 spin_unlock(&cur_trans->dirty_bgs_lock); 3568 3569 again: 3570 /* 3571 * make sure all the block groups on our dirty list actually 3572 * exist 3573 */ 3574 btrfs_create_pending_block_groups(trans, fs_info); 3575 3576 if (!path) { 3577 path = btrfs_alloc_path(); 3578 if (!path) 3579 return -ENOMEM; 3580 } 3581 3582 /* 3583 * cache_write_mutex is here only to save us from balance or automatic 3584 * removal of empty block groups deleting this block group while we are 3585 * writing out the cache 3586 */ 3587 mutex_lock(&trans->transaction->cache_write_mutex); 3588 while (!list_empty(&dirty)) { 3589 cache = list_first_entry(&dirty, 3590 struct btrfs_block_group_cache, 3591 dirty_list); 3592 /* 3593 * this can happen if something re-dirties a block 3594 * group that is already under IO. Just wait for it to 3595 * finish and then do it all again 3596 */ 3597 if (!list_empty(&cache->io_list)) { 3598 list_del_init(&cache->io_list); 3599 btrfs_wait_cache_io(trans, cache, path); 3600 btrfs_put_block_group(cache); 3601 } 3602 3603 3604 /* 3605 * btrfs_wait_cache_io uses the cache->dirty_list to decide 3606 * if it should update the cache_state. Don't delete 3607 * until after we wait. 3608 * 3609 * Since we're not running in the commit critical section 3610 * we need the dirty_bgs_lock to protect from update_block_group 3611 */ 3612 spin_lock(&cur_trans->dirty_bgs_lock); 3613 list_del_init(&cache->dirty_list); 3614 spin_unlock(&cur_trans->dirty_bgs_lock); 3615 3616 should_put = 1; 3617 3618 cache_save_setup(cache, trans, path); 3619 3620 if (cache->disk_cache_state == BTRFS_DC_SETUP) { 3621 cache->io_ctl.inode = NULL; 3622 ret = btrfs_write_out_cache(fs_info, trans, 3623 cache, path); 3624 if (ret == 0 && cache->io_ctl.inode) { 3625 num_started++; 3626 should_put = 0; 3627 3628 /* 3629 * the cache_write_mutex is protecting 3630 * the io_list 3631 */ 3632 list_add_tail(&cache->io_list, io); 3633 } else { 3634 /* 3635 * if we failed to write the cache, the 3636 * generation will be bad and life goes on 3637 */ 3638 ret = 0; 3639 } 3640 } 3641 if (!ret) { 3642 ret = write_one_cache_group(trans, fs_info, 3643 path, cache); 3644 /* 3645 * Our block group might still be attached to the list 3646 * of new block groups in the transaction handle of some 3647 * other task (struct btrfs_trans_handle->new_bgs). This 3648 * means its block group item isn't yet in the extent 3649 * tree. If this happens ignore the error, as we will 3650 * try again later in the critical section of the 3651 * transaction commit. 
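 * Re-queueing the group on cur_trans->dirty_bgs below is what makes that
 * later retry actually happen.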
3652 */ 3653 if (ret == -ENOENT) { 3654 ret = 0; 3655 spin_lock(&cur_trans->dirty_bgs_lock); 3656 if (list_empty(&cache->dirty_list)) { 3657 list_add_tail(&cache->dirty_list, 3658 &cur_trans->dirty_bgs); 3659 btrfs_get_block_group(cache); 3660 } 3661 spin_unlock(&cur_trans->dirty_bgs_lock); 3662 } else if (ret) { 3663 btrfs_abort_transaction(trans, ret); 3664 } 3665 } 3666 3667 /* if its not on the io list, we need to put the block group */ 3668 if (should_put) 3669 btrfs_put_block_group(cache); 3670 3671 if (ret) 3672 break; 3673 3674 /* 3675 * Avoid blocking other tasks for too long. It might even save 3676 * us from writing caches for block groups that are going to be 3677 * removed. 3678 */ 3679 mutex_unlock(&trans->transaction->cache_write_mutex); 3680 mutex_lock(&trans->transaction->cache_write_mutex); 3681 } 3682 mutex_unlock(&trans->transaction->cache_write_mutex); 3683 3684 /* 3685 * go through delayed refs for all the stuff we've just kicked off 3686 * and then loop back (just once) 3687 */ 3688 ret = btrfs_run_delayed_refs(trans, fs_info, 0); 3689 if (!ret && loops == 0) { 3690 loops++; 3691 spin_lock(&cur_trans->dirty_bgs_lock); 3692 list_splice_init(&cur_trans->dirty_bgs, &dirty); 3693 /* 3694 * dirty_bgs_lock protects us from concurrent block group 3695 * deletes too (not just cache_write_mutex). 3696 */ 3697 if (!list_empty(&dirty)) { 3698 spin_unlock(&cur_trans->dirty_bgs_lock); 3699 goto again; 3700 } 3701 spin_unlock(&cur_trans->dirty_bgs_lock); 3702 } else if (ret < 0) { 3703 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 3704 } 3705 3706 btrfs_free_path(path); 3707 return ret; 3708 } 3709 3710 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3711 struct btrfs_fs_info *fs_info) 3712 { 3713 struct btrfs_block_group_cache *cache; 3714 struct btrfs_transaction *cur_trans = trans->transaction; 3715 int ret = 0; 3716 int should_put; 3717 struct btrfs_path *path; 3718 struct list_head *io = &cur_trans->io_bgs; 3719 int num_started = 0; 3720 3721 path = btrfs_alloc_path(); 3722 if (!path) 3723 return -ENOMEM; 3724 3725 /* 3726 * Even though we are in the critical section of the transaction commit, 3727 * we can still have concurrent tasks adding elements to this 3728 * transaction's list of dirty block groups. These tasks correspond to 3729 * endio free space workers started when writeback finishes for a 3730 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can 3731 * allocate new block groups as a result of COWing nodes of the root 3732 * tree when updating the free space inode. The writeback for the space 3733 * caches is triggered by an earlier call to 3734 * btrfs_start_dirty_block_groups() and iterations of the following 3735 * loop. 3736 * Also we want to do the cache_save_setup first and then run the 3737 * delayed refs to make sure we have the best chance at doing this all 3738 * in one shot. 3739 */ 3740 spin_lock(&cur_trans->dirty_bgs_lock); 3741 while (!list_empty(&cur_trans->dirty_bgs)) { 3742 cache = list_first_entry(&cur_trans->dirty_bgs, 3743 struct btrfs_block_group_cache, 3744 dirty_list); 3745 3746 /* 3747 * this can happen if cache_save_setup re-dirties a block 3748 * group that is already under IO. 
Just wait for it to
		 * finish and then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		/*
		 * don't remove from the dirty list until after we've waited
		 * on any pending IO
		 */
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (!ret)
			ret = btrfs_run_delayed_refs(trans, fs_info,
						     (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(fs_info, trans,
						    cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				num_started++;
				should_put = 0;
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * if we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = write_one_cache_group(trans, fs_info,
						    path, cache);
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * very rare case so no need for a more efficient and
			 * complex approach.
3804 */ 3805 if (ret == -ENOENT) { 3806 wait_event(cur_trans->writer_wait, 3807 atomic_read(&cur_trans->num_writers) == 1); 3808 ret = write_one_cache_group(trans, fs_info, 3809 path, cache); 3810 } 3811 if (ret) 3812 btrfs_abort_transaction(trans, ret); 3813 } 3814 3815 /* if its not on the io list, we need to put the block group */ 3816 if (should_put) 3817 btrfs_put_block_group(cache); 3818 spin_lock(&cur_trans->dirty_bgs_lock); 3819 } 3820 spin_unlock(&cur_trans->dirty_bgs_lock); 3821 3822 while (!list_empty(io)) { 3823 cache = list_first_entry(io, struct btrfs_block_group_cache, 3824 io_list); 3825 list_del_init(&cache->io_list); 3826 btrfs_wait_cache_io(trans, cache, path); 3827 btrfs_put_block_group(cache); 3828 } 3829 3830 btrfs_free_path(path); 3831 return ret; 3832 } 3833 3834 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) 3835 { 3836 struct btrfs_block_group_cache *block_group; 3837 int readonly = 0; 3838 3839 block_group = btrfs_lookup_block_group(fs_info, bytenr); 3840 if (!block_group || block_group->ro) 3841 readonly = 1; 3842 if (block_group) 3843 btrfs_put_block_group(block_group); 3844 return readonly; 3845 } 3846 3847 bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3848 { 3849 struct btrfs_block_group_cache *bg; 3850 bool ret = true; 3851 3852 bg = btrfs_lookup_block_group(fs_info, bytenr); 3853 if (!bg) 3854 return false; 3855 3856 spin_lock(&bg->lock); 3857 if (bg->ro) 3858 ret = false; 3859 else 3860 atomic_inc(&bg->nocow_writers); 3861 spin_unlock(&bg->lock); 3862 3863 /* no put on block group, done by btrfs_dec_nocow_writers */ 3864 if (!ret) 3865 btrfs_put_block_group(bg); 3866 3867 return ret; 3868 3869 } 3870 3871 void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) 3872 { 3873 struct btrfs_block_group_cache *bg; 3874 3875 bg = btrfs_lookup_block_group(fs_info, bytenr); 3876 ASSERT(bg); 3877 if (atomic_dec_and_test(&bg->nocow_writers)) 3878 wake_up_atomic_t(&bg->nocow_writers); 3879 /* 3880 * Once for our lookup and once for the lookup done by a previous call 3881 * to btrfs_inc_nocow_writers() 3882 */ 3883 btrfs_put_block_group(bg); 3884 btrfs_put_block_group(bg); 3885 } 3886 3887 static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) 3888 { 3889 schedule(); 3890 return 0; 3891 } 3892 3893 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) 3894 { 3895 wait_on_atomic_t(&bg->nocow_writers, 3896 btrfs_wait_nocow_writers_atomic_t, 3897 TASK_UNINTERRUPTIBLE); 3898 } 3899 3900 static const char *alloc_name(u64 flags) 3901 { 3902 switch (flags) { 3903 case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: 3904 return "mixed"; 3905 case BTRFS_BLOCK_GROUP_METADATA: 3906 return "metadata"; 3907 case BTRFS_BLOCK_GROUP_DATA: 3908 return "data"; 3909 case BTRFS_BLOCK_GROUP_SYSTEM: 3910 return "system"; 3911 default: 3912 WARN_ON(1); 3913 return "invalid-combination"; 3914 }; 3915 } 3916 3917 static int update_space_info(struct btrfs_fs_info *info, u64 flags, 3918 u64 total_bytes, u64 bytes_used, 3919 u64 bytes_readonly, 3920 struct btrfs_space_info **space_info) 3921 { 3922 struct btrfs_space_info *found; 3923 int i; 3924 int factor; 3925 int ret; 3926 3927 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3928 BTRFS_BLOCK_GROUP_RAID10)) 3929 factor = 2; 3930 else 3931 factor = 1; 3932 3933 found = __find_space_info(info, flags); 3934 if (found) { 3935 spin_lock(&found->lock); 3936 found->total_bytes += total_bytes; 3937 found->disk_total += total_bytes * factor; 3938 
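	/*
	 * factor is 2 for DUP/RAID1/RAID10 (see above): those profiles burn
	 * two raw bytes on disk for every logical byte tracked in
	 * total_bytes/bytes_used, which is what disk_total/disk_used record.
	 */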
found->bytes_used += bytes_used; 3939 found->disk_used += bytes_used * factor; 3940 found->bytes_readonly += bytes_readonly; 3941 if (total_bytes > 0) 3942 found->full = 0; 3943 space_info_add_new_bytes(info, found, total_bytes - 3944 bytes_used - bytes_readonly); 3945 spin_unlock(&found->lock); 3946 *space_info = found; 3947 return 0; 3948 } 3949 found = kzalloc(sizeof(*found), GFP_NOFS); 3950 if (!found) 3951 return -ENOMEM; 3952 3953 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); 3954 if (ret) { 3955 kfree(found); 3956 return ret; 3957 } 3958 3959 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3960 INIT_LIST_HEAD(&found->block_groups[i]); 3961 init_rwsem(&found->groups_sem); 3962 spin_lock_init(&found->lock); 3963 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 3964 found->total_bytes = total_bytes; 3965 found->disk_total = total_bytes * factor; 3966 found->bytes_used = bytes_used; 3967 found->disk_used = bytes_used * factor; 3968 found->bytes_pinned = 0; 3969 found->bytes_reserved = 0; 3970 found->bytes_readonly = bytes_readonly; 3971 found->bytes_may_use = 0; 3972 found->full = 0; 3973 found->max_extent_size = 0; 3974 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3975 found->chunk_alloc = 0; 3976 found->flush = 0; 3977 init_waitqueue_head(&found->wait); 3978 INIT_LIST_HEAD(&found->ro_bgs); 3979 INIT_LIST_HEAD(&found->tickets); 3980 INIT_LIST_HEAD(&found->priority_tickets); 3981 3982 ret = kobject_init_and_add(&found->kobj, &space_info_ktype, 3983 info->space_info_kobj, "%s", 3984 alloc_name(found->flags)); 3985 if (ret) { 3986 kfree(found); 3987 return ret; 3988 } 3989 3990 *space_info = found; 3991 list_add_rcu(&found->list, &info->space_info); 3992 if (flags & BTRFS_BLOCK_GROUP_DATA) 3993 info->data_sinfo = found; 3994 3995 return ret; 3996 } 3997 3998 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3999 { 4000 u64 extra_flags = chunk_to_extended(flags) & 4001 BTRFS_EXTENDED_PROFILE_MASK; 4002 4003 write_seqlock(&fs_info->profiles_lock); 4004 if (flags & BTRFS_BLOCK_GROUP_DATA) 4005 fs_info->avail_data_alloc_bits |= extra_flags; 4006 if (flags & BTRFS_BLOCK_GROUP_METADATA) 4007 fs_info->avail_metadata_alloc_bits |= extra_flags; 4008 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4009 fs_info->avail_system_alloc_bits |= extra_flags; 4010 write_sequnlock(&fs_info->profiles_lock); 4011 } 4012 4013 /* 4014 * returns target flags in extended format or 0 if restripe for this 4015 * chunk_type is not in progress 4016 * 4017 * should be called with either volume_mutex or balance_lock held 4018 */ 4019 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) 4020 { 4021 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4022 u64 target = 0; 4023 4024 if (!bctl) 4025 return 0; 4026 4027 if (flags & BTRFS_BLOCK_GROUP_DATA && 4028 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4029 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target; 4030 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM && 4031 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4032 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target; 4033 } else if (flags & BTRFS_BLOCK_GROUP_METADATA && 4034 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) { 4035 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target; 4036 } 4037 4038 return target; 4039 } 4040 4041 /* 4042 * @flags: available profiles in extended format (see ctree.h) 4043 * 4044 * Returns reduced profile in chunk format. 
If profile changing is in 4045 * progress (either running or paused) picks the target profile (if it's 4046 * already available), otherwise falls back to plain reducing. 4047 */ 4048 static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) 4049 { 4050 u64 num_devices = fs_info->fs_devices->rw_devices; 4051 u64 target; 4052 u64 raid_type; 4053 u64 allowed = 0; 4054 4055 /* 4056 * see if restripe for this chunk_type is in progress, if so 4057 * try to reduce to the target profile 4058 */ 4059 spin_lock(&fs_info->balance_lock); 4060 target = get_restripe_target(fs_info, flags); 4061 if (target) { 4062 /* pick target profile only if it's already available */ 4063 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { 4064 spin_unlock(&fs_info->balance_lock); 4065 return extended_to_chunk(target); 4066 } 4067 } 4068 spin_unlock(&fs_info->balance_lock); 4069 4070 /* First, mask out the RAID levels which aren't possible */ 4071 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4072 if (num_devices >= btrfs_raid_array[raid_type].devs_min) 4073 allowed |= btrfs_raid_group[raid_type]; 4074 } 4075 allowed &= flags; 4076 4077 if (allowed & BTRFS_BLOCK_GROUP_RAID6) 4078 allowed = BTRFS_BLOCK_GROUP_RAID6; 4079 else if (allowed & BTRFS_BLOCK_GROUP_RAID5) 4080 allowed = BTRFS_BLOCK_GROUP_RAID5; 4081 else if (allowed & BTRFS_BLOCK_GROUP_RAID10) 4082 allowed = BTRFS_BLOCK_GROUP_RAID10; 4083 else if (allowed & BTRFS_BLOCK_GROUP_RAID1) 4084 allowed = BTRFS_BLOCK_GROUP_RAID1; 4085 else if (allowed & BTRFS_BLOCK_GROUP_RAID0) 4086 allowed = BTRFS_BLOCK_GROUP_RAID0; 4087 4088 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; 4089 4090 return extended_to_chunk(flags | allowed); 4091 } 4092 4093 static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) 4094 { 4095 unsigned seq; 4096 u64 flags; 4097 4098 do { 4099 flags = orig_flags; 4100 seq = read_seqbegin(&fs_info->profiles_lock); 4101 4102 if (flags & BTRFS_BLOCK_GROUP_DATA) 4103 flags |= fs_info->avail_data_alloc_bits; 4104 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 4105 flags |= fs_info->avail_system_alloc_bits; 4106 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 4107 flags |= fs_info->avail_metadata_alloc_bits; 4108 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4109 4110 return btrfs_reduce_alloc_profile(fs_info, flags); 4111 } 4112 4113 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 4114 { 4115 struct btrfs_fs_info *fs_info = root->fs_info; 4116 u64 flags; 4117 u64 ret; 4118 4119 if (data) 4120 flags = BTRFS_BLOCK_GROUP_DATA; 4121 else if (root == fs_info->chunk_root) 4122 flags = BTRFS_BLOCK_GROUP_SYSTEM; 4123 else 4124 flags = BTRFS_BLOCK_GROUP_METADATA; 4125 4126 ret = get_alloc_profile(fs_info, flags); 4127 return ret; 4128 } 4129 4130 static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, 4131 bool may_use_included) 4132 { 4133 ASSERT(s_info); 4134 return s_info->bytes_used + s_info->bytes_reserved + 4135 s_info->bytes_pinned + s_info->bytes_readonly + 4136 (may_use_included ? 
s_info->bytes_may_use : 0); 4137 } 4138 4139 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) 4140 { 4141 struct btrfs_space_info *data_sinfo; 4142 struct btrfs_root *root = inode->root; 4143 struct btrfs_fs_info *fs_info = root->fs_info; 4144 u64 used; 4145 int ret = 0; 4146 int need_commit = 2; 4147 int have_pinned_space; 4148 4149 /* make sure bytes are sectorsize aligned */ 4150 bytes = ALIGN(bytes, fs_info->sectorsize); 4151 4152 if (btrfs_is_free_space_inode(inode)) { 4153 need_commit = 0; 4154 ASSERT(current->journal_info); 4155 } 4156 4157 data_sinfo = fs_info->data_sinfo; 4158 if (!data_sinfo) 4159 goto alloc; 4160 4161 again: 4162 /* make sure we have enough space to handle the data first */ 4163 spin_lock(&data_sinfo->lock); 4164 used = btrfs_space_info_used(data_sinfo, true); 4165 4166 if (used + bytes > data_sinfo->total_bytes) { 4167 struct btrfs_trans_handle *trans; 4168 4169 /* 4170 * if we don't have enough free bytes in this space then we need 4171 * to alloc a new chunk. 4172 */ 4173 if (!data_sinfo->full) { 4174 u64 alloc_target; 4175 4176 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 4177 spin_unlock(&data_sinfo->lock); 4178 alloc: 4179 alloc_target = btrfs_get_alloc_profile(root, 1); 4180 /* 4181 * It is ugly that we don't call nolock join 4182 * transaction for the free space inode case here. 4183 * But it is safe because we only do the data space 4184 * reservation for the free space cache in the 4185 * transaction context, the common join transaction 4186 * just increase the counter of the current transaction 4187 * handler, doesn't try to acquire the trans_lock of 4188 * the fs. 4189 */ 4190 trans = btrfs_join_transaction(root); 4191 if (IS_ERR(trans)) 4192 return PTR_ERR(trans); 4193 4194 ret = do_chunk_alloc(trans, fs_info, alloc_target, 4195 CHUNK_ALLOC_NO_FORCE); 4196 btrfs_end_transaction(trans); 4197 if (ret < 0) { 4198 if (ret != -ENOSPC) 4199 return ret; 4200 else { 4201 have_pinned_space = 1; 4202 goto commit_trans; 4203 } 4204 } 4205 4206 if (!data_sinfo) 4207 data_sinfo = fs_info->data_sinfo; 4208 4209 goto again; 4210 } 4211 4212 /* 4213 * If we don't have enough pinned space to deal with this 4214 * allocation, and no removed chunk in current transaction, 4215 * don't bother committing the transaction. 4216 */ 4217 have_pinned_space = percpu_counter_compare( 4218 &data_sinfo->total_bytes_pinned, 4219 used + bytes - data_sinfo->total_bytes); 4220 spin_unlock(&data_sinfo->lock); 4221 4222 /* commit the current transaction and try again */ 4223 commit_trans: 4224 if (need_commit && 4225 !atomic_read(&fs_info->open_ioctl_trans)) { 4226 need_commit--; 4227 4228 if (need_commit > 0) { 4229 btrfs_start_delalloc_roots(fs_info, 0, -1); 4230 btrfs_wait_ordered_roots(fs_info, -1, 0, 4231 (u64)-1); 4232 } 4233 4234 trans = btrfs_join_transaction(root); 4235 if (IS_ERR(trans)) 4236 return PTR_ERR(trans); 4237 if (have_pinned_space >= 0 || 4238 test_bit(BTRFS_TRANS_HAVE_FREE_BGS, 4239 &trans->transaction->flags) || 4240 need_commit > 0) { 4241 ret = btrfs_commit_transaction(trans); 4242 if (ret) 4243 return ret; 4244 /* 4245 * The cleaner kthread might still be doing iput 4246 * operations. Wait for it to finish so that 4247 * more space is released. 
4248 */ 4249 mutex_lock(&fs_info->cleaner_delayed_iput_mutex); 4250 mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); 4251 goto again; 4252 } else { 4253 btrfs_end_transaction(trans); 4254 } 4255 } 4256 4257 trace_btrfs_space_reservation(fs_info, 4258 "space_info:enospc", 4259 data_sinfo->flags, bytes, 1); 4260 return -ENOSPC; 4261 } 4262 data_sinfo->bytes_may_use += bytes; 4263 trace_btrfs_space_reservation(fs_info, "space_info", 4264 data_sinfo->flags, bytes, 1); 4265 spin_unlock(&data_sinfo->lock); 4266 4267 return ret; 4268 } 4269 4270 /* 4271 * New check_data_free_space() with ability for precise data reservation 4272 * Will replace old btrfs_check_data_free_space(), but for patch split, 4273 * add a new function first and then replace it. 4274 */ 4275 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) 4276 { 4277 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4278 int ret; 4279 4280 /* align the range */ 4281 len = round_up(start + len, fs_info->sectorsize) - 4282 round_down(start, fs_info->sectorsize); 4283 start = round_down(start, fs_info->sectorsize); 4284 4285 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); 4286 if (ret < 0) 4287 return ret; 4288 4289 /* Use new btrfs_qgroup_reserve_data to reserve precise data space. */ 4290 ret = btrfs_qgroup_reserve_data(inode, start, len); 4291 if (ret) 4292 btrfs_free_reserved_data_space_noquota(inode, start, len); 4293 return ret; 4294 } 4295 4296 /* 4297 * Called if we need to clear a data reservation for this inode. 4298 * Normally in an error case. 4299 * 4300 * This one will *NOT* use the accurate qgroup reserved space API, just for 4301 * cases where we can't sleep and are sure it won't affect qgroup reserved space. 4302 * Like clear_bit_hook(). 4303 */ 4304 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, 4305 u64 len) 4306 { 4307 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4308 struct btrfs_space_info *data_sinfo; 4309 4310 /* Make sure the range is aligned to sectorsize */ 4311 len = round_up(start + len, fs_info->sectorsize) - 4312 round_down(start, fs_info->sectorsize); 4313 start = round_down(start, fs_info->sectorsize); 4314 4315 data_sinfo = fs_info->data_sinfo; 4316 spin_lock(&data_sinfo->lock); 4317 if (WARN_ON(data_sinfo->bytes_may_use < len)) 4318 data_sinfo->bytes_may_use = 0; 4319 else 4320 data_sinfo->bytes_may_use -= len; 4321 trace_btrfs_space_reservation(fs_info, "space_info", 4322 data_sinfo->flags, len, 0); 4323 spin_unlock(&data_sinfo->lock); 4324 } 4325 4326 /* 4327 * Called if we need to clear a data reservation for this inode. 4328 * Normally in an error case. 4329 * 4330 * This one will handle the per-inode data rsv map for the accurate reserved 4331 * space framework.
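 *
 * For example (illustrative numbers, assuming a 4KiB sectorsize): freeing
 * start == 6144, len == 3072 is widened to the aligned range
 * start == 4096, len == 8192 below, matching the rounding applied when the
 * space was reserved.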
4332 */ 4333 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) 4334 { 4335 struct btrfs_root *root = BTRFS_I(inode)->root; 4336 4337 /* Make sure the range is aligned to sectorsize */ 4338 len = round_up(start + len, root->fs_info->sectorsize) - 4339 round_down(start, root->fs_info->sectorsize); 4340 start = round_down(start, root->fs_info->sectorsize); 4341 4342 btrfs_free_reserved_data_space_noquota(inode, start, len); 4343 btrfs_qgroup_free_data(inode, start, len); 4344 } 4345 4346 static void force_metadata_allocation(struct btrfs_fs_info *info) 4347 { 4348 struct list_head *head = &info->space_info; 4349 struct btrfs_space_info *found; 4350 4351 rcu_read_lock(); 4352 list_for_each_entry_rcu(found, head, list) { 4353 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 4354 found->force_alloc = CHUNK_ALLOC_FORCE; 4355 } 4356 rcu_read_unlock(); 4357 } 4358 4359 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) 4360 { 4361 return (global->size << 1); 4362 } 4363 4364 static int should_alloc_chunk(struct btrfs_fs_info *fs_info, 4365 struct btrfs_space_info *sinfo, int force) 4366 { 4367 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4368 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 4369 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 4370 u64 thresh; 4371 4372 if (force == CHUNK_ALLOC_FORCE) 4373 return 1; 4374 4375 /* 4376 * We need to take into account the global rsv because for all intents 4377 * and purposes it's used space. Don't worry about locking the 4378 * global_rsv, it doesn't change except when the transaction commits. 4379 */ 4380 if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) 4381 num_allocated += calc_global_rsv_need_space(global_rsv); 4382 4383 /* 4384 * in limited mode, we want to have some free space up to 4385 * about 1% of the FS size. 4386 */ 4387 if (force == CHUNK_ALLOC_LIMITED) { 4388 thresh = btrfs_super_total_bytes(fs_info->super_copy); 4389 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); 4390 4391 if (num_bytes - num_allocated < thresh) 4392 return 1; 4393 } 4394 4395 if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) 4396 return 0; 4397 return 1; 4398 } 4399 4400 static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) 4401 { 4402 u64 num_dev; 4403 4404 if (type & (BTRFS_BLOCK_GROUP_RAID10 | 4405 BTRFS_BLOCK_GROUP_RAID0 | 4406 BTRFS_BLOCK_GROUP_RAID5 | 4407 BTRFS_BLOCK_GROUP_RAID6)) 4408 num_dev = fs_info->fs_devices->rw_devices; 4409 else if (type & BTRFS_BLOCK_GROUP_RAID1) 4410 num_dev = 2; 4411 else 4412 num_dev = 1; /* DUP or single */ 4413 4414 return num_dev; 4415 } 4416 4417 /* 4418 * Reserve space in the system space_info as needed for allocating or 4419 * removing a chunk: we may have to update num_devs device items and 4420 * insert or remove one chunk item. 4421 */ 4422 void check_system_chunk(struct btrfs_trans_handle *trans, 4423 struct btrfs_fs_info *fs_info, u64 type) 4424 { 4425 struct btrfs_space_info *info; 4426 u64 left; 4427 u64 thresh; 4428 int ret = 0; 4429 u64 num_devs; 4430 4431 /* 4432 * The chunk_mutex is needed because we can end up allocating a system 4433 * chunk here and we need an atomic, race-free space reservation in the chunk block reserve.
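 *
 * For example (illustrative only): when allocating a RAID1 chunk,
 * get_profile_num_devs() returns 2, so the reservation made below covers
 * two device item updates plus one chunk item insertion or removal.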
4434 */ 4435 ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); 4436 4437 info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4438 spin_lock(&info->lock); 4439 left = info->total_bytes - btrfs_space_info_used(info, true); 4440 spin_unlock(&info->lock); 4441 4442 num_devs = get_profile_num_devs(fs_info, type); 4443 4444 /* num_devs device items to update and 1 chunk item to add or remove */ 4445 thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) + 4446 btrfs_calc_trans_metadata_size(fs_info, 1); 4447 4448 if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4449 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", 4450 left, thresh, type); 4451 dump_space_info(fs_info, info, 0, 0); 4452 } 4453 4454 if (left < thresh) { 4455 u64 flags; 4456 4457 flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4458 /* 4459 * Ignore failure to create system chunk. We might end up not 4460 * needing it, as we might not need to COW all nodes/leafs from 4461 * the paths we visit in the chunk tree (they were already COWed 4462 * or created in the current transaction for example). 4463 */ 4464 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4465 } 4466 4467 if (!ret) { 4468 ret = btrfs_block_rsv_add(fs_info->chunk_root, 4469 &fs_info->chunk_block_rsv, 4470 thresh, BTRFS_RESERVE_NO_FLUSH); 4471 if (!ret) 4472 trans->chunk_bytes_reserved += thresh; 4473 } 4474 } 4475 4476 /* 4477 * If force is CHUNK_ALLOC_FORCE: 4478 * - return 1 if it successfully allocates a chunk, 4479 * - return errors including -ENOSPC otherwise. 4480 * If force is NOT CHUNK_ALLOC_FORCE: 4481 * - return 0 if it doesn't need to allocate a new chunk, 4482 * - return 1 if it successfully allocates a chunk, 4483 * - return errors including -ENOSPC otherwise. 4484 */ 4485 static int do_chunk_alloc(struct btrfs_trans_handle *trans, 4486 struct btrfs_fs_info *fs_info, u64 flags, int force) 4487 { 4488 struct btrfs_space_info *space_info; 4489 int wait_for_alloc = 0; 4490 int ret = 0; 4491 4492 /* Don't re-enter if we're already allocating a chunk */ 4493 if (trans->allocating_chunk) 4494 return -ENOSPC; 4495 4496 space_info = __find_space_info(fs_info, flags); 4497 if (!space_info) { 4498 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 4499 BUG_ON(ret); /* -ENOMEM */ 4500 } 4501 BUG_ON(!space_info); /* Logic error */ 4502 4503 again: 4504 spin_lock(&space_info->lock); 4505 if (force < space_info->force_alloc) 4506 force = space_info->force_alloc; 4507 if (space_info->full) { 4508 if (should_alloc_chunk(fs_info, space_info, force)) 4509 ret = -ENOSPC; 4510 else 4511 ret = 0; 4512 spin_unlock(&space_info->lock); 4513 return ret; 4514 } 4515 4516 if (!should_alloc_chunk(fs_info, space_info, force)) { 4517 spin_unlock(&space_info->lock); 4518 return 0; 4519 } else if (space_info->chunk_alloc) { 4520 wait_for_alloc = 1; 4521 } else { 4522 space_info->chunk_alloc = 1; 4523 } 4524 4525 spin_unlock(&space_info->lock); 4526 4527 mutex_lock(&fs_info->chunk_mutex); 4528 4529 /* 4530 * The chunk_mutex is held throughout the entirety of a chunk 4531 * allocation, so once we've acquired the chunk_mutex we know that the 4532 * other guy is done and we need to recheck and see if we should 4533 * allocate. 4534 */ 4535 if (wait_for_alloc) { 4536 mutex_unlock(&fs_info->chunk_mutex); 4537 wait_for_alloc = 0; 4538 goto again; 4539 } 4540 4541 trans->allocating_chunk = true; 4542 4543 /* 4544 * If we have mixed data/metadata chunks we want to make sure we keep 4545 * allocating mixed chunks instead of individual chunks. 
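 *
 * For example, on a filesystem created with mixed block groups
 * (mkfs.btrfs --mixed), a request for a plain DATA chunk is widened below
 * to DATA|METADATA so that new chunks match the existing mixed layout.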
4546 */ 4547 if (btrfs_mixed_space_info(space_info)) 4548 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); 4549 4550 /* 4551 * if we're doing a data chunk, go ahead and make sure that 4552 * we keep a reasonable number of metadata chunks allocated in the 4553 * FS as well. 4554 */ 4555 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4556 fs_info->data_chunk_allocations++; 4557 if (!(fs_info->data_chunk_allocations % 4558 fs_info->metadata_ratio)) 4559 force_metadata_allocation(fs_info); 4560 } 4561 4562 /* 4563 * Check if we have enough space in SYSTEM chunk because we may need 4564 * to update devices. 4565 */ 4566 check_system_chunk(trans, fs_info, flags); 4567 4568 ret = btrfs_alloc_chunk(trans, fs_info, flags); 4569 trans->allocating_chunk = false; 4570 4571 spin_lock(&space_info->lock); 4572 if (ret < 0 && ret != -ENOSPC) 4573 goto out; 4574 if (ret) 4575 space_info->full = 1; 4576 else 4577 ret = 1; 4578 4579 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 4580 out: 4581 space_info->chunk_alloc = 0; 4582 spin_unlock(&space_info->lock); 4583 mutex_unlock(&fs_info->chunk_mutex); 4584 /* 4585 * When we allocate a new chunk we reserve space in the chunk block 4586 * reserve to make sure we can COW nodes/leafs in the chunk tree or 4587 * add new nodes/leafs to it if we end up needing to do it when 4588 * inserting the chunk item and updating device items as part of the 4589 * second phase of chunk allocation, performed by 4590 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a 4591 * large number of new block groups to create in our transaction 4592 * handle's new_bgs list to avoid exhausting the chunk block reserve 4593 * in extreme cases - like having a single transaction create many new 4594 * block groups when starting to write out the free space caches of all 4595 * the block groups that were made dirty during the lifetime of the 4596 * transaction. 4597 */ 4598 if (trans->can_flush_pending_bgs && 4599 trans->chunk_bytes_reserved >= (u64)SZ_2M) { 4600 btrfs_create_pending_block_groups(trans, fs_info); 4601 btrfs_trans_release_chunk_metadata(trans); 4602 } 4603 return ret; 4604 } 4605 4606 static int can_overcommit(struct btrfs_root *root, 4607 struct btrfs_space_info *space_info, u64 bytes, 4608 enum btrfs_reserve_flush_enum flush) 4609 { 4610 struct btrfs_fs_info *fs_info = root->fs_info; 4611 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 4612 u64 profile; 4613 u64 space_size; 4614 u64 avail; 4615 u64 used; 4616 4617 /* Don't overcommit when in mixed mode. */ 4618 if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) 4619 return 0; 4620 4621 profile = btrfs_get_alloc_profile(root, 0); 4622 used = btrfs_space_info_used(space_info, false); 4623 4624 /* 4625 * We only want to allow over committing if we have lots of actual space 4626 * free, but if we don't have enough space to handle the global reserve 4627 * space then we could end up having a real enospc problem when trying 4628 * to allocate a chunk or some other such important allocation. 
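 *
 * Rough worked example (illustrative numbers): with 8GiB of unallocated
 * device space and a RAID1 metadata profile, avail starts at 8GiB, is
 * halved to 4GiB for the mirroring, and with BTRFS_RESERVE_FLUSH_ALL is
 * further reduced to 512MiB of allowed overcommit beyond total_bytes.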
4629 */ 4630 spin_lock(&global_rsv->lock); 4631 space_size = calc_global_rsv_need_space(global_rsv); 4632 spin_unlock(&global_rsv->lock); 4633 if (used + space_size >= space_info->total_bytes) 4634 return 0; 4635 4636 used += space_info->bytes_may_use; 4637 4638 spin_lock(&fs_info->free_chunk_lock); 4639 avail = fs_info->free_chunk_space; 4640 spin_unlock(&fs_info->free_chunk_lock); 4641 4642 /* 4643 * If we have dup, raid1 or raid10 then only half of the free 4644 * space is actually usable. For raid56, the space info used 4645 * doesn't include the parity drive, so we don't have to 4646 * change the math. 4647 */ 4648 if (profile & (BTRFS_BLOCK_GROUP_DUP | 4649 BTRFS_BLOCK_GROUP_RAID1 | 4650 BTRFS_BLOCK_GROUP_RAID10)) 4651 avail >>= 1; 4652 4653 /* 4654 * If we aren't flushing all things, let us overcommit up to 4655 * 1/2 of the space. If we can flush, don't let us overcommit 4656 * too much, let it overcommit up to 1/8 of the space. 4657 */ 4658 if (flush == BTRFS_RESERVE_FLUSH_ALL) 4659 avail >>= 3; 4660 else 4661 avail >>= 1; 4662 4663 if (used + bytes < space_info->total_bytes + avail) 4664 return 1; 4665 return 0; 4666 } 4667 4668 static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, 4669 unsigned long nr_pages, int nr_items) 4670 { 4671 struct super_block *sb = fs_info->sb; 4672 4673 if (down_read_trylock(&sb->s_umount)) { 4674 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); 4675 up_read(&sb->s_umount); 4676 } else { 4677 /* 4678 * We needn't worry about the filesystem going from r/w to r/o 4679 * even though we don't acquire the ->s_umount mutex, because 4680 * the filesystem should guarantee that the delalloc inode list 4681 * is empty after the filesystem becomes read-only (all dirty 4682 * pages are written to the disk). 4683 */ 4684 btrfs_start_delalloc_roots(fs_info, 0, nr_items); 4685 if (!current->journal_info) 4686 btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); 4687 } 4688 } 4689 4690 static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, 4691 u64 to_reclaim) 4692 { 4693 u64 bytes; 4694 int nr; 4695 4696 bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 4697 nr = (int)div64_u64(to_reclaim, bytes); 4698 if (!nr) 4699 nr = 1; 4700 return nr; 4701 } 4702 4703 #define EXTENT_SIZE_PER_ITEM SZ_256K 4704 4705 /* 4706 * shrink metadata reservation for delalloc 4707 */ 4708 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, 4709 bool wait_ordered) 4710 { 4711 struct btrfs_fs_info *fs_info = root->fs_info; 4712 struct btrfs_block_rsv *block_rsv; 4713 struct btrfs_space_info *space_info; 4714 struct btrfs_trans_handle *trans; 4715 u64 delalloc_bytes; 4716 u64 max_reclaim; 4717 long time_left; 4718 unsigned long nr_pages; 4719 int loops; 4720 int items; 4721 enum btrfs_reserve_flush_enum flush; 4722 4723 /* Calc the number of pages we need to flush for the space reservation */ 4724 items = calc_reclaim_items_nr(fs_info, to_reclaim); 4725 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; 4726 4727 trans = (struct btrfs_trans_handle *)current->journal_info; 4728 block_rsv = &fs_info->delalloc_block_rsv; 4729 space_info = block_rsv->space_info; 4730 4731 delalloc_bytes = percpu_counter_sum_positive( 4732 &fs_info->delalloc_bytes); 4733 if (delalloc_bytes == 0) { 4734 if (trans) 4735 return; 4736 if (wait_ordered) 4737 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4738 return; 4739 } 4740 4741 loops = 0; 4742 while (delalloc_bytes && loops < 3) { 4743 max_reclaim = min(delalloc_bytes, to_reclaim); 4744 nr_pages =
max_reclaim >> PAGE_SHIFT; 4745 btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); 4746 /* 4747 * We need to wait for the async pages to actually start before 4748 * we do anything. 4749 */ 4750 max_reclaim = atomic_read(&fs_info->async_delalloc_pages); 4751 if (!max_reclaim) 4752 goto skip_async; 4753 4754 if (max_reclaim <= nr_pages) 4755 max_reclaim = 0; 4756 else 4757 max_reclaim -= nr_pages; 4758 4759 wait_event(fs_info->async_submit_wait, 4760 atomic_read(&fs_info->async_delalloc_pages) <= 4761 (int)max_reclaim); 4762 skip_async: 4763 if (!trans) 4764 flush = BTRFS_RESERVE_FLUSH_ALL; 4765 else 4766 flush = BTRFS_RESERVE_NO_FLUSH; 4767 spin_lock(&space_info->lock); 4768 if (can_overcommit(root, space_info, orig, flush)) { 4769 spin_unlock(&space_info->lock); 4770 break; 4771 } 4772 if (list_empty(&space_info->tickets) && 4773 list_empty(&space_info->priority_tickets)) { 4774 spin_unlock(&space_info->lock); 4775 break; 4776 } 4777 spin_unlock(&space_info->lock); 4778 4779 loops++; 4780 if (wait_ordered && !trans) { 4781 btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); 4782 } else { 4783 time_left = schedule_timeout_killable(1); 4784 if (time_left) 4785 break; 4786 } 4787 delalloc_bytes = percpu_counter_sum_positive( 4788 &fs_info->delalloc_bytes); 4789 } 4790 } 4791 4792 /** 4793 * may_commit_transaction - possibly commit the transaction if it's OK to 4794 * @fs_info - the filesystem we are allocating in 4795 * @space_info - the space_info we want to reserve from 4796 * @bytes - the number of bytes we want to reserve 4797 * @force - force the commit 4798 * This will check to make sure that committing the transaction will actually 4799 * get us somewhere and then commit the transaction if it does. Otherwise it 4800 * will return -ENOSPC. 4801 */ 4802 static int may_commit_transaction(struct btrfs_fs_info *fs_info, 4803 struct btrfs_space_info *space_info, 4804 u64 bytes, int force) 4805 { 4806 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; 4807 struct btrfs_trans_handle *trans; 4808 4809 trans = (struct btrfs_trans_handle *)current->journal_info; 4810 if (trans) 4811 return -EAGAIN; 4812 4813 if (force) 4814 goto commit; 4815 4816 /* See if there is enough pinned space to make this reservation */ 4817 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4818 bytes) >= 0) 4819 goto commit; 4820 4821 /* 4822 * See if there is some space in the delayed insertion reservation for 4823 * this reservation.
4824 */ 4825 if (space_info != delayed_rsv->space_info) 4826 return -ENOSPC; 4827 4828 spin_lock(&delayed_rsv->lock); 4829 if (percpu_counter_compare(&space_info->total_bytes_pinned, 4830 bytes - delayed_rsv->size) >= 0) { 4831 spin_unlock(&delayed_rsv->lock); 4832 return -ENOSPC; 4833 } 4834 spin_unlock(&delayed_rsv->lock); 4835 4836 commit: 4837 trans = btrfs_join_transaction(fs_info->fs_root); 4838 if (IS_ERR(trans)) 4839 return -ENOSPC; 4840 4841 return btrfs_commit_transaction(trans); 4842 } 4843 4844 struct reserve_ticket { 4845 u64 bytes; 4846 int error; 4847 struct list_head list; 4848 wait_queue_head_t wait; 4849 }; 4850 4851 static int flush_space(struct btrfs_fs_info *fs_info, 4852 struct btrfs_space_info *space_info, u64 num_bytes, 4853 u64 orig_bytes, int state) 4854 { 4855 struct btrfs_root *root = fs_info->fs_root; 4856 struct btrfs_trans_handle *trans; 4857 int nr; 4858 int ret = 0; 4859 4860 switch (state) { 4861 case FLUSH_DELAYED_ITEMS_NR: 4862 case FLUSH_DELAYED_ITEMS: 4863 if (state == FLUSH_DELAYED_ITEMS_NR) 4864 nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; 4865 else 4866 nr = -1; 4867 4868 trans = btrfs_join_transaction(root); 4869 if (IS_ERR(trans)) { 4870 ret = PTR_ERR(trans); 4871 break; 4872 } 4873 ret = btrfs_run_delayed_items_nr(trans, fs_info, nr); 4874 btrfs_end_transaction(trans); 4875 break; 4876 case FLUSH_DELALLOC: 4877 case FLUSH_DELALLOC_WAIT: 4878 shrink_delalloc(root, num_bytes * 2, orig_bytes, 4879 state == FLUSH_DELALLOC_WAIT); 4880 break; 4881 case ALLOC_CHUNK: 4882 trans = btrfs_join_transaction(root); 4883 if (IS_ERR(trans)) { 4884 ret = PTR_ERR(trans); 4885 break; 4886 } 4887 ret = do_chunk_alloc(trans, fs_info, 4888 btrfs_get_alloc_profile(root, 0), 4889 CHUNK_ALLOC_NO_FORCE); 4890 btrfs_end_transaction(trans); 4891 if (ret > 0 || ret == -ENOSPC) 4892 ret = 0; 4893 break; 4894 case COMMIT_TRANS: 4895 ret = may_commit_transaction(fs_info, space_info, 4896 orig_bytes, 0); 4897 break; 4898 default: 4899 ret = -ENOSPC; 4900 break; 4901 } 4902 4903 trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, 4904 orig_bytes, state, ret); 4905 return ret; 4906 } 4907 4908 static inline u64 4909 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, 4910 struct btrfs_space_info *space_info) 4911 { 4912 struct reserve_ticket *ticket; 4913 u64 used; 4914 u64 expected; 4915 u64 to_reclaim = 0; 4916 4917 list_for_each_entry(ticket, &space_info->tickets, list) 4918 to_reclaim += ticket->bytes; 4919 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4920 to_reclaim += ticket->bytes; 4921 if (to_reclaim) 4922 return to_reclaim; 4923 4924 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); 4925 if (can_overcommit(root, space_info, to_reclaim, 4926 BTRFS_RESERVE_FLUSH_ALL)) 4927 return 0; 4928 4929 used = space_info->bytes_used + space_info->bytes_reserved + 4930 space_info->bytes_pinned + space_info->bytes_readonly + 4931 space_info->bytes_may_use; 4932 if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) 4933 expected = div_factor_fine(space_info->total_bytes, 95); 4934 else 4935 expected = div_factor_fine(space_info->total_bytes, 90); 4936 4937 if (used > expected) 4938 to_reclaim = used - expected; 4939 else 4940 to_reclaim = 0; 4941 to_reclaim = min(to_reclaim, space_info->bytes_may_use + 4942 space_info->bytes_reserved); 4943 return to_reclaim; 4944 } 4945 4946 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4947 struct btrfs_root *root, u64 used) 4948 { 4949 struct 
btrfs_fs_info *fs_info = root->fs_info; 4950 u64 thresh = div_factor_fine(space_info->total_bytes, 98); 4951 4952 /* If we're just plain full then async reclaim just slows us down. */ 4953 if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) 4954 return 0; 4955 4956 if (!btrfs_calc_reclaim_metadata_size(root, space_info)) 4957 return 0; 4958 4959 return (used >= thresh && !btrfs_fs_closing(fs_info) && 4960 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4961 } 4962 4963 static void wake_all_tickets(struct list_head *head) 4964 { 4965 struct reserve_ticket *ticket; 4966 4967 while (!list_empty(head)) { 4968 ticket = list_first_entry(head, struct reserve_ticket, list); 4969 list_del_init(&ticket->list); 4970 ticket->error = -ENOSPC; 4971 wake_up(&ticket->wait); 4972 } 4973 } 4974 4975 /* 4976 * This is for normal flushers, we can wait all goddamned day if we want to. We 4977 * will loop and continuously try to flush as long as we are making progress. 4978 * We count progress as clearing off tickets each time we have to loop. 4979 */ 4980 static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4981 { 4982 struct btrfs_fs_info *fs_info; 4983 struct btrfs_space_info *space_info; 4984 u64 to_reclaim; 4985 int flush_state; 4986 int commit_cycles = 0; 4987 u64 last_tickets_id; 4988 4989 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4990 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4991 4992 spin_lock(&space_info->lock); 4993 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 4994 space_info); 4995 if (!to_reclaim) { 4996 space_info->flush = 0; 4997 spin_unlock(&space_info->lock); 4998 return; 4999 } 5000 last_tickets_id = space_info->tickets_id; 5001 spin_unlock(&space_info->lock); 5002 5003 flush_state = FLUSH_DELAYED_ITEMS_NR; 5004 do { 5005 struct reserve_ticket *ticket; 5006 int ret; 5007 5008 ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5009 flush_state); 5010 spin_lock(&space_info->lock); 5011 if (list_empty(&space_info->tickets)) { 5012 space_info->flush = 0; 5013 spin_unlock(&space_info->lock); 5014 return; 5015 } 5016 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5017 space_info); 5018 ticket = list_first_entry(&space_info->tickets, 5019 struct reserve_ticket, list); 5020 if (last_tickets_id == space_info->tickets_id) { 5021 flush_state++; 5022 } else { 5023 last_tickets_id = space_info->tickets_id; 5024 flush_state = FLUSH_DELAYED_ITEMS_NR; 5025 if (commit_cycles) 5026 commit_cycles--; 5027 } 5028 5029 if (flush_state > COMMIT_TRANS) { 5030 commit_cycles++; 5031 if (commit_cycles > 2) { 5032 wake_all_tickets(&space_info->tickets); 5033 space_info->flush = 0; 5034 } else { 5035 flush_state = FLUSH_DELAYED_ITEMS_NR; 5036 } 5037 } 5038 spin_unlock(&space_info->lock); 5039 } while (flush_state <= COMMIT_TRANS); 5040 } 5041 5042 void btrfs_init_async_reclaim_work(struct work_struct *work) 5043 { 5044 INIT_WORK(work, btrfs_async_reclaim_metadata_space); 5045 } 5046 5047 static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, 5048 struct btrfs_space_info *space_info, 5049 struct reserve_ticket *ticket) 5050 { 5051 u64 to_reclaim; 5052 int flush_state = FLUSH_DELAYED_ITEMS_NR; 5053 5054 spin_lock(&space_info->lock); 5055 to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, 5056 space_info); 5057 if (!to_reclaim) { 5058 spin_unlock(&space_info->lock); 5059 return; 5060 } 5061 spin_unlock(&space_info->lock); 5062 5063 do { 5064 
flush_space(fs_info, space_info, to_reclaim, to_reclaim, 5065 flush_state); 5066 flush_state++; 5067 spin_lock(&space_info->lock); 5068 if (ticket->bytes == 0) { 5069 spin_unlock(&space_info->lock); 5070 return; 5071 } 5072 spin_unlock(&space_info->lock); 5073 5074 /* 5075 * Priority flushers can't wait on delalloc without 5076 * deadlocking. 5077 */ 5078 if (flush_state == FLUSH_DELALLOC || 5079 flush_state == FLUSH_DELALLOC_WAIT) 5080 flush_state = ALLOC_CHUNK; 5081 } while (flush_state < COMMIT_TRANS); 5082 } 5083 5084 static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, 5085 struct btrfs_space_info *space_info, 5086 struct reserve_ticket *ticket, u64 orig_bytes) 5087 5088 { 5089 DEFINE_WAIT(wait); 5090 int ret = 0; 5091 5092 spin_lock(&space_info->lock); 5093 while (ticket->bytes > 0 && ticket->error == 0) { 5094 ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); 5095 if (ret) { 5096 ret = -EINTR; 5097 break; 5098 } 5099 spin_unlock(&space_info->lock); 5100 5101 schedule(); 5102 5103 finish_wait(&ticket->wait, &wait); 5104 spin_lock(&space_info->lock); 5105 } 5106 if (!ret) 5107 ret = ticket->error; 5108 if (!list_empty(&ticket->list)) 5109 list_del_init(&ticket->list); 5110 if (ticket->bytes && ticket->bytes < orig_bytes) { 5111 u64 num_bytes = orig_bytes - ticket->bytes; 5112 space_info->bytes_may_use -= num_bytes; 5113 trace_btrfs_space_reservation(fs_info, "space_info", 5114 space_info->flags, num_bytes, 0); 5115 } 5116 spin_unlock(&space_info->lock); 5117 5118 return ret; 5119 } 5120 5121 /** 5122 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5123 * @root - the root we're allocating for 5124 * @space_info - the space info we want to allocate from 5125 * @orig_bytes - the number of bytes we want 5126 * @flush - whether or not we can flush to make our reservation 5127 * 5128 * This will reserve orig_bytes number of bytes from the space info associated 5129 * with the block_rsv. If there is not enough space it will make an attempt to 5130 * flush out space to make room. It will do this by flushing delalloc if 5131 * possible or committing the transaction. If flush is 0 then no attempts to 5132 * regain reservations will be made and this will fail if there is not enough 5133 * space already. 5134 */ 5135 static int __reserve_metadata_bytes(struct btrfs_root *root, 5136 struct btrfs_space_info *space_info, 5137 u64 orig_bytes, 5138 enum btrfs_reserve_flush_enum flush) 5139 { 5140 struct btrfs_fs_info *fs_info = root->fs_info; 5141 struct reserve_ticket ticket; 5142 u64 used; 5143 int ret = 0; 5144 5145 ASSERT(orig_bytes); 5146 ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); 5147 5148 spin_lock(&space_info->lock); 5149 ret = -ENOSPC; 5150 used = btrfs_space_info_used(space_info, true); 5151 5152 /* 5153 * If we have enough space then hooray, make our reservation and carry 5154 * on. If not see if we can overcommit, and if we can, hooray carry on. 5155 * If not things get more complicated. 
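 *
 * For instance (illustrative numbers): with total_bytes == 1GiB,
 * used == 900MiB and orig_bytes == 50MiB the first check below succeeds
 * immediately; at used == 990MiB we instead rely on can_overcommit(), and
 * only if that also fails do we fall through to the ticketing code.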
5156 */ 5157 if (used + orig_bytes <= space_info->total_bytes) { 5158 space_info->bytes_may_use += orig_bytes; 5159 trace_btrfs_space_reservation(fs_info, "space_info", 5160 space_info->flags, orig_bytes, 1); 5161 ret = 0; 5162 } else if (can_overcommit(root, space_info, orig_bytes, flush)) { 5163 space_info->bytes_may_use += orig_bytes; 5164 trace_btrfs_space_reservation(fs_info, "space_info", 5165 space_info->flags, orig_bytes, 1); 5166 ret = 0; 5167 } 5168 5169 /* 5170 * If we couldn't make a reservation then set up our reservation ticket 5171 * and kick the async worker if it's not already running. 5172 * 5173 * If we are a priority flusher then we just need to add our ticket to 5174 * the list and we will do our own flushing further down. 5175 */ 5176 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { 5177 ticket.bytes = orig_bytes; 5178 ticket.error = 0; 5179 init_waitqueue_head(&ticket.wait); 5180 if (flush == BTRFS_RESERVE_FLUSH_ALL) { 5181 list_add_tail(&ticket.list, &space_info->tickets); 5182 if (!space_info->flush) { 5183 space_info->flush = 1; 5184 trace_btrfs_trigger_flush(fs_info, 5185 space_info->flags, 5186 orig_bytes, flush, 5187 "enospc"); 5188 queue_work(system_unbound_wq, 5189 &root->fs_info->async_reclaim_work); 5190 } 5191 } else { 5192 list_add_tail(&ticket.list, 5193 &space_info->priority_tickets); 5194 } 5195 } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 5196 used += orig_bytes; 5197 /* 5198 * We will do the space reservation dance during log replay, 5199 * which means we won't have fs_info->fs_root set, so don't do 5200 * the async reclaim as we will panic. 5201 */ 5202 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && 5203 need_do_async_reclaim(space_info, root, used) && 5204 !work_busy(&fs_info->async_reclaim_work)) { 5205 trace_btrfs_trigger_flush(fs_info, space_info->flags, 5206 orig_bytes, flush, "preempt"); 5207 queue_work(system_unbound_wq, 5208 &fs_info->async_reclaim_work); 5209 } 5210 } 5211 spin_unlock(&space_info->lock); 5212 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) 5213 return ret; 5214 5215 if (flush == BTRFS_RESERVE_FLUSH_ALL) 5216 return wait_reserve_ticket(fs_info, space_info, &ticket, 5217 orig_bytes); 5218 5219 ret = 0; 5220 priority_reclaim_metadata_space(fs_info, space_info, &ticket); 5221 spin_lock(&space_info->lock); 5222 if (ticket.bytes) { 5223 if (ticket.bytes < orig_bytes) { 5224 u64 num_bytes = orig_bytes - ticket.bytes; 5225 space_info->bytes_may_use -= num_bytes; 5226 trace_btrfs_space_reservation(fs_info, "space_info", 5227 space_info->flags, 5228 num_bytes, 0); 5229 5230 } 5231 list_del_init(&ticket.list); 5232 ret = -ENOSPC; 5233 } 5234 spin_unlock(&space_info->lock); 5235 ASSERT(list_empty(&ticket.list)); 5236 return ret; 5237 } 5238 5239 /** 5240 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space 5241 * @root - the root we're allocating for 5242 * @block_rsv - the block_rsv we're allocating for 5243 * @orig_bytes - the number of bytes we want 5244 * @flush - whether or not we can flush to make our reservation 5245 * 5246 * This will reserve orig_bytes number of bytes from the space info associated 5247 * with the block_rsv. If there is not enough space it will make an attempt to 5248 * flush out space to make room. It will do this by flushing delalloc if 5249 * possible or committing the transaction. If flush is 0 then no attempts to 5250 * regain reservations will be made and this will fail if there is not enough 5251 * space already.
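 *
 * A typical caller (see btrfs_block_rsv_add() further down) does, roughly:
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 *	if (!ret)
 *		block_rsv_add_bytes(block_rsv, num_bytes, 1);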
5252 */ 5253 static int reserve_metadata_bytes(struct btrfs_root *root, 5254 struct btrfs_block_rsv *block_rsv, 5255 u64 orig_bytes, 5256 enum btrfs_reserve_flush_enum flush) 5257 { 5258 struct btrfs_fs_info *fs_info = root->fs_info; 5259 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5260 int ret; 5261 5262 ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, 5263 flush); 5264 if (ret == -ENOSPC && 5265 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { 5266 if (block_rsv != global_rsv && 5267 !block_rsv_use_bytes(global_rsv, orig_bytes)) 5268 ret = 0; 5269 } 5270 if (ret == -ENOSPC) 5271 trace_btrfs_space_reservation(fs_info, "space_info:enospc", 5272 block_rsv->space_info->flags, 5273 orig_bytes, 1); 5274 return ret; 5275 } 5276 5277 static struct btrfs_block_rsv *get_block_rsv( 5278 const struct btrfs_trans_handle *trans, 5279 const struct btrfs_root *root) 5280 { 5281 struct btrfs_fs_info *fs_info = root->fs_info; 5282 struct btrfs_block_rsv *block_rsv = NULL; 5283 5284 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 5285 (root == fs_info->csum_root && trans->adding_csums) || 5286 (root == fs_info->uuid_root)) 5287 block_rsv = trans->block_rsv; 5288 5289 if (!block_rsv) 5290 block_rsv = root->block_rsv; 5291 5292 if (!block_rsv) 5293 block_rsv = &fs_info->empty_block_rsv; 5294 5295 return block_rsv; 5296 } 5297 5298 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 5299 u64 num_bytes) 5300 { 5301 int ret = -ENOSPC; 5302 spin_lock(&block_rsv->lock); 5303 if (block_rsv->reserved >= num_bytes) { 5304 block_rsv->reserved -= num_bytes; 5305 if (block_rsv->reserved < block_rsv->size) 5306 block_rsv->full = 0; 5307 ret = 0; 5308 } 5309 spin_unlock(&block_rsv->lock); 5310 return ret; 5311 } 5312 5313 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, 5314 u64 num_bytes, int update_size) 5315 { 5316 spin_lock(&block_rsv->lock); 5317 block_rsv->reserved += num_bytes; 5318 if (update_size) 5319 block_rsv->size += num_bytes; 5320 else if (block_rsv->reserved >= block_rsv->size) 5321 block_rsv->full = 1; 5322 spin_unlock(&block_rsv->lock); 5323 } 5324 5325 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, 5326 struct btrfs_block_rsv *dest, u64 num_bytes, 5327 int min_factor) 5328 { 5329 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5330 u64 min_bytes; 5331 5332 if (global_rsv->space_info != dest->space_info) 5333 return -ENOSPC; 5334 5335 spin_lock(&global_rsv->lock); 5336 min_bytes = div_factor(global_rsv->size, min_factor); 5337 if (global_rsv->reserved < min_bytes + num_bytes) { 5338 spin_unlock(&global_rsv->lock); 5339 return -ENOSPC; 5340 } 5341 global_rsv->reserved -= num_bytes; 5342 if (global_rsv->reserved < global_rsv->size) 5343 global_rsv->full = 0; 5344 spin_unlock(&global_rsv->lock); 5345 5346 block_rsv_add_bytes(dest, num_bytes, 1); 5347 return 0; 5348 } 5349 5350 /* 5351 * This is for space we already have accounted in space_info->bytes_may_use, so 5352 * basically when we're returning space from block_rsv's. 
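 *
 * Space that ends up here (e.g. the excess trimmed off a block reserve by
 * block_rsv_release_bytes()) is first offered to any waiting tickets and
 * only the remainder is subtracted from bytes_may_use.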
5353 */ 5354 static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, 5355 struct btrfs_space_info *space_info, 5356 u64 num_bytes) 5357 { 5358 struct reserve_ticket *ticket; 5359 struct list_head *head; 5360 u64 used; 5361 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; 5362 bool check_overcommit = false; 5363 5364 spin_lock(&space_info->lock); 5365 head = &space_info->priority_tickets; 5366 5367 /* 5368 * If we are over our limit then we need to check and see if we can 5369 * overcommit, and if we can't then we just need to free up our space 5370 * and not satisfy any requests. 5371 */ 5372 used = space_info->bytes_used + space_info->bytes_reserved + 5373 space_info->bytes_pinned + space_info->bytes_readonly + 5374 space_info->bytes_may_use; 5375 if (used - num_bytes >= space_info->total_bytes) 5376 check_overcommit = true; 5377 again: 5378 while (!list_empty(head) && num_bytes) { 5379 ticket = list_first_entry(head, struct reserve_ticket, 5380 list); 5381 /* 5382 * We use 0 bytes because this space is already reserved, so 5383 * adding the ticket space would be a double count. 5384 */ 5385 if (check_overcommit && 5386 !can_overcommit(fs_info->extent_root, space_info, 0, 5387 flush)) 5388 break; 5389 if (num_bytes >= ticket->bytes) { 5390 list_del_init(&ticket->list); 5391 num_bytes -= ticket->bytes; 5392 ticket->bytes = 0; 5393 space_info->tickets_id++; 5394 wake_up(&ticket->wait); 5395 } else { 5396 ticket->bytes -= num_bytes; 5397 num_bytes = 0; 5398 } 5399 } 5400 5401 if (num_bytes && head == &space_info->priority_tickets) { 5402 head = &space_info->tickets; 5403 flush = BTRFS_RESERVE_FLUSH_ALL; 5404 goto again; 5405 } 5406 space_info->bytes_may_use -= num_bytes; 5407 trace_btrfs_space_reservation(fs_info, "space_info", 5408 space_info->flags, num_bytes, 0); 5409 spin_unlock(&space_info->lock); 5410 } 5411 5412 /* 5413 * This is for newly allocated space that isn't accounted in 5414 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent 5415 * we use this helper. 
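 *
 * Unlike space_info_add_old_bytes() above, bytes handed to a ticket here
 * are added to bytes_may_use, because the waiter's reservation was never
 * accounted there in the first place.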
5416 */ 5417 static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, 5418 struct btrfs_space_info *space_info, 5419 u64 num_bytes) 5420 { 5421 struct reserve_ticket *ticket; 5422 struct list_head *head = &space_info->priority_tickets; 5423 5424 again: 5425 while (!list_empty(head) && num_bytes) { 5426 ticket = list_first_entry(head, struct reserve_ticket, 5427 list); 5428 if (num_bytes >= ticket->bytes) { 5429 trace_btrfs_space_reservation(fs_info, "space_info", 5430 space_info->flags, 5431 ticket->bytes, 1); 5432 list_del_init(&ticket->list); 5433 num_bytes -= ticket->bytes; 5434 space_info->bytes_may_use += ticket->bytes; 5435 ticket->bytes = 0; 5436 space_info->tickets_id++; 5437 wake_up(&ticket->wait); 5438 } else { 5439 trace_btrfs_space_reservation(fs_info, "space_info", 5440 space_info->flags, 5441 num_bytes, 1); 5442 space_info->bytes_may_use += num_bytes; 5443 ticket->bytes -= num_bytes; 5444 num_bytes = 0; 5445 } 5446 } 5447 5448 if (num_bytes && head == &space_info->priority_tickets) { 5449 head = &space_info->tickets; 5450 goto again; 5451 } 5452 } 5453 5454 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 5455 struct btrfs_block_rsv *block_rsv, 5456 struct btrfs_block_rsv *dest, u64 num_bytes) 5457 { 5458 struct btrfs_space_info *space_info = block_rsv->space_info; 5459 5460 spin_lock(&block_rsv->lock); 5461 if (num_bytes == (u64)-1) 5462 num_bytes = block_rsv->size; 5463 block_rsv->size -= num_bytes; 5464 if (block_rsv->reserved >= block_rsv->size) { 5465 num_bytes = block_rsv->reserved - block_rsv->size; 5466 block_rsv->reserved = block_rsv->size; 5467 block_rsv->full = 1; 5468 } else { 5469 num_bytes = 0; 5470 } 5471 spin_unlock(&block_rsv->lock); 5472 5473 if (num_bytes > 0) { 5474 if (dest) { 5475 spin_lock(&dest->lock); 5476 if (!dest->full) { 5477 u64 bytes_to_add; 5478 5479 bytes_to_add = dest->size - dest->reserved; 5480 bytes_to_add = min(num_bytes, bytes_to_add); 5481 dest->reserved += bytes_to_add; 5482 if (dest->reserved >= dest->size) 5483 dest->full = 1; 5484 num_bytes -= bytes_to_add; 5485 } 5486 spin_unlock(&dest->lock); 5487 } 5488 if (num_bytes) 5489 space_info_add_old_bytes(fs_info, space_info, 5490 num_bytes); 5491 } 5492 } 5493 5494 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, 5495 struct btrfs_block_rsv *dst, u64 num_bytes, 5496 int update_size) 5497 { 5498 int ret; 5499 5500 ret = block_rsv_use_bytes(src, num_bytes); 5501 if (ret) 5502 return ret; 5503 5504 block_rsv_add_bytes(dst, num_bytes, update_size); 5505 return 0; 5506 } 5507 5508 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) 5509 { 5510 memset(rsv, 0, sizeof(*rsv)); 5511 spin_lock_init(&rsv->lock); 5512 rsv->type = type; 5513 } 5514 5515 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, 5516 unsigned short type) 5517 { 5518 struct btrfs_block_rsv *block_rsv; 5519 5520 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 5521 if (!block_rsv) 5522 return NULL; 5523 5524 btrfs_init_block_rsv(block_rsv, type); 5525 block_rsv->space_info = __find_space_info(fs_info, 5526 BTRFS_BLOCK_GROUP_METADATA); 5527 return block_rsv; 5528 } 5529 5530 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 5531 struct btrfs_block_rsv *rsv) 5532 { 5533 if (!rsv) 5534 return; 5535 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5536 kfree(rsv); 5537 } 5538 5539 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) 5540 { 5541 kfree(rsv); 5542 } 5543 5544 int btrfs_block_rsv_add(struct btrfs_root *root, 5545 struct 
btrfs_block_rsv *block_rsv, u64 num_bytes, 5546 enum btrfs_reserve_flush_enum flush) 5547 { 5548 int ret; 5549 5550 if (num_bytes == 0) 5551 return 0; 5552 5553 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5554 if (!ret) { 5555 block_rsv_add_bytes(block_rsv, num_bytes, 1); 5556 return 0; 5557 } 5558 5559 return ret; 5560 } 5561 5562 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) 5563 { 5564 u64 num_bytes = 0; 5565 int ret = -ENOSPC; 5566 5567 if (!block_rsv) 5568 return 0; 5569 5570 spin_lock(&block_rsv->lock); 5571 num_bytes = div_factor(block_rsv->size, min_factor); 5572 if (block_rsv->reserved >= num_bytes) 5573 ret = 0; 5574 spin_unlock(&block_rsv->lock); 5575 5576 return ret; 5577 } 5578 5579 int btrfs_block_rsv_refill(struct btrfs_root *root, 5580 struct btrfs_block_rsv *block_rsv, u64 min_reserved, 5581 enum btrfs_reserve_flush_enum flush) 5582 { 5583 u64 num_bytes = 0; 5584 int ret = -ENOSPC; 5585 5586 if (!block_rsv) 5587 return 0; 5588 5589 spin_lock(&block_rsv->lock); 5590 num_bytes = min_reserved; 5591 if (block_rsv->reserved >= num_bytes) 5592 ret = 0; 5593 else 5594 num_bytes -= block_rsv->reserved; 5595 spin_unlock(&block_rsv->lock); 5596 5597 if (!ret) 5598 return 0; 5599 5600 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); 5601 if (!ret) { 5602 block_rsv_add_bytes(block_rsv, num_bytes, 0); 5603 return 0; 5604 } 5605 5606 return ret; 5607 } 5608 5609 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, 5610 struct btrfs_block_rsv *block_rsv, 5611 u64 num_bytes) 5612 { 5613 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5614 5615 if (global_rsv == block_rsv || 5616 block_rsv->space_info != global_rsv->space_info) 5617 global_rsv = NULL; 5618 block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes); 5619 } 5620 5621 static void update_global_block_rsv(struct btrfs_fs_info *fs_info) 5622 { 5623 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 5624 struct btrfs_space_info *sinfo = block_rsv->space_info; 5625 u64 num_bytes; 5626 5627 /* 5628 * The global block rsv is based on the size of the extent tree, the 5629 * checksum tree and the root tree. If the fs is empty we want to set 5630 * it to a minimal amount for safety. 
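 *
 * In practice the target size works out to the bytes used by those three
 * root items, clamped to the range [SZ_16M, SZ_512M] by the code below.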
5631 */ 5632 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + 5633 btrfs_root_used(&fs_info->csum_root->root_item) + 5634 btrfs_root_used(&fs_info->tree_root->root_item); 5635 num_bytes = max_t(u64, num_bytes, SZ_16M); 5636 5637 spin_lock(&sinfo->lock); 5638 spin_lock(&block_rsv->lock); 5639 5640 block_rsv->size = min_t(u64, num_bytes, SZ_512M); 5641 5642 if (block_rsv->reserved < block_rsv->size) { 5643 num_bytes = btrfs_space_info_used(sinfo, true); 5644 if (sinfo->total_bytes > num_bytes) { 5645 num_bytes = sinfo->total_bytes - num_bytes; 5646 num_bytes = min(num_bytes, 5647 block_rsv->size - block_rsv->reserved); 5648 block_rsv->reserved += num_bytes; 5649 sinfo->bytes_may_use += num_bytes; 5650 trace_btrfs_space_reservation(fs_info, "space_info", 5651 sinfo->flags, num_bytes, 5652 1); 5653 } 5654 } else if (block_rsv->reserved > block_rsv->size) { 5655 num_bytes = block_rsv->reserved - block_rsv->size; 5656 sinfo->bytes_may_use -= num_bytes; 5657 trace_btrfs_space_reservation(fs_info, "space_info", 5658 sinfo->flags, num_bytes, 0); 5659 block_rsv->reserved = block_rsv->size; 5660 } 5661 5662 if (block_rsv->reserved == block_rsv->size) 5663 block_rsv->full = 1; 5664 else 5665 block_rsv->full = 0; 5666 5667 spin_unlock(&block_rsv->lock); 5668 spin_unlock(&sinfo->lock); 5669 } 5670 5671 static void init_global_block_rsv(struct btrfs_fs_info *fs_info) 5672 { 5673 struct btrfs_space_info *space_info; 5674 5675 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 5676 fs_info->chunk_block_rsv.space_info = space_info; 5677 5678 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 5679 fs_info->global_block_rsv.space_info = space_info; 5680 fs_info->delalloc_block_rsv.space_info = space_info; 5681 fs_info->trans_block_rsv.space_info = space_info; 5682 fs_info->empty_block_rsv.space_info = space_info; 5683 fs_info->delayed_block_rsv.space_info = space_info; 5684 5685 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 5686 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 5687 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; 5688 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 5689 if (fs_info->quota_root) 5690 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; 5691 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 5692 5693 update_global_block_rsv(fs_info); 5694 } 5695 5696 static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 5697 { 5698 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, 5699 (u64)-1); 5700 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 5701 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 5702 WARN_ON(fs_info->trans_block_rsv.size > 0); 5703 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 5704 WARN_ON(fs_info->chunk_block_rsv.size > 0); 5705 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 5706 WARN_ON(fs_info->delayed_block_rsv.size > 0); 5707 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 5708 } 5709 5710 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 5711 struct btrfs_fs_info *fs_info) 5712 { 5713 if (!trans->block_rsv) 5714 return; 5715 5716 if (!trans->bytes_reserved) 5717 return; 5718 5719 trace_btrfs_space_reservation(fs_info, "transaction", 5720 trans->transid, trans->bytes_reserved, 0); 5721 btrfs_block_rsv_release(fs_info, trans->block_rsv, 5722 trans->bytes_reserved); 5723 trans->bytes_reserved = 0; 5724 } 5725 5726 /* 5727 * To be called after all the new block groups attached to the transaction 5728 * 
handle have been created (btrfs_create_pending_block_groups()). 5729 */ 5730 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) 5731 { 5732 struct btrfs_fs_info *fs_info = trans->fs_info; 5733 5734 if (!trans->chunk_bytes_reserved) 5735 return; 5736 5737 WARN_ON_ONCE(!list_empty(&trans->new_bgs)); 5738 5739 block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, 5740 trans->chunk_bytes_reserved); 5741 trans->chunk_bytes_reserved = 0; 5742 } 5743 5744 /* Can only return 0 or -ENOSPC */ 5745 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 5746 struct btrfs_inode *inode) 5747 { 5748 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5749 struct btrfs_root *root = inode->root; 5750 /* 5751 * We always use trans->block_rsv here as we will have reserved space 5752 * for our orphan when starting the transaction. Using get_block_rsv() 5753 * here would sometimes make us choose the wrong block rsv, as we could 5754 * be doing a reloc inode for a non-refcounted root. 5755 */ 5756 struct btrfs_block_rsv *src_rsv = trans->block_rsv; 5757 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 5758 5759 /* 5760 * We need to hold space in order to delete our orphan item once we've 5761 * added it, so this takes the reservation so we can release it later 5762 * when we are truly done with the orphan item. 5763 */ 5764 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5765 5766 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 5767 num_bytes, 1); 5768 return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); 5769 } 5770 5771 void btrfs_orphan_release_metadata(struct btrfs_inode *inode) 5772 { 5773 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5774 struct btrfs_root *root = inode->root; 5775 u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); 5776 5777 trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 5778 num_bytes, 0); 5779 btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes); 5780 } 5781 5782 /* 5783 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation 5784 * root: the root of the parent directory 5785 * rsv: block reservation 5786 * items: the number of items that we need to reserve 5787 * qgroup_reserved: used to return the reserved size in qgroup 5788 * 5789 * This function is used to reserve the space for snapshot/subvolume 5790 * creation and deletion. Those operations are different from the 5791 * common file/directory operations: they change two fs/file trees 5792 * and the root tree, so the number of items that the qgroup reserves 5793 * is different from the free space reservation. Therefore we cannot 5794 * use the space reservation mechanism in start_transaction().
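 *
 * Example (assuming the default 16KiB nodesize): with quotas enabled the
 * qgroup side reserves 3 * 16KiB = 48KiB (one node for the parent inode,
 * two for the dir entries), while the block reservation below covers
 * 'items' tree operations via btrfs_calc_trans_metadata_size().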
5795 */ 5796 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, 5797 struct btrfs_block_rsv *rsv, 5798 int items, 5799 u64 *qgroup_reserved, 5800 bool use_global_rsv) 5801 { 5802 u64 num_bytes; 5803 int ret; 5804 struct btrfs_fs_info *fs_info = root->fs_info; 5805 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 5806 5807 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5808 /* One for parent inode, two for dir entries */ 5809 num_bytes = 3 * fs_info->nodesize; 5810 ret = btrfs_qgroup_reserve_meta(root, num_bytes, true); 5811 if (ret) 5812 return ret; 5813 } else { 5814 num_bytes = 0; 5815 } 5816 5817 *qgroup_reserved = num_bytes; 5818 5819 num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); 5820 rsv->space_info = __find_space_info(fs_info, 5821 BTRFS_BLOCK_GROUP_METADATA); 5822 ret = btrfs_block_rsv_add(root, rsv, num_bytes, 5823 BTRFS_RESERVE_FLUSH_ALL); 5824 5825 if (ret == -ENOSPC && use_global_rsv) 5826 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); 5827 5828 if (ret && *qgroup_reserved) 5829 btrfs_qgroup_free_meta(root, *qgroup_reserved); 5830 5831 return ret; 5832 } 5833 5834 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, 5835 struct btrfs_block_rsv *rsv) 5836 { 5837 btrfs_block_rsv_release(fs_info, rsv, (u64)-1); 5838 } 5839 5840 /** 5841 * drop_outstanding_extent - drop an outstanding extent 5842 * @inode: the inode we're dropping the extent for 5843 * @num_bytes: the number of bytes we're releasing. 5844 * 5845 * This is called when we are freeing up an outstanding extent, either called 5846 * after an error or after an extent is written. This will return the number of 5847 * reserved extents that need to be freed. This must be called with 5848 * BTRFS_I(inode)->lock held. 5849 */ 5850 static unsigned drop_outstanding_extent(struct btrfs_inode *inode, 5851 u64 num_bytes) 5852 { 5853 unsigned drop_inode_space = 0; 5854 unsigned dropped_extents = 0; 5855 unsigned num_extents; 5856 5857 num_extents = count_max_extents(num_bytes); 5858 ASSERT(num_extents); 5859 ASSERT(inode->outstanding_extents >= num_extents); 5860 inode->outstanding_extents -= num_extents; 5861 5862 if (inode->outstanding_extents == 0 && 5863 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5864 &inode->runtime_flags)) 5865 drop_inode_space = 1; 5866 5867 /* 5868 * If we have more or the same amount of outstanding extents than we have 5869 * reserved then we need to leave the reserved extents count alone. 5870 */ 5871 if (inode->outstanding_extents >= inode->reserved_extents) 5872 return drop_inode_space; 5873 5874 dropped_extents = inode->reserved_extents - inode->outstanding_extents; 5875 inode->reserved_extents -= dropped_extents; 5876 return dropped_extents + drop_inode_space; 5877 } 5878 5879 /** 5880 * calc_csum_metadata_size - return the amount of metadata space that must be 5881 * reserved/freed for the given bytes. 5882 * @inode: the inode we're manipulating 5883 * @num_bytes: the number of bytes in question 5884 * @reserve: 1 if we are reserving space, 0 if we are freeing space 5885 * 5886 * This adjusts the number of csum_bytes in the inode and then returns the 5887 * correct amount of metadata that must either be reserved or freed. We 5888 * calculate how many checksums we can fit into one leaf and then divide the 5889 * number of bytes that will need to be checksumed by this value to figure out 5890 * how many checksums will be required. 
If we are adding bytes then the number 5891 * may go up and we will return the number of additional bytes that must be 5892 * reserved. If it is going down we will return the number of bytes that must 5893 * be freed. 5894 * 5895 * This must be called with BTRFS_I(inode)->lock held. 5896 */ 5897 static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, 5898 int reserve) 5899 { 5900 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5901 u64 old_csums, num_csums; 5902 5903 if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) 5904 return 0; 5905 5906 old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 5907 if (reserve) 5908 inode->csum_bytes += num_bytes; 5909 else 5910 inode->csum_bytes -= num_bytes; 5911 num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); 5912 5913 /* No change, no need to reserve more */ 5914 if (old_csums == num_csums) 5915 return 0; 5916 5917 if (reserve) 5918 return btrfs_calc_trans_metadata_size(fs_info, 5919 num_csums - old_csums); 5920 5921 return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); 5922 } 5923 5924 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) 5925 { 5926 struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); 5927 struct btrfs_root *root = inode->root; 5928 struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; 5929 u64 to_reserve = 0; 5930 u64 csum_bytes; 5931 unsigned nr_extents; 5932 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 5933 int ret = 0; 5934 bool delalloc_lock = true; 5935 u64 to_free = 0; 5936 unsigned dropped; 5937 bool release_extra = false; 5938 5939 /* If we are a free space inode we need to not flush since we will be in 5940 * the middle of a transaction commit. We also don't need the delalloc 5941 * mutex since we won't race with anybody. We need this mostly to make 5942 * lockdep shut its filthy mouth. 5943 * 5944 * If we have a transaction open (can happen if we call truncate_block 5945 * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 5946 */ 5947 if (btrfs_is_free_space_inode(inode)) { 5948 flush = BTRFS_RESERVE_NO_FLUSH; 5949 delalloc_lock = false; 5950 } else if (current->journal_info) { 5951 flush = BTRFS_RESERVE_FLUSH_LIMIT; 5952 } 5953 5954 if (flush != BTRFS_RESERVE_NO_FLUSH && 5955 btrfs_transaction_in_commit(fs_info)) 5956 schedule_timeout(1); 5957 5958 if (delalloc_lock) 5959 mutex_lock(&inode->delalloc_mutex); 5960 5961 num_bytes = ALIGN(num_bytes, fs_info->sectorsize); 5962 5963 spin_lock(&inode->lock); 5964 nr_extents = count_max_extents(num_bytes); 5965 inode->outstanding_extents += nr_extents; 5966 5967 nr_extents = 0; 5968 if (inode->outstanding_extents > inode->reserved_extents) 5969 nr_extents += inode->outstanding_extents - 5970 inode->reserved_extents; 5971 5972 /* We always want to reserve a slot for updating the inode. 
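 * That is what the "+ 1" passed to btrfs_calc_trans_metadata_size() below is
 * for; if BTRFS_INODE_DELALLOC_META_RESERVED was already set we did not need
 * the extra slot after all and hand it back further down (release_extra).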
*/ 5973 to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); 5974 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); 5975 csum_bytes = inode->csum_bytes; 5976 spin_unlock(&inode->lock); 5977 5978 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 5979 ret = btrfs_qgroup_reserve_meta(root, 5980 nr_extents * fs_info->nodesize, true); 5981 if (ret) 5982 goto out_fail; 5983 } 5984 5985 ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); 5986 if (unlikely(ret)) { 5987 btrfs_qgroup_free_meta(root, 5988 nr_extents * fs_info->nodesize); 5989 goto out_fail; 5990 } 5991 5992 spin_lock(&inode->lock); 5993 if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 5994 &inode->runtime_flags)) { 5995 to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); 5996 release_extra = true; 5997 } 5998 inode->reserved_extents += nr_extents; 5999 spin_unlock(&inode->lock); 6000 6001 if (delalloc_lock) 6002 mutex_unlock(&inode->delalloc_mutex); 6003 6004 if (to_reserve) 6005 trace_btrfs_space_reservation(fs_info, "delalloc", 6006 btrfs_ino(inode), to_reserve, 1); 6007 if (release_extra) 6008 btrfs_block_rsv_release(fs_info, block_rsv, 6009 btrfs_calc_trans_metadata_size(fs_info, 1)); 6010 return 0; 6011 6012 out_fail: 6013 spin_lock(&inode->lock); 6014 dropped = drop_outstanding_extent(inode, num_bytes); 6015 /* 6016 * If the inodes csum_bytes is the same as the original 6017 * csum_bytes then we know we haven't raced with any free()ers 6018 * so we can just reduce our inodes csum bytes and carry on. 6019 */ 6020 if (inode->csum_bytes == csum_bytes) { 6021 calc_csum_metadata_size(inode, num_bytes, 0); 6022 } else { 6023 u64 orig_csum_bytes = inode->csum_bytes; 6024 u64 bytes; 6025 6026 /* 6027 * This is tricky, but first we need to figure out how much we 6028 * freed from any free-ers that occurred during this 6029 * reservation, so we reset ->csum_bytes to the csum_bytes 6030 * before we dropped our lock, and then call the free for the 6031 * number of bytes that were freed while we were trying our 6032 * reservation. 6033 */ 6034 bytes = csum_bytes - inode->csum_bytes; 6035 inode->csum_bytes = csum_bytes; 6036 to_free = calc_csum_metadata_size(inode, bytes, 0); 6037 6038 6039 /* 6040 * Now we need to see how much we would have freed had we not 6041 * been making this reservation and our ->csum_bytes were not 6042 * artificially inflated. 6043 */ 6044 inode->csum_bytes = csum_bytes - num_bytes; 6045 bytes = csum_bytes - orig_csum_bytes; 6046 bytes = calc_csum_metadata_size(inode, bytes, 0); 6047 6048 /* 6049 * Now reset ->csum_bytes to what it should be. If bytes is 6050 * more than to_free then we would have freed more space had we 6051 * not had an artificially high ->csum_bytes, so we need to free 6052 * the remainder. If bytes is the same or less then we don't 6053 * need to do anything, the other free-ers did the correct 6054 * thing. 
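 *
 * In other words (a sketch of the accounting, where C is ->csum_bytes right
 * after our reservation added num_bytes, F is what concurrent freers removed
 * while we blocked, so ->csum_bytes is currently C - F, and cost() stands for
 * the metadata size calc_csum_metadata_size() derives from a csum_bytes
 * value):
 *
 *	to_free = cost(C) - cost(C - F)		what the freers released
 *	bytes	= cost(C - num_bytes) -
 *		  cost(C - num_bytes - F)	what they should have released
 *
 * If bytes > to_free the freers under-released because of our inflated
 * ->csum_bytes and we give back the difference ourselves, otherwise nothing
 * is owed.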
6055  */
6056 		inode->csum_bytes = orig_csum_bytes - num_bytes;
6057 		if (bytes > to_free)
6058 			to_free = bytes - to_free;
6059 		else
6060 			to_free = 0;
6061 	}
6062 	spin_unlock(&inode->lock);
6063 	if (dropped)
6064 		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
6065
6066 	if (to_free) {
6067 		btrfs_block_rsv_release(fs_info, block_rsv, to_free);
6068 		trace_btrfs_space_reservation(fs_info, "delalloc",
6069 					      btrfs_ino(inode), to_free, 0);
6070 	}
6071 	if (delalloc_lock)
6072 		mutex_unlock(&inode->delalloc_mutex);
6073 	return ret;
6074 }
6075
6076 /**
6077  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6078  * @inode: the inode to release the reservation for
6079  * @num_bytes: the number of bytes we're releasing
6080  *
6081  * This will release the metadata reservation for an inode. This can be called
6082  * once we complete IO for a given set of bytes to release their metadata
6083  * reservations.
6084  */
6085 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
6086 {
6087 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6088 	u64 to_free = 0;
6089 	unsigned dropped;
6090
6091 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6092 	spin_lock(&inode->lock);
6093 	dropped = drop_outstanding_extent(inode, num_bytes);
6094
6095 	if (num_bytes)
6096 		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
6097 	spin_unlock(&inode->lock);
6098 	if (dropped > 0)
6099 		to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
6100
6101 	if (btrfs_is_testing(fs_info))
6102 		return;
6103
6104 	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
6105 				      to_free, 0);
6106
6107 	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
6108 }
6109
6110 /**
6111  * btrfs_delalloc_reserve_space - reserve data and metadata space for
6112  * delalloc
6113  * @inode: inode we're writing to
6114  * @start: start of the range we are writing to
6115  * @len: length of the range we are writing to
6116  *
6117  * This will do the following things:
6118  *
6119  * o reserve space in data space info for num bytes
6120  *   and reserve precious corresponding qgroup space
6121  *   (Done in check_data_free_space)
6122  *
6123  * o reserve space for metadata space, based on the number of outstanding
6124  *   extents and how many csums will be needed;
6125  *   also reserve metadata space in a per root over-reserve method.
6126  * o add to the inode's ->delalloc_bytes
6127  * o add it to the fs_info's delalloc inodes list.
6128  *   (Above 3 all done in delalloc_reserve_metadata)
6129  *
6130  * Return 0 for success
6131  * Return <0 for error (-ENOSPC or -EDQUOT)
6132  */
6133 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
6134 {
6135 	int ret;
6136
6137 	ret = btrfs_check_data_free_space(inode, start, len);
6138 	if (ret < 0)
6139 		return ret;
6140 	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6141 	if (ret < 0)
6142 		btrfs_free_reserved_data_space(inode, start, len);
6143 	return ret;
6144 }
6145
6146 /**
6147  * btrfs_delalloc_release_space - release data and metadata space for delalloc
6148  * @inode: inode we're releasing space for
6149  * @start: start position of the space already reserved
6150  * @len: the length of the space already reserved
6151  *
6152  * This must be matched with a call to btrfs_delalloc_reserve_space. This is
6153  * called in the case that we don't need the metadata AND data reservations
6154  * anymore, for example if there is an error or we insert an inline extent.
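 * A typical pairing (illustrative, error handling elided; the condition name
 * is a placeholder):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, start, len);
 *	...
 *	if (failed_or_wrote_inline_extent)
 *		btrfs_delalloc_release_space(inode, start, len);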
6155 * 6156 * This function will release the metadata space that was not used and will 6157 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 6158 * list if there are no delalloc bytes left. 6159 * Also it will handle the qgroup reserved space. 6160 */ 6161 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) 6162 { 6163 btrfs_delalloc_release_metadata(BTRFS_I(inode), len); 6164 btrfs_free_reserved_data_space(inode, start, len); 6165 } 6166 6167 static int update_block_group(struct btrfs_trans_handle *trans, 6168 struct btrfs_fs_info *info, u64 bytenr, 6169 u64 num_bytes, int alloc) 6170 { 6171 struct btrfs_block_group_cache *cache = NULL; 6172 u64 total = num_bytes; 6173 u64 old_val; 6174 u64 byte_in_group; 6175 int factor; 6176 6177 /* block accounting for super block */ 6178 spin_lock(&info->delalloc_root_lock); 6179 old_val = btrfs_super_bytes_used(info->super_copy); 6180 if (alloc) 6181 old_val += num_bytes; 6182 else 6183 old_val -= num_bytes; 6184 btrfs_set_super_bytes_used(info->super_copy, old_val); 6185 spin_unlock(&info->delalloc_root_lock); 6186 6187 while (total) { 6188 cache = btrfs_lookup_block_group(info, bytenr); 6189 if (!cache) 6190 return -ENOENT; 6191 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | 6192 BTRFS_BLOCK_GROUP_RAID1 | 6193 BTRFS_BLOCK_GROUP_RAID10)) 6194 factor = 2; 6195 else 6196 factor = 1; 6197 /* 6198 * If this block group has free space cache written out, we 6199 * need to make sure to load it if we are removing space. This 6200 * is because we need the unpinning stage to actually add the 6201 * space back to the block group, otherwise we will leak space. 6202 */ 6203 if (!alloc && cache->cached == BTRFS_CACHE_NO) 6204 cache_block_group(cache, 1); 6205 6206 byte_in_group = bytenr - cache->key.objectid; 6207 WARN_ON(byte_in_group > cache->key.offset); 6208 6209 spin_lock(&cache->space_info->lock); 6210 spin_lock(&cache->lock); 6211 6212 if (btrfs_test_opt(info, SPACE_CACHE) && 6213 cache->disk_cache_state < BTRFS_DC_CLEAR) 6214 cache->disk_cache_state = BTRFS_DC_CLEAR; 6215 6216 old_val = btrfs_block_group_used(&cache->item); 6217 num_bytes = min(total, cache->key.offset - byte_in_group); 6218 if (alloc) { 6219 old_val += num_bytes; 6220 btrfs_set_block_group_used(&cache->item, old_val); 6221 cache->reserved -= num_bytes; 6222 cache->space_info->bytes_reserved -= num_bytes; 6223 cache->space_info->bytes_used += num_bytes; 6224 cache->space_info->disk_used += num_bytes * factor; 6225 spin_unlock(&cache->lock); 6226 spin_unlock(&cache->space_info->lock); 6227 } else { 6228 old_val -= num_bytes; 6229 btrfs_set_block_group_used(&cache->item, old_val); 6230 cache->pinned += num_bytes; 6231 cache->space_info->bytes_pinned += num_bytes; 6232 cache->space_info->bytes_used -= num_bytes; 6233 cache->space_info->disk_used -= num_bytes * factor; 6234 spin_unlock(&cache->lock); 6235 spin_unlock(&cache->space_info->lock); 6236 6237 trace_btrfs_space_reservation(info, "pinned", 6238 cache->space_info->flags, 6239 num_bytes, 1); 6240 set_extent_dirty(info->pinned_extents, 6241 bytenr, bytenr + num_bytes - 1, 6242 GFP_NOFS | __GFP_NOFAIL); 6243 } 6244 6245 spin_lock(&trans->transaction->dirty_bgs_lock); 6246 if (list_empty(&cache->dirty_list)) { 6247 list_add_tail(&cache->dirty_list, 6248 &trans->transaction->dirty_bgs); 6249 trans->transaction->num_dirty_bgs++; 6250 btrfs_get_block_group(cache); 6251 } 6252 spin_unlock(&trans->transaction->dirty_bgs_lock); 6253 6254 /* 6255 * No longer have used bytes in this block group, 
queue it for 6256 * deletion. We do this after adding the block group to the 6257 * dirty list to avoid races between cleaner kthread and space 6258 * cache writeout. 6259 */ 6260 if (!alloc && old_val == 0) { 6261 spin_lock(&info->unused_bgs_lock); 6262 if (list_empty(&cache->bg_list)) { 6263 btrfs_get_block_group(cache); 6264 list_add_tail(&cache->bg_list, 6265 &info->unused_bgs); 6266 } 6267 spin_unlock(&info->unused_bgs_lock); 6268 } 6269 6270 btrfs_put_block_group(cache); 6271 total -= num_bytes; 6272 bytenr += num_bytes; 6273 } 6274 return 0; 6275 } 6276 6277 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) 6278 { 6279 struct btrfs_block_group_cache *cache; 6280 u64 bytenr; 6281 6282 spin_lock(&fs_info->block_group_cache_lock); 6283 bytenr = fs_info->first_logical_byte; 6284 spin_unlock(&fs_info->block_group_cache_lock); 6285 6286 if (bytenr < (u64)-1) 6287 return bytenr; 6288 6289 cache = btrfs_lookup_first_block_group(fs_info, search_start); 6290 if (!cache) 6291 return 0; 6292 6293 bytenr = cache->key.objectid; 6294 btrfs_put_block_group(cache); 6295 6296 return bytenr; 6297 } 6298 6299 static int pin_down_extent(struct btrfs_fs_info *fs_info, 6300 struct btrfs_block_group_cache *cache, 6301 u64 bytenr, u64 num_bytes, int reserved) 6302 { 6303 spin_lock(&cache->space_info->lock); 6304 spin_lock(&cache->lock); 6305 cache->pinned += num_bytes; 6306 cache->space_info->bytes_pinned += num_bytes; 6307 if (reserved) { 6308 cache->reserved -= num_bytes; 6309 cache->space_info->bytes_reserved -= num_bytes; 6310 } 6311 spin_unlock(&cache->lock); 6312 spin_unlock(&cache->space_info->lock); 6313 6314 trace_btrfs_space_reservation(fs_info, "pinned", 6315 cache->space_info->flags, num_bytes, 1); 6316 set_extent_dirty(fs_info->pinned_extents, bytenr, 6317 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 6318 return 0; 6319 } 6320 6321 /* 6322 * this function must be called within transaction 6323 */ 6324 int btrfs_pin_extent(struct btrfs_fs_info *fs_info, 6325 u64 bytenr, u64 num_bytes, int reserved) 6326 { 6327 struct btrfs_block_group_cache *cache; 6328 6329 cache = btrfs_lookup_block_group(fs_info, bytenr); 6330 BUG_ON(!cache); /* Logic error */ 6331 6332 pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); 6333 6334 btrfs_put_block_group(cache); 6335 return 0; 6336 } 6337 6338 /* 6339 * this function must be called within transaction 6340 */ 6341 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, 6342 u64 bytenr, u64 num_bytes) 6343 { 6344 struct btrfs_block_group_cache *cache; 6345 int ret; 6346 6347 cache = btrfs_lookup_block_group(fs_info, bytenr); 6348 if (!cache) 6349 return -EINVAL; 6350 6351 /* 6352 * pull in the free space cache (if any) so that our pin 6353 * removes the free space from the cache. We have load_only set 6354 * to one because the slow code to read in the free extents does check 6355 * the pinned extents. 
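 * (that is the meaning of the "1" passed to cache_block_group() below)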
6356 */ 6357 cache_block_group(cache, 1); 6358 6359 pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); 6360 6361 /* remove us from the free space cache (if we're there at all) */ 6362 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 6363 btrfs_put_block_group(cache); 6364 return ret; 6365 } 6366 6367 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 6368 u64 start, u64 num_bytes) 6369 { 6370 int ret; 6371 struct btrfs_block_group_cache *block_group; 6372 struct btrfs_caching_control *caching_ctl; 6373 6374 block_group = btrfs_lookup_block_group(fs_info, start); 6375 if (!block_group) 6376 return -EINVAL; 6377 6378 cache_block_group(block_group, 0); 6379 caching_ctl = get_caching_control(block_group); 6380 6381 if (!caching_ctl) { 6382 /* Logic error */ 6383 BUG_ON(!block_group_cache_done(block_group)); 6384 ret = btrfs_remove_free_space(block_group, start, num_bytes); 6385 } else { 6386 mutex_lock(&caching_ctl->mutex); 6387 6388 if (start >= caching_ctl->progress) { 6389 ret = add_excluded_extent(fs_info, start, num_bytes); 6390 } else if (start + num_bytes <= caching_ctl->progress) { 6391 ret = btrfs_remove_free_space(block_group, 6392 start, num_bytes); 6393 } else { 6394 num_bytes = caching_ctl->progress - start; 6395 ret = btrfs_remove_free_space(block_group, 6396 start, num_bytes); 6397 if (ret) 6398 goto out_lock; 6399 6400 num_bytes = (start + num_bytes) - 6401 caching_ctl->progress; 6402 start = caching_ctl->progress; 6403 ret = add_excluded_extent(fs_info, start, num_bytes); 6404 } 6405 out_lock: 6406 mutex_unlock(&caching_ctl->mutex); 6407 put_caching_control(caching_ctl); 6408 } 6409 btrfs_put_block_group(block_group); 6410 return ret; 6411 } 6412 6413 int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, 6414 struct extent_buffer *eb) 6415 { 6416 struct btrfs_file_extent_item *item; 6417 struct btrfs_key key; 6418 int found_type; 6419 int i; 6420 6421 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 6422 return 0; 6423 6424 for (i = 0; i < btrfs_header_nritems(eb); i++) { 6425 btrfs_item_key_to_cpu(eb, &key, i); 6426 if (key.type != BTRFS_EXTENT_DATA_KEY) 6427 continue; 6428 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 6429 found_type = btrfs_file_extent_type(eb, item); 6430 if (found_type == BTRFS_FILE_EXTENT_INLINE) 6431 continue; 6432 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 6433 continue; 6434 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 6435 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 6436 __exclude_logged_extent(fs_info, key.objectid, key.offset); 6437 } 6438 6439 return 0; 6440 } 6441 6442 static void 6443 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) 6444 { 6445 atomic_inc(&bg->reservations); 6446 } 6447 6448 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, 6449 const u64 start) 6450 { 6451 struct btrfs_block_group_cache *bg; 6452 6453 bg = btrfs_lookup_block_group(fs_info, start); 6454 ASSERT(bg); 6455 if (atomic_dec_and_test(&bg->reservations)) 6456 wake_up_atomic_t(&bg->reservations); 6457 btrfs_put_block_group(bg); 6458 } 6459 6460 static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) 6461 { 6462 schedule(); 6463 return 0; 6464 } 6465 6466 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) 6467 { 6468 struct btrfs_space_info *space_info = bg->space_info; 6469 6470 ASSERT(bg->ro); 6471 6472 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) 6473 return; 6474 6475 /* 6476 * Our block group is read only but before 
we set it to read only,
6477  * some task might have allocated an extent from it already, but it
6478  * has not yet created a respective ordered extent (and added it to a
6479  * root's list of ordered extents).
6480  * Therefore wait for any task currently allocating extents, since the
6481  * block group's reservations counter is incremented while a read lock
6482  * on the groups' semaphore is held and decremented after releasing
6483  * the read access on that semaphore and creating the ordered extent.
6484  */
6485 	down_write(&space_info->groups_sem);
6486 	up_write(&space_info->groups_sem);
6487
6488 	wait_on_atomic_t(&bg->reservations,
6489 			 btrfs_wait_bg_reservations_atomic_t,
6490 			 TASK_UNINTERRUPTIBLE);
6491 }
6492
6493 /**
6494  * btrfs_add_reserved_bytes - update the block_group and space info counters
6495  * @cache: The cache we are manipulating
6496  * @ram_bytes: The number of bytes of file content, which will be the same as
6497  *	@num_bytes except on the compression path.
6498  * @num_bytes: The number of bytes in question
6499  * @delalloc: The blocks are allocated for the delalloc write
6500  *
6501  * This is called by the allocator when it reserves space. If this is a
6502  * reservation and the block group has become read only we cannot make the
6503  * reservation and return -EAGAIN, otherwise this function always succeeds.
6504  */
6505 static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6506 				    u64 ram_bytes, u64 num_bytes, int delalloc)
6507 {
6508 	struct btrfs_space_info *space_info = cache->space_info;
6509 	int ret = 0;
6510
6511 	spin_lock(&space_info->lock);
6512 	spin_lock(&cache->lock);
6513 	if (cache->ro) {
6514 		ret = -EAGAIN;
6515 	} else {
6516 		cache->reserved += num_bytes;
6517 		space_info->bytes_reserved += num_bytes;
6518
6519 		trace_btrfs_space_reservation(cache->fs_info,
6520 				"space_info", space_info->flags,
6521 				ram_bytes, 0);
6522 		space_info->bytes_may_use -= ram_bytes;
6523 		if (delalloc)
6524 			cache->delalloc_bytes += num_bytes;
6525 	}
6526 	spin_unlock(&cache->lock);
6527 	spin_unlock(&space_info->lock);
6528 	return ret;
6529 }
6530
6531 /**
6532  * btrfs_free_reserved_bytes - update the block_group and space info counters
6533  * @cache: The cache we are manipulating
6534  * @num_bytes: The number of bytes in question
6535  * @delalloc: The blocks are allocated for the delalloc write
6536  *
6537  * This is called by somebody who is freeing space that was never actually used
6538  * on disk. For example if you reserve some space for a new leaf in transaction
6539  * A and before transaction A commits you free that leaf, you call this
6540  * to clear the reservation.
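 *
 * This is the counterpart of btrfs_add_reserved_bytes() above. For example,
 * btrfs_free_tree_block() below uses it when a tree block allocated in the
 * current transaction is freed again before it was ever written out.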
6541 */ 6542 6543 static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, 6544 u64 num_bytes, int delalloc) 6545 { 6546 struct btrfs_space_info *space_info = cache->space_info; 6547 int ret = 0; 6548 6549 spin_lock(&space_info->lock); 6550 spin_lock(&cache->lock); 6551 if (cache->ro) 6552 space_info->bytes_readonly += num_bytes; 6553 cache->reserved -= num_bytes; 6554 space_info->bytes_reserved -= num_bytes; 6555 6556 if (delalloc) 6557 cache->delalloc_bytes -= num_bytes; 6558 spin_unlock(&cache->lock); 6559 spin_unlock(&space_info->lock); 6560 return ret; 6561 } 6562 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) 6563 { 6564 struct btrfs_caching_control *next; 6565 struct btrfs_caching_control *caching_ctl; 6566 struct btrfs_block_group_cache *cache; 6567 6568 down_write(&fs_info->commit_root_sem); 6569 6570 list_for_each_entry_safe(caching_ctl, next, 6571 &fs_info->caching_block_groups, list) { 6572 cache = caching_ctl->block_group; 6573 if (block_group_cache_done(cache)) { 6574 cache->last_byte_to_unpin = (u64)-1; 6575 list_del_init(&caching_ctl->list); 6576 put_caching_control(caching_ctl); 6577 } else { 6578 cache->last_byte_to_unpin = caching_ctl->progress; 6579 } 6580 } 6581 6582 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6583 fs_info->pinned_extents = &fs_info->freed_extents[1]; 6584 else 6585 fs_info->pinned_extents = &fs_info->freed_extents[0]; 6586 6587 up_write(&fs_info->commit_root_sem); 6588 6589 update_global_block_rsv(fs_info); 6590 } 6591 6592 /* 6593 * Returns the free cluster for the given space info and sets empty_cluster to 6594 * what it should be based on the mount options. 6595 */ 6596 static struct btrfs_free_cluster * 6597 fetch_cluster_info(struct btrfs_fs_info *fs_info, 6598 struct btrfs_space_info *space_info, u64 *empty_cluster) 6599 { 6600 struct btrfs_free_cluster *ret = NULL; 6601 bool ssd = btrfs_test_opt(fs_info, SSD); 6602 6603 *empty_cluster = 0; 6604 if (btrfs_mixed_space_info(space_info)) 6605 return ret; 6606 6607 if (ssd) 6608 *empty_cluster = SZ_2M; 6609 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 6610 ret = &fs_info->meta_alloc_cluster; 6611 if (!ssd) 6612 *empty_cluster = SZ_64K; 6613 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { 6614 ret = &fs_info->data_alloc_cluster; 6615 } 6616 6617 return ret; 6618 } 6619 6620 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 6621 u64 start, u64 end, 6622 const bool return_free_space) 6623 { 6624 struct btrfs_block_group_cache *cache = NULL; 6625 struct btrfs_space_info *space_info; 6626 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6627 struct btrfs_free_cluster *cluster = NULL; 6628 u64 len; 6629 u64 total_unpinned = 0; 6630 u64 empty_cluster = 0; 6631 bool readonly; 6632 6633 while (start <= end) { 6634 readonly = false; 6635 if (!cache || 6636 start >= cache->key.objectid + cache->key.offset) { 6637 if (cache) 6638 btrfs_put_block_group(cache); 6639 total_unpinned = 0; 6640 cache = btrfs_lookup_block_group(fs_info, start); 6641 BUG_ON(!cache); /* Logic error */ 6642 6643 cluster = fetch_cluster_info(fs_info, 6644 cache->space_info, 6645 &empty_cluster); 6646 empty_cluster <<= 1; 6647 } 6648 6649 len = cache->key.objectid + cache->key.offset - start; 6650 len = min(len, end + 1 - start); 6651 6652 if (start < cache->last_byte_to_unpin) { 6653 len = min(len, cache->last_byte_to_unpin - start); 6654 if (return_free_space) 6655 btrfs_add_free_space(cache, start, len); 6656 } 6657 6658 start += 
len; 6659 total_unpinned += len; 6660 space_info = cache->space_info; 6661 6662 /* 6663 * If this space cluster has been marked as fragmented and we've 6664 * unpinned enough in this block group to potentially allow a 6665 * cluster to be created inside of it go ahead and clear the 6666 * fragmented check. 6667 */ 6668 if (cluster && cluster->fragmented && 6669 total_unpinned > empty_cluster) { 6670 spin_lock(&cluster->lock); 6671 cluster->fragmented = 0; 6672 spin_unlock(&cluster->lock); 6673 } 6674 6675 spin_lock(&space_info->lock); 6676 spin_lock(&cache->lock); 6677 cache->pinned -= len; 6678 space_info->bytes_pinned -= len; 6679 6680 trace_btrfs_space_reservation(fs_info, "pinned", 6681 space_info->flags, len, 0); 6682 space_info->max_extent_size = 0; 6683 percpu_counter_add(&space_info->total_bytes_pinned, -len); 6684 if (cache->ro) { 6685 space_info->bytes_readonly += len; 6686 readonly = true; 6687 } 6688 spin_unlock(&cache->lock); 6689 if (!readonly && return_free_space && 6690 global_rsv->space_info == space_info) { 6691 u64 to_add = len; 6692 WARN_ON(!return_free_space); 6693 spin_lock(&global_rsv->lock); 6694 if (!global_rsv->full) { 6695 to_add = min(len, global_rsv->size - 6696 global_rsv->reserved); 6697 global_rsv->reserved += to_add; 6698 space_info->bytes_may_use += to_add; 6699 if (global_rsv->reserved >= global_rsv->size) 6700 global_rsv->full = 1; 6701 trace_btrfs_space_reservation(fs_info, 6702 "space_info", 6703 space_info->flags, 6704 to_add, 1); 6705 len -= to_add; 6706 } 6707 spin_unlock(&global_rsv->lock); 6708 /* Add to any tickets we may have */ 6709 if (len) 6710 space_info_add_new_bytes(fs_info, space_info, 6711 len); 6712 } 6713 spin_unlock(&space_info->lock); 6714 } 6715 6716 if (cache) 6717 btrfs_put_block_group(cache); 6718 return 0; 6719 } 6720 6721 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 6722 struct btrfs_fs_info *fs_info) 6723 { 6724 struct btrfs_block_group_cache *block_group, *tmp; 6725 struct list_head *deleted_bgs; 6726 struct extent_io_tree *unpin; 6727 u64 start; 6728 u64 end; 6729 int ret; 6730 6731 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 6732 unpin = &fs_info->freed_extents[1]; 6733 else 6734 unpin = &fs_info->freed_extents[0]; 6735 6736 while (!trans->aborted) { 6737 mutex_lock(&fs_info->unused_bg_unpin_mutex); 6738 ret = find_first_extent_bit(unpin, 0, &start, &end, 6739 EXTENT_DIRTY, NULL); 6740 if (ret) { 6741 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6742 break; 6743 } 6744 6745 if (btrfs_test_opt(fs_info, DISCARD)) 6746 ret = btrfs_discard_extent(fs_info, start, 6747 end + 1 - start, NULL); 6748 6749 clear_extent_dirty(unpin, start, end); 6750 unpin_extent_range(fs_info, start, end, true); 6751 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 6752 cond_resched(); 6753 } 6754 6755 /* 6756 * Transaction is finished. We don't need the lock anymore. We 6757 * do need to clean up the block groups in case of a transaction 6758 * abort. 
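 * Every block group on ->deleted_bgs (removed during this transaction) gets
 * a full-range discard below, unless the transaction aborted, before we drop
 * our references on it.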
6759 */ 6760 deleted_bgs = &trans->transaction->deleted_bgs; 6761 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 6762 u64 trimmed = 0; 6763 6764 ret = -EROFS; 6765 if (!trans->aborted) 6766 ret = btrfs_discard_extent(fs_info, 6767 block_group->key.objectid, 6768 block_group->key.offset, 6769 &trimmed); 6770 6771 list_del_init(&block_group->bg_list); 6772 btrfs_put_block_group_trimming(block_group); 6773 btrfs_put_block_group(block_group); 6774 6775 if (ret) { 6776 const char *errstr = btrfs_decode_error(ret); 6777 btrfs_warn(fs_info, 6778 "Discard failed while removing blockgroup: errno=%d %s\n", 6779 ret, errstr); 6780 } 6781 } 6782 6783 return 0; 6784 } 6785 6786 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, 6787 u64 owner, u64 root_objectid) 6788 { 6789 struct btrfs_space_info *space_info; 6790 u64 flags; 6791 6792 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 6793 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID) 6794 flags = BTRFS_BLOCK_GROUP_SYSTEM; 6795 else 6796 flags = BTRFS_BLOCK_GROUP_METADATA; 6797 } else { 6798 flags = BTRFS_BLOCK_GROUP_DATA; 6799 } 6800 6801 space_info = __find_space_info(fs_info, flags); 6802 BUG_ON(!space_info); /* Logic bug */ 6803 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes); 6804 } 6805 6806 6807 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 6808 struct btrfs_fs_info *info, 6809 struct btrfs_delayed_ref_node *node, u64 parent, 6810 u64 root_objectid, u64 owner_objectid, 6811 u64 owner_offset, int refs_to_drop, 6812 struct btrfs_delayed_extent_op *extent_op) 6813 { 6814 struct btrfs_key key; 6815 struct btrfs_path *path; 6816 struct btrfs_root *extent_root = info->extent_root; 6817 struct extent_buffer *leaf; 6818 struct btrfs_extent_item *ei; 6819 struct btrfs_extent_inline_ref *iref; 6820 int ret; 6821 int is_data; 6822 int extent_slot = 0; 6823 int found_extent = 0; 6824 int num_to_del = 1; 6825 u32 item_size; 6826 u64 refs; 6827 u64 bytenr = node->bytenr; 6828 u64 num_bytes = node->num_bytes; 6829 int last_ref = 0; 6830 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 6831 6832 path = btrfs_alloc_path(); 6833 if (!path) 6834 return -ENOMEM; 6835 6836 path->reada = READA_FORWARD; 6837 path->leave_spinning = 1; 6838 6839 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 6840 BUG_ON(!is_data && refs_to_drop != 1); 6841 6842 if (is_data) 6843 skinny_metadata = 0; 6844 6845 ret = lookup_extent_backref(trans, info, path, &iref, 6846 bytenr, num_bytes, parent, 6847 root_objectid, owner_objectid, 6848 owner_offset); 6849 if (ret == 0) { 6850 extent_slot = path->slots[0]; 6851 while (extent_slot >= 0) { 6852 btrfs_item_key_to_cpu(path->nodes[0], &key, 6853 extent_slot); 6854 if (key.objectid != bytenr) 6855 break; 6856 if (key.type == BTRFS_EXTENT_ITEM_KEY && 6857 key.offset == num_bytes) { 6858 found_extent = 1; 6859 break; 6860 } 6861 if (key.type == BTRFS_METADATA_ITEM_KEY && 6862 key.offset == owner_objectid) { 6863 found_extent = 1; 6864 break; 6865 } 6866 if (path->slots[0] - extent_slot > 5) 6867 break; 6868 extent_slot--; 6869 } 6870 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6871 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); 6872 if (found_extent && item_size < sizeof(*ei)) 6873 found_extent = 0; 6874 #endif 6875 if (!found_extent) { 6876 BUG_ON(iref); 6877 ret = remove_extent_backref(trans, info, path, NULL, 6878 refs_to_drop, 6879 is_data, &last_ref); 6880 if (ret) { 6881 btrfs_abort_transaction(trans, ret); 6882 goto out; 6883 } 6884 
btrfs_release_path(path); 6885 path->leave_spinning = 1; 6886 6887 key.objectid = bytenr; 6888 key.type = BTRFS_EXTENT_ITEM_KEY; 6889 key.offset = num_bytes; 6890 6891 if (!is_data && skinny_metadata) { 6892 key.type = BTRFS_METADATA_ITEM_KEY; 6893 key.offset = owner_objectid; 6894 } 6895 6896 ret = btrfs_search_slot(trans, extent_root, 6897 &key, path, -1, 1); 6898 if (ret > 0 && skinny_metadata && path->slots[0]) { 6899 /* 6900 * Couldn't find our skinny metadata item, 6901 * see if we have ye olde extent item. 6902 */ 6903 path->slots[0]--; 6904 btrfs_item_key_to_cpu(path->nodes[0], &key, 6905 path->slots[0]); 6906 if (key.objectid == bytenr && 6907 key.type == BTRFS_EXTENT_ITEM_KEY && 6908 key.offset == num_bytes) 6909 ret = 0; 6910 } 6911 6912 if (ret > 0 && skinny_metadata) { 6913 skinny_metadata = false; 6914 key.objectid = bytenr; 6915 key.type = BTRFS_EXTENT_ITEM_KEY; 6916 key.offset = num_bytes; 6917 btrfs_release_path(path); 6918 ret = btrfs_search_slot(trans, extent_root, 6919 &key, path, -1, 1); 6920 } 6921 6922 if (ret) { 6923 btrfs_err(info, 6924 "umm, got %d back from search, was looking for %llu", 6925 ret, bytenr); 6926 if (ret > 0) 6927 btrfs_print_leaf(info, path->nodes[0]); 6928 } 6929 if (ret < 0) { 6930 btrfs_abort_transaction(trans, ret); 6931 goto out; 6932 } 6933 extent_slot = path->slots[0]; 6934 } 6935 } else if (WARN_ON(ret == -ENOENT)) { 6936 btrfs_print_leaf(info, path->nodes[0]); 6937 btrfs_err(info, 6938 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 6939 bytenr, parent, root_objectid, owner_objectid, 6940 owner_offset); 6941 btrfs_abort_transaction(trans, ret); 6942 goto out; 6943 } else { 6944 btrfs_abort_transaction(trans, ret); 6945 goto out; 6946 } 6947 6948 leaf = path->nodes[0]; 6949 item_size = btrfs_item_size_nr(leaf, extent_slot); 6950 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 6951 if (item_size < sizeof(*ei)) { 6952 BUG_ON(found_extent || extent_slot != path->slots[0]); 6953 ret = convert_extent_item_v0(trans, info, path, owner_objectid, 6954 0); 6955 if (ret < 0) { 6956 btrfs_abort_transaction(trans, ret); 6957 goto out; 6958 } 6959 6960 btrfs_release_path(path); 6961 path->leave_spinning = 1; 6962 6963 key.objectid = bytenr; 6964 key.type = BTRFS_EXTENT_ITEM_KEY; 6965 key.offset = num_bytes; 6966 6967 ret = btrfs_search_slot(trans, extent_root, &key, path, 6968 -1, 1); 6969 if (ret) { 6970 btrfs_err(info, 6971 "umm, got %d back from search, was looking for %llu", 6972 ret, bytenr); 6973 btrfs_print_leaf(info, path->nodes[0]); 6974 } 6975 if (ret < 0) { 6976 btrfs_abort_transaction(trans, ret); 6977 goto out; 6978 } 6979 6980 extent_slot = path->slots[0]; 6981 leaf = path->nodes[0]; 6982 item_size = btrfs_item_size_nr(leaf, extent_slot); 6983 } 6984 #endif 6985 BUG_ON(item_size < sizeof(*ei)); 6986 ei = btrfs_item_ptr(leaf, extent_slot, 6987 struct btrfs_extent_item); 6988 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 6989 key.type == BTRFS_EXTENT_ITEM_KEY) { 6990 struct btrfs_tree_block_info *bi; 6991 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); 6992 bi = (struct btrfs_tree_block_info *)(ei + 1); 6993 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 6994 } 6995 6996 refs = btrfs_extent_refs(leaf, ei); 6997 if (refs < refs_to_drop) { 6998 btrfs_err(info, 6999 "trying to drop %d refs but we only have %Lu for bytenr %Lu", 7000 refs_to_drop, refs, bytenr); 7001 ret = -EINVAL; 7002 btrfs_abort_transaction(trans, ret); 7003 goto out; 7004 } 7005 refs -= refs_to_drop; 7006 7007 if (refs > 0) { 7008 if 
(extent_op) 7009 __run_delayed_extent_op(extent_op, leaf, ei); 7010 /* 7011 * In the case of inline back ref, reference count will 7012 * be updated by remove_extent_backref 7013 */ 7014 if (iref) { 7015 BUG_ON(!found_extent); 7016 } else { 7017 btrfs_set_extent_refs(leaf, ei, refs); 7018 btrfs_mark_buffer_dirty(leaf); 7019 } 7020 if (found_extent) { 7021 ret = remove_extent_backref(trans, info, path, 7022 iref, refs_to_drop, 7023 is_data, &last_ref); 7024 if (ret) { 7025 btrfs_abort_transaction(trans, ret); 7026 goto out; 7027 } 7028 } 7029 add_pinned_bytes(info, -num_bytes, owner_objectid, 7030 root_objectid); 7031 } else { 7032 if (found_extent) { 7033 BUG_ON(is_data && refs_to_drop != 7034 extent_data_ref_count(path, iref)); 7035 if (iref) { 7036 BUG_ON(path->slots[0] != extent_slot); 7037 } else { 7038 BUG_ON(path->slots[0] != extent_slot + 1); 7039 path->slots[0] = extent_slot; 7040 num_to_del = 2; 7041 } 7042 } 7043 7044 last_ref = 1; 7045 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 7046 num_to_del); 7047 if (ret) { 7048 btrfs_abort_transaction(trans, ret); 7049 goto out; 7050 } 7051 btrfs_release_path(path); 7052 7053 if (is_data) { 7054 ret = btrfs_del_csums(trans, info, bytenr, num_bytes); 7055 if (ret) { 7056 btrfs_abort_transaction(trans, ret); 7057 goto out; 7058 } 7059 } 7060 7061 ret = add_to_free_space_tree(trans, info, bytenr, num_bytes); 7062 if (ret) { 7063 btrfs_abort_transaction(trans, ret); 7064 goto out; 7065 } 7066 7067 ret = update_block_group(trans, info, bytenr, num_bytes, 0); 7068 if (ret) { 7069 btrfs_abort_transaction(trans, ret); 7070 goto out; 7071 } 7072 } 7073 btrfs_release_path(path); 7074 7075 out: 7076 btrfs_free_path(path); 7077 return ret; 7078 } 7079 7080 /* 7081 * when we free an block, it is possible (and likely) that we free the last 7082 * delayed ref for that extent as well. This searches the delayed ref tree for 7083 * a given extent, and if there are no other delayed refs to be processed, it 7084 * removes it from the tree. 7085 */ 7086 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 7087 u64 bytenr) 7088 { 7089 struct btrfs_delayed_ref_head *head; 7090 struct btrfs_delayed_ref_root *delayed_refs; 7091 int ret = 0; 7092 7093 delayed_refs = &trans->transaction->delayed_refs; 7094 spin_lock(&delayed_refs->lock); 7095 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 7096 if (!head) 7097 goto out_delayed_unlock; 7098 7099 spin_lock(&head->lock); 7100 if (!list_empty(&head->ref_list)) 7101 goto out; 7102 7103 if (head->extent_op) { 7104 if (!head->must_insert_reserved) 7105 goto out; 7106 btrfs_free_delayed_extent_op(head->extent_op); 7107 head->extent_op = NULL; 7108 } 7109 7110 /* 7111 * waiting for the lock here would deadlock. If someone else has it 7112 * locked they are already in the process of dropping it anyway 7113 */ 7114 if (!mutex_trylock(&head->mutex)) 7115 goto out; 7116 7117 /* 7118 * at this point we have a head with no other entries. Go 7119 * ahead and process it. 7120 */ 7121 head->node.in_tree = 0; 7122 rb_erase(&head->href_node, &delayed_refs->href_root); 7123 7124 atomic_dec(&delayed_refs->num_entries); 7125 7126 /* 7127 * we don't take a ref on the node because we're removing it from the 7128 * tree, so we just steal the ref the tree was holding. 
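 *
 * Note for callers: we return 1 only if this was the last delayed ref head
 * for the extent and it still had must_insert_reserved set, i.e. the extent
 * item was never inserted; btrfs_free_tree_block() uses that to know the
 * extent will not be handled by delayed ref processing and it can free or
 * pin the block itself.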
7129 */ 7130 delayed_refs->num_heads--; 7131 if (head->processing == 0) 7132 delayed_refs->num_heads_ready--; 7133 head->processing = 0; 7134 spin_unlock(&head->lock); 7135 spin_unlock(&delayed_refs->lock); 7136 7137 BUG_ON(head->extent_op); 7138 if (head->must_insert_reserved) 7139 ret = 1; 7140 7141 mutex_unlock(&head->mutex); 7142 btrfs_put_delayed_ref(&head->node); 7143 return ret; 7144 out: 7145 spin_unlock(&head->lock); 7146 7147 out_delayed_unlock: 7148 spin_unlock(&delayed_refs->lock); 7149 return 0; 7150 } 7151 7152 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 7153 struct btrfs_root *root, 7154 struct extent_buffer *buf, 7155 u64 parent, int last_ref) 7156 { 7157 struct btrfs_fs_info *fs_info = root->fs_info; 7158 int pin = 1; 7159 int ret; 7160 7161 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7162 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 7163 buf->start, buf->len, 7164 parent, 7165 root->root_key.objectid, 7166 btrfs_header_level(buf), 7167 BTRFS_DROP_DELAYED_REF, NULL); 7168 BUG_ON(ret); /* -ENOMEM */ 7169 } 7170 7171 if (!last_ref) 7172 return; 7173 7174 if (btrfs_header_generation(buf) == trans->transid) { 7175 struct btrfs_block_group_cache *cache; 7176 7177 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 7178 ret = check_ref_cleanup(trans, buf->start); 7179 if (!ret) 7180 goto out; 7181 } 7182 7183 cache = btrfs_lookup_block_group(fs_info, buf->start); 7184 7185 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 7186 pin_down_extent(fs_info, cache, buf->start, 7187 buf->len, 1); 7188 btrfs_put_block_group(cache); 7189 goto out; 7190 } 7191 7192 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7193 7194 btrfs_add_free_space(cache, buf->start, buf->len); 7195 btrfs_free_reserved_bytes(cache, buf->len, 0); 7196 btrfs_put_block_group(cache); 7197 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 7198 pin = 0; 7199 } 7200 out: 7201 if (pin) 7202 add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), 7203 root->root_key.objectid); 7204 7205 /* 7206 * Deleting the buffer, clear the corrupt flag since it doesn't matter 7207 * anymore. 7208 */ 7209 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 7210 } 7211 7212 /* Can return -ENOMEM */ 7213 int btrfs_free_extent(struct btrfs_trans_handle *trans, 7214 struct btrfs_fs_info *fs_info, 7215 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 7216 u64 owner, u64 offset) 7217 { 7218 int ret; 7219 7220 if (btrfs_is_testing(fs_info)) 7221 return 0; 7222 7223 add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); 7224 7225 /* 7226 * tree log blocks never actually go into the extent allocation 7227 * tree, just update pinning info and exit early. 7228 */ 7229 if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { 7230 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); 7231 /* unlocks the pinned mutex */ 7232 btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); 7233 ret = 0; 7234 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 7235 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 7236 num_bytes, 7237 parent, root_objectid, (int)owner, 7238 BTRFS_DROP_DELAYED_REF, NULL); 7239 } else { 7240 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 7241 num_bytes, 7242 parent, root_objectid, owner, 7243 offset, 0, 7244 BTRFS_DROP_DELAYED_REF); 7245 } 7246 return ret; 7247 } 7248 7249 /* 7250 * when we wait for progress in the block group caching, its because 7251 * our allocation attempt failed at least once. 
So, we must sleep 7252 * and let some progress happen before we try again. 7253 * 7254 * This function will sleep at least once waiting for new free space to 7255 * show up, and then it will check the block group free space numbers 7256 * for our min num_bytes. Another option is to have it go ahead 7257 * and look in the rbtree for a free extent of a given size, but this 7258 * is a good start. 7259 * 7260 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using 7261 * any of the information in this block group. 7262 */ 7263 static noinline void 7264 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, 7265 u64 num_bytes) 7266 { 7267 struct btrfs_caching_control *caching_ctl; 7268 7269 caching_ctl = get_caching_control(cache); 7270 if (!caching_ctl) 7271 return; 7272 7273 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 7274 (cache->free_space_ctl->free_space >= num_bytes)); 7275 7276 put_caching_control(caching_ctl); 7277 } 7278 7279 static noinline int 7280 wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 7281 { 7282 struct btrfs_caching_control *caching_ctl; 7283 int ret = 0; 7284 7285 caching_ctl = get_caching_control(cache); 7286 if (!caching_ctl) 7287 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; 7288 7289 wait_event(caching_ctl->wait, block_group_cache_done(cache)); 7290 if (cache->cached == BTRFS_CACHE_ERROR) 7291 ret = -EIO; 7292 put_caching_control(caching_ctl); 7293 return ret; 7294 } 7295 7296 int __get_raid_index(u64 flags) 7297 { 7298 if (flags & BTRFS_BLOCK_GROUP_RAID10) 7299 return BTRFS_RAID_RAID10; 7300 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 7301 return BTRFS_RAID_RAID1; 7302 else if (flags & BTRFS_BLOCK_GROUP_DUP) 7303 return BTRFS_RAID_DUP; 7304 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 7305 return BTRFS_RAID_RAID0; 7306 else if (flags & BTRFS_BLOCK_GROUP_RAID5) 7307 return BTRFS_RAID_RAID5; 7308 else if (flags & BTRFS_BLOCK_GROUP_RAID6) 7309 return BTRFS_RAID_RAID6; 7310 7311 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ 7312 } 7313 7314 int get_block_group_index(struct btrfs_block_group_cache *cache) 7315 { 7316 return __get_raid_index(cache->flags); 7317 } 7318 7319 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { 7320 [BTRFS_RAID_RAID10] = "raid10", 7321 [BTRFS_RAID_RAID1] = "raid1", 7322 [BTRFS_RAID_DUP] = "dup", 7323 [BTRFS_RAID_RAID0] = "raid0", 7324 [BTRFS_RAID_SINGLE] = "single", 7325 [BTRFS_RAID_RAID5] = "raid5", 7326 [BTRFS_RAID_RAID6] = "raid6", 7327 }; 7328 7329 static const char *get_raid_name(enum btrfs_raid_types type) 7330 { 7331 if (type >= BTRFS_NR_RAID_TYPES) 7332 return NULL; 7333 7334 return btrfs_raid_type_names[type]; 7335 } 7336 7337 enum btrfs_loop_type { 7338 LOOP_CACHING_NOWAIT = 0, 7339 LOOP_CACHING_WAIT = 1, 7340 LOOP_ALLOC_CHUNK = 2, 7341 LOOP_NO_EMPTY_SIZE = 3, 7342 }; 7343 7344 static inline void 7345 btrfs_lock_block_group(struct btrfs_block_group_cache *cache, 7346 int delalloc) 7347 { 7348 if (delalloc) 7349 down_read(&cache->data_rwsem); 7350 } 7351 7352 static inline void 7353 btrfs_grab_block_group(struct btrfs_block_group_cache *cache, 7354 int delalloc) 7355 { 7356 btrfs_get_block_group(cache); 7357 if (delalloc) 7358 down_read(&cache->data_rwsem); 7359 } 7360 7361 static struct btrfs_block_group_cache * 7362 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, 7363 struct btrfs_free_cluster *cluster, 7364 int delalloc) 7365 { 7366 struct btrfs_block_group_cache *used_bg = NULL; 7367 7368 
spin_lock(&cluster->refill_lock); 7369 while (1) { 7370 used_bg = cluster->block_group; 7371 if (!used_bg) 7372 return NULL; 7373 7374 if (used_bg == block_group) 7375 return used_bg; 7376 7377 btrfs_get_block_group(used_bg); 7378 7379 if (!delalloc) 7380 return used_bg; 7381 7382 if (down_read_trylock(&used_bg->data_rwsem)) 7383 return used_bg; 7384 7385 spin_unlock(&cluster->refill_lock); 7386 7387 /* We should only have one-level nested. */ 7388 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 7389 7390 spin_lock(&cluster->refill_lock); 7391 if (used_bg == cluster->block_group) 7392 return used_bg; 7393 7394 up_read(&used_bg->data_rwsem); 7395 btrfs_put_block_group(used_bg); 7396 } 7397 } 7398 7399 static inline void 7400 btrfs_release_block_group(struct btrfs_block_group_cache *cache, 7401 int delalloc) 7402 { 7403 if (delalloc) 7404 up_read(&cache->data_rwsem); 7405 btrfs_put_block_group(cache); 7406 } 7407 7408 /* 7409 * walks the btree of allocated extents and find a hole of a given size. 7410 * The key ins is changed to record the hole: 7411 * ins->objectid == start position 7412 * ins->flags = BTRFS_EXTENT_ITEM_KEY 7413 * ins->offset == the size of the hole. 7414 * Any available blocks before search_start are skipped. 7415 * 7416 * If there is no suitable free space, we will record the max size of 7417 * the free space extent currently. 7418 */ 7419 static noinline int find_free_extent(struct btrfs_fs_info *fs_info, 7420 u64 ram_bytes, u64 num_bytes, u64 empty_size, 7421 u64 hint_byte, struct btrfs_key *ins, 7422 u64 flags, int delalloc) 7423 { 7424 int ret = 0; 7425 struct btrfs_root *root = fs_info->extent_root; 7426 struct btrfs_free_cluster *last_ptr = NULL; 7427 struct btrfs_block_group_cache *block_group = NULL; 7428 u64 search_start = 0; 7429 u64 max_extent_size = 0; 7430 u64 empty_cluster = 0; 7431 struct btrfs_space_info *space_info; 7432 int loop = 0; 7433 int index = __get_raid_index(flags); 7434 bool failed_cluster_refill = false; 7435 bool failed_alloc = false; 7436 bool use_cluster = true; 7437 bool have_caching_bg = false; 7438 bool orig_have_caching_bg = false; 7439 bool full_search = false; 7440 7441 WARN_ON(num_bytes < fs_info->sectorsize); 7442 ins->type = BTRFS_EXTENT_ITEM_KEY; 7443 ins->objectid = 0; 7444 ins->offset = 0; 7445 7446 trace_find_free_extent(fs_info, num_bytes, empty_size, flags); 7447 7448 space_info = __find_space_info(fs_info, flags); 7449 if (!space_info) { 7450 btrfs_err(fs_info, "No space info for %llu", flags); 7451 return -ENOSPC; 7452 } 7453 7454 /* 7455 * If our free space is heavily fragmented we may not be able to make 7456 * big contiguous allocations, so instead of doing the expensive search 7457 * for free space, simply return ENOSPC with our max_extent_size so we 7458 * can go ahead and search for a more manageable chunk. 7459 * 7460 * If our max_extent_size is large enough for our allocation simply 7461 * disable clustering since we will likely not be able to find enough 7462 * space to create a cluster and induce latency trying. 
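 * (when we return -ENOSPC here, ins->offset carries max_extent_size back to
 * the caller so a smaller allocation can be retried)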
7463 */ 7464 if (unlikely(space_info->max_extent_size)) { 7465 spin_lock(&space_info->lock); 7466 if (space_info->max_extent_size && 7467 num_bytes > space_info->max_extent_size) { 7468 ins->offset = space_info->max_extent_size; 7469 spin_unlock(&space_info->lock); 7470 return -ENOSPC; 7471 } else if (space_info->max_extent_size) { 7472 use_cluster = false; 7473 } 7474 spin_unlock(&space_info->lock); 7475 } 7476 7477 last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); 7478 if (last_ptr) { 7479 spin_lock(&last_ptr->lock); 7480 if (last_ptr->block_group) 7481 hint_byte = last_ptr->window_start; 7482 if (last_ptr->fragmented) { 7483 /* 7484 * We still set window_start so we can keep track of the 7485 * last place we found an allocation to try and save 7486 * some time. 7487 */ 7488 hint_byte = last_ptr->window_start; 7489 use_cluster = false; 7490 } 7491 spin_unlock(&last_ptr->lock); 7492 } 7493 7494 search_start = max(search_start, first_logical_byte(fs_info, 0)); 7495 search_start = max(search_start, hint_byte); 7496 if (search_start == hint_byte) { 7497 block_group = btrfs_lookup_block_group(fs_info, search_start); 7498 /* 7499 * we don't want to use the block group if it doesn't match our 7500 * allocation bits, or if its not cached. 7501 * 7502 * However if we are re-searching with an ideal block group 7503 * picked out then we don't care that the block group is cached. 7504 */ 7505 if (block_group && block_group_bits(block_group, flags) && 7506 block_group->cached != BTRFS_CACHE_NO) { 7507 down_read(&space_info->groups_sem); 7508 if (list_empty(&block_group->list) || 7509 block_group->ro) { 7510 /* 7511 * someone is removing this block group, 7512 * we can't jump into the have_block_group 7513 * target because our list pointers are not 7514 * valid 7515 */ 7516 btrfs_put_block_group(block_group); 7517 up_read(&space_info->groups_sem); 7518 } else { 7519 index = get_block_group_index(block_group); 7520 btrfs_lock_block_group(block_group, delalloc); 7521 goto have_block_group; 7522 } 7523 } else if (block_group) { 7524 btrfs_put_block_group(block_group); 7525 } 7526 } 7527 search: 7528 have_caching_bg = false; 7529 if (index == 0 || index == __get_raid_index(flags)) 7530 full_search = true; 7531 down_read(&space_info->groups_sem); 7532 list_for_each_entry(block_group, &space_info->block_groups[index], 7533 list) { 7534 u64 offset; 7535 int cached; 7536 7537 btrfs_grab_block_group(block_group, delalloc); 7538 search_start = block_group->key.objectid; 7539 7540 /* 7541 * this can happen if we end up cycling through all the 7542 * raid types, but we want to make sure we only allocate 7543 * for the proper type. 7544 */ 7545 if (!block_group_bits(block_group, flags)) { 7546 u64 extra = BTRFS_BLOCK_GROUP_DUP | 7547 BTRFS_BLOCK_GROUP_RAID1 | 7548 BTRFS_BLOCK_GROUP_RAID5 | 7549 BTRFS_BLOCK_GROUP_RAID6 | 7550 BTRFS_BLOCK_GROUP_RAID10; 7551 7552 /* 7553 * if they asked for extra copies and this block group 7554 * doesn't provide them, bail. This does allow us to 7555 * fill raid0 from raid1. 
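 * For example an allocation asking for RAID1 is not satisfied from a plain
 * raid0/single block group, while a raid0 or single allocation may still
 * land in a RAID1 block group here.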
7556 */ 7557 if ((flags & extra) && !(block_group->flags & extra)) 7558 goto loop; 7559 } 7560 7561 have_block_group: 7562 cached = block_group_cache_done(block_group); 7563 if (unlikely(!cached)) { 7564 have_caching_bg = true; 7565 ret = cache_block_group(block_group, 0); 7566 BUG_ON(ret < 0); 7567 ret = 0; 7568 } 7569 7570 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) 7571 goto loop; 7572 if (unlikely(block_group->ro)) 7573 goto loop; 7574 7575 /* 7576 * Ok we want to try and use the cluster allocator, so 7577 * lets look there 7578 */ 7579 if (last_ptr && use_cluster) { 7580 struct btrfs_block_group_cache *used_block_group; 7581 unsigned long aligned_cluster; 7582 /* 7583 * the refill lock keeps out other 7584 * people trying to start a new cluster 7585 */ 7586 used_block_group = btrfs_lock_cluster(block_group, 7587 last_ptr, 7588 delalloc); 7589 if (!used_block_group) 7590 goto refill_cluster; 7591 7592 if (used_block_group != block_group && 7593 (used_block_group->ro || 7594 !block_group_bits(used_block_group, flags))) 7595 goto release_cluster; 7596 7597 offset = btrfs_alloc_from_cluster(used_block_group, 7598 last_ptr, 7599 num_bytes, 7600 used_block_group->key.objectid, 7601 &max_extent_size); 7602 if (offset) { 7603 /* we have a block, we're done */ 7604 spin_unlock(&last_ptr->refill_lock); 7605 trace_btrfs_reserve_extent_cluster(fs_info, 7606 used_block_group, 7607 search_start, num_bytes); 7608 if (used_block_group != block_group) { 7609 btrfs_release_block_group(block_group, 7610 delalloc); 7611 block_group = used_block_group; 7612 } 7613 goto checks; 7614 } 7615 7616 WARN_ON(last_ptr->block_group != used_block_group); 7617 release_cluster: 7618 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 7619 * set up a new clusters, so lets just skip it 7620 * and let the allocator find whatever block 7621 * it can find. If we reach this point, we 7622 * will have tried the cluster allocator 7623 * plenty of times and not have found 7624 * anything, so we are likely way too 7625 * fragmented for the clustering stuff to find 7626 * anything. 7627 * 7628 * However, if the cluster is taken from the 7629 * current block group, release the cluster 7630 * first, so that we stand a better chance of 7631 * succeeding in the unclustered 7632 * allocation. 
*/ 7633 if (loop >= LOOP_NO_EMPTY_SIZE && 7634 used_block_group != block_group) { 7635 spin_unlock(&last_ptr->refill_lock); 7636 btrfs_release_block_group(used_block_group, 7637 delalloc); 7638 goto unclustered_alloc; 7639 } 7640 7641 /* 7642 * this cluster didn't work out, free it and 7643 * start over 7644 */ 7645 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7646 7647 if (used_block_group != block_group) 7648 btrfs_release_block_group(used_block_group, 7649 delalloc); 7650 refill_cluster: 7651 if (loop >= LOOP_NO_EMPTY_SIZE) { 7652 spin_unlock(&last_ptr->refill_lock); 7653 goto unclustered_alloc; 7654 } 7655 7656 aligned_cluster = max_t(unsigned long, 7657 empty_cluster + empty_size, 7658 block_group->full_stripe_len); 7659 7660 /* allocate a cluster in this block group */ 7661 ret = btrfs_find_space_cluster(fs_info, block_group, 7662 last_ptr, search_start, 7663 num_bytes, 7664 aligned_cluster); 7665 if (ret == 0) { 7666 /* 7667 * now pull our allocation out of this 7668 * cluster 7669 */ 7670 offset = btrfs_alloc_from_cluster(block_group, 7671 last_ptr, 7672 num_bytes, 7673 search_start, 7674 &max_extent_size); 7675 if (offset) { 7676 /* we found one, proceed */ 7677 spin_unlock(&last_ptr->refill_lock); 7678 trace_btrfs_reserve_extent_cluster(fs_info, 7679 block_group, search_start, 7680 num_bytes); 7681 goto checks; 7682 } 7683 } else if (!cached && loop > LOOP_CACHING_NOWAIT 7684 && !failed_cluster_refill) { 7685 spin_unlock(&last_ptr->refill_lock); 7686 7687 failed_cluster_refill = true; 7688 wait_block_group_cache_progress(block_group, 7689 num_bytes + empty_cluster + empty_size); 7690 goto have_block_group; 7691 } 7692 7693 /* 7694 * at this point we either didn't find a cluster 7695 * or we weren't able to allocate a block from our 7696 * cluster. Free the cluster we've been trying 7697 * to use, and go to the next block group 7698 */ 7699 btrfs_return_cluster_to_free_space(NULL, last_ptr); 7700 spin_unlock(&last_ptr->refill_lock); 7701 goto loop; 7702 } 7703 7704 unclustered_alloc: 7705 /* 7706 * We are doing an unclustered alloc, set the fragmented flag so 7707 * we don't bother trying to setup a cluster again until we get 7708 * more space. 7709 */ 7710 if (unlikely(last_ptr)) { 7711 spin_lock(&last_ptr->lock); 7712 last_ptr->fragmented = 1; 7713 spin_unlock(&last_ptr->lock); 7714 } 7715 if (cached) { 7716 struct btrfs_free_space_ctl *ctl = 7717 block_group->free_space_ctl; 7718 7719 spin_lock(&ctl->tree_lock); 7720 if (ctl->free_space < 7721 num_bytes + empty_cluster + empty_size) { 7722 if (ctl->free_space > max_extent_size) 7723 max_extent_size = ctl->free_space; 7724 spin_unlock(&ctl->tree_lock); 7725 goto loop; 7726 } 7727 spin_unlock(&ctl->tree_lock); 7728 } 7729 7730 offset = btrfs_find_space_for_alloc(block_group, search_start, 7731 num_bytes, empty_size, 7732 &max_extent_size); 7733 /* 7734 * If we didn't find a chunk, and we haven't failed on this 7735 * block group before, and this block group is in the middle of 7736 * caching and we are ok with waiting, then go ahead and wait 7737 * for progress to be made, and set failed_alloc to true. 7738 * 7739 * If failed_alloc is true then we've already waited on this 7740 * block group once and should move on to the next block group. 
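 *
 * The wait below returns once the caching thread has either found at
 * least num_bytes + empty_size of free space in this block group or
 * finished caching it entirely.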
7741 */ 7742 if (!offset && !failed_alloc && !cached && 7743 loop > LOOP_CACHING_NOWAIT) { 7744 wait_block_group_cache_progress(block_group, 7745 num_bytes + empty_size); 7746 failed_alloc = true; 7747 goto have_block_group; 7748 } else if (!offset) { 7749 goto loop; 7750 } 7751 checks: 7752 search_start = ALIGN(offset, fs_info->stripesize); 7753 7754 /* move on to the next group */ 7755 if (search_start + num_bytes > 7756 block_group->key.objectid + block_group->key.offset) { 7757 btrfs_add_free_space(block_group, offset, num_bytes); 7758 goto loop; 7759 } 7760 7761 if (offset < search_start) 7762 btrfs_add_free_space(block_group, offset, 7763 search_start - offset); 7764 BUG_ON(offset > search_start); 7765 7766 ret = btrfs_add_reserved_bytes(block_group, ram_bytes, 7767 num_bytes, delalloc); 7768 if (ret == -EAGAIN) { 7769 btrfs_add_free_space(block_group, offset, num_bytes); 7770 goto loop; 7771 } 7772 btrfs_inc_block_group_reservations(block_group); 7773 7774 /* we are all good, lets return */ 7775 ins->objectid = search_start; 7776 ins->offset = num_bytes; 7777 7778 trace_btrfs_reserve_extent(fs_info, block_group, 7779 search_start, num_bytes); 7780 btrfs_release_block_group(block_group, delalloc); 7781 break; 7782 loop: 7783 failed_cluster_refill = false; 7784 failed_alloc = false; 7785 BUG_ON(index != get_block_group_index(block_group)); 7786 btrfs_release_block_group(block_group, delalloc); 7787 } 7788 up_read(&space_info->groups_sem); 7789 7790 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg 7791 && !orig_have_caching_bg) 7792 orig_have_caching_bg = true; 7793 7794 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 7795 goto search; 7796 7797 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 7798 goto search; 7799 7800 /* 7801 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 7802 * caching kthreads as we move along 7803 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 7804 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 7805 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 7806 * again 7807 */ 7808 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 7809 index = 0; 7810 if (loop == LOOP_CACHING_NOWAIT) { 7811 /* 7812 * We want to skip the LOOP_CACHING_WAIT step if we 7813 * don't have any uncached bgs and we've already done a 7814 * full search through. 7815 */ 7816 if (orig_have_caching_bg || !full_search) 7817 loop = LOOP_CACHING_WAIT; 7818 else 7819 loop = LOOP_ALLOC_CHUNK; 7820 } else { 7821 loop++; 7822 } 7823 7824 if (loop == LOOP_ALLOC_CHUNK) { 7825 struct btrfs_trans_handle *trans; 7826 int exist = 0; 7827 7828 trans = current->journal_info; 7829 if (trans) 7830 exist = 1; 7831 else 7832 trans = btrfs_join_transaction(root); 7833 7834 if (IS_ERR(trans)) { 7835 ret = PTR_ERR(trans); 7836 goto out; 7837 } 7838 7839 ret = do_chunk_alloc(trans, fs_info, flags, 7840 CHUNK_ALLOC_FORCE); 7841 7842 /* 7843 * If we can't allocate a new chunk we've already looped 7844 * through at least once, move on to the NO_EMPTY_SIZE 7845 * case. 7846 */ 7847 if (ret == -ENOSPC) 7848 loop = LOOP_NO_EMPTY_SIZE; 7849 7850 /* 7851 * Do not bail out on ENOSPC since we 7852 * can do more things. 
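 *
 * The LOOP_NO_EMPTY_SIZE pass below retries the search with
 * empty_size and empty_cluster forced to zero, which can still
 * succeed even though a new chunk could not be allocated.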
7853 */ 7854 if (ret < 0 && ret != -ENOSPC) 7855 btrfs_abort_transaction(trans, ret); 7856 else 7857 ret = 0; 7858 if (!exist) 7859 btrfs_end_transaction(trans); 7860 if (ret) 7861 goto out; 7862 } 7863 7864 if (loop == LOOP_NO_EMPTY_SIZE) { 7865 /* 7866 * Don't loop again if we already have no empty_size and 7867 * no empty_cluster. 7868 */ 7869 if (empty_size == 0 && 7870 empty_cluster == 0) { 7871 ret = -ENOSPC; 7872 goto out; 7873 } 7874 empty_size = 0; 7875 empty_cluster = 0; 7876 } 7877 7878 goto search; 7879 } else if (!ins->objectid) { 7880 ret = -ENOSPC; 7881 } else if (ins->objectid) { 7882 if (!use_cluster && last_ptr) { 7883 spin_lock(&last_ptr->lock); 7884 last_ptr->window_start = ins->objectid; 7885 spin_unlock(&last_ptr->lock); 7886 } 7887 ret = 0; 7888 } 7889 out: 7890 if (ret == -ENOSPC) { 7891 spin_lock(&space_info->lock); 7892 space_info->max_extent_size = max_extent_size; 7893 spin_unlock(&space_info->lock); 7894 ins->offset = max_extent_size; 7895 } 7896 return ret; 7897 } 7898 7899 static void dump_space_info(struct btrfs_fs_info *fs_info, 7900 struct btrfs_space_info *info, u64 bytes, 7901 int dump_block_groups) 7902 { 7903 struct btrfs_block_group_cache *cache; 7904 int index = 0; 7905 7906 spin_lock(&info->lock); 7907 btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", 7908 info->flags, 7909 info->total_bytes - btrfs_space_info_used(info, true), 7910 info->full ? "" : "not "); 7911 btrfs_info(fs_info, 7912 "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", 7913 info->total_bytes, info->bytes_used, info->bytes_pinned, 7914 info->bytes_reserved, info->bytes_may_use, 7915 info->bytes_readonly); 7916 spin_unlock(&info->lock); 7917 7918 if (!dump_block_groups) 7919 return; 7920 7921 down_read(&info->groups_sem); 7922 again: 7923 list_for_each_entry(cache, &info->block_groups[index], list) { 7924 spin_lock(&cache->lock); 7925 btrfs_info(fs_info, 7926 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", 7927 cache->key.objectid, cache->key.offset, 7928 btrfs_block_group_used(&cache->item), cache->pinned, 7929 cache->reserved, cache->ro ? 
"[readonly]" : ""); 7930 btrfs_dump_free_space(cache, bytes); 7931 spin_unlock(&cache->lock); 7932 } 7933 if (++index < BTRFS_NR_RAID_TYPES) 7934 goto again; 7935 up_read(&info->groups_sem); 7936 } 7937 7938 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 7939 u64 num_bytes, u64 min_alloc_size, 7940 u64 empty_size, u64 hint_byte, 7941 struct btrfs_key *ins, int is_data, int delalloc) 7942 { 7943 struct btrfs_fs_info *fs_info = root->fs_info; 7944 bool final_tried = num_bytes == min_alloc_size; 7945 u64 flags; 7946 int ret; 7947 7948 flags = btrfs_get_alloc_profile(root, is_data); 7949 again: 7950 WARN_ON(num_bytes < fs_info->sectorsize); 7951 ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, 7952 hint_byte, ins, flags, delalloc); 7953 if (!ret && !is_data) { 7954 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 7955 } else if (ret == -ENOSPC) { 7956 if (!final_tried && ins->offset) { 7957 num_bytes = min(num_bytes >> 1, ins->offset); 7958 num_bytes = round_down(num_bytes, 7959 fs_info->sectorsize); 7960 num_bytes = max(num_bytes, min_alloc_size); 7961 ram_bytes = num_bytes; 7962 if (num_bytes == min_alloc_size) 7963 final_tried = true; 7964 goto again; 7965 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 7966 struct btrfs_space_info *sinfo; 7967 7968 sinfo = __find_space_info(fs_info, flags); 7969 btrfs_err(fs_info, 7970 "allocation failed flags %llu, wanted %llu", 7971 flags, num_bytes); 7972 if (sinfo) 7973 dump_space_info(fs_info, sinfo, num_bytes, 1); 7974 } 7975 } 7976 7977 return ret; 7978 } 7979 7980 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 7981 u64 start, u64 len, 7982 int pin, int delalloc) 7983 { 7984 struct btrfs_block_group_cache *cache; 7985 int ret = 0; 7986 7987 cache = btrfs_lookup_block_group(fs_info, start); 7988 if (!cache) { 7989 btrfs_err(fs_info, "Unable to find block group for %llu", 7990 start); 7991 return -ENOSPC; 7992 } 7993 7994 if (pin) 7995 pin_down_extent(fs_info, cache, start, len, 1); 7996 else { 7997 if (btrfs_test_opt(fs_info, DISCARD)) 7998 ret = btrfs_discard_extent(fs_info, start, len, NULL); 7999 btrfs_add_free_space(cache, start, len); 8000 btrfs_free_reserved_bytes(cache, len, delalloc); 8001 trace_btrfs_reserved_extent_free(fs_info, start, len); 8002 } 8003 8004 btrfs_put_block_group(cache); 8005 return ret; 8006 } 8007 8008 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 8009 u64 start, u64 len, int delalloc) 8010 { 8011 return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); 8012 } 8013 8014 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, 8015 u64 start, u64 len) 8016 { 8017 return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); 8018 } 8019 8020 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8021 struct btrfs_fs_info *fs_info, 8022 u64 parent, u64 root_objectid, 8023 u64 flags, u64 owner, u64 offset, 8024 struct btrfs_key *ins, int ref_mod) 8025 { 8026 int ret; 8027 struct btrfs_extent_item *extent_item; 8028 struct btrfs_extent_inline_ref *iref; 8029 struct btrfs_path *path; 8030 struct extent_buffer *leaf; 8031 int type; 8032 u32 size; 8033 8034 if (parent > 0) 8035 type = BTRFS_SHARED_DATA_REF_KEY; 8036 else 8037 type = BTRFS_EXTENT_DATA_REF_KEY; 8038 8039 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 8040 8041 path = btrfs_alloc_path(); 8042 if (!path) 8043 return -ENOMEM; 8044 8045 path->leave_spinning = 1; 8046 ret = btrfs_insert_empty_item(trans, 
fs_info->extent_root, path, 8047 ins, size); 8048 if (ret) { 8049 btrfs_free_path(path); 8050 return ret; 8051 } 8052 8053 leaf = path->nodes[0]; 8054 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8055 struct btrfs_extent_item); 8056 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 8057 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8058 btrfs_set_extent_flags(leaf, extent_item, 8059 flags | BTRFS_EXTENT_FLAG_DATA); 8060 8061 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8062 btrfs_set_extent_inline_ref_type(leaf, iref, type); 8063 if (parent > 0) { 8064 struct btrfs_shared_data_ref *ref; 8065 ref = (struct btrfs_shared_data_ref *)(iref + 1); 8066 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8067 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 8068 } else { 8069 struct btrfs_extent_data_ref *ref; 8070 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 8071 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 8072 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 8073 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 8074 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 8075 } 8076 8077 btrfs_mark_buffer_dirty(path->nodes[0]); 8078 btrfs_free_path(path); 8079 8080 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8081 ins->offset); 8082 if (ret) 8083 return ret; 8084 8085 ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); 8086 if (ret) { /* -ENOENT, logic error */ 8087 btrfs_err(fs_info, "update block group failed for %llu %llu", 8088 ins->objectid, ins->offset); 8089 BUG(); 8090 } 8091 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); 8092 return ret; 8093 } 8094 8095 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 8096 struct btrfs_fs_info *fs_info, 8097 u64 parent, u64 root_objectid, 8098 u64 flags, struct btrfs_disk_key *key, 8099 int level, struct btrfs_key *ins) 8100 { 8101 int ret; 8102 struct btrfs_extent_item *extent_item; 8103 struct btrfs_tree_block_info *block_info; 8104 struct btrfs_extent_inline_ref *iref; 8105 struct btrfs_path *path; 8106 struct extent_buffer *leaf; 8107 u32 size = sizeof(*extent_item) + sizeof(*iref); 8108 u64 num_bytes = ins->offset; 8109 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8110 8111 if (!skinny_metadata) 8112 size += sizeof(*block_info); 8113 8114 path = btrfs_alloc_path(); 8115 if (!path) { 8116 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, 8117 fs_info->nodesize); 8118 return -ENOMEM; 8119 } 8120 8121 path->leave_spinning = 1; 8122 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 8123 ins, size); 8124 if (ret) { 8125 btrfs_free_path(path); 8126 btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, 8127 fs_info->nodesize); 8128 return ret; 8129 } 8130 8131 leaf = path->nodes[0]; 8132 extent_item = btrfs_item_ptr(leaf, path->slots[0], 8133 struct btrfs_extent_item); 8134 btrfs_set_extent_refs(leaf, extent_item, 1); 8135 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 8136 btrfs_set_extent_flags(leaf, extent_item, 8137 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 8138 8139 if (skinny_metadata) { 8140 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 8141 num_bytes = fs_info->nodesize; 8142 } else { 8143 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 8144 btrfs_set_tree_block_key(leaf, block_info, key); 8145 btrfs_set_tree_block_level(leaf, block_info, level); 8146 iref = (struct btrfs_extent_inline_ref 
*)(block_info + 1); 8147 } 8148 8149 if (parent > 0) { 8150 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); 8151 btrfs_set_extent_inline_ref_type(leaf, iref, 8152 BTRFS_SHARED_BLOCK_REF_KEY); 8153 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 8154 } else { 8155 btrfs_set_extent_inline_ref_type(leaf, iref, 8156 BTRFS_TREE_BLOCK_REF_KEY); 8157 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 8158 } 8159 8160 btrfs_mark_buffer_dirty(leaf); 8161 btrfs_free_path(path); 8162 8163 ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, 8164 num_bytes); 8165 if (ret) 8166 return ret; 8167 8168 ret = update_block_group(trans, fs_info, ins->objectid, 8169 fs_info->nodesize, 1); 8170 if (ret) { /* -ENOENT, logic error */ 8171 btrfs_err(fs_info, "update block group failed for %llu %llu", 8172 ins->objectid, ins->offset); 8173 BUG(); 8174 } 8175 8176 trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, 8177 fs_info->nodesize); 8178 return ret; 8179 } 8180 8181 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 8182 u64 root_objectid, u64 owner, 8183 u64 offset, u64 ram_bytes, 8184 struct btrfs_key *ins) 8185 { 8186 struct btrfs_fs_info *fs_info = trans->fs_info; 8187 int ret; 8188 8189 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 8190 8191 ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, 8192 ins->offset, 0, 8193 root_objectid, owner, offset, 8194 ram_bytes, BTRFS_ADD_DELAYED_EXTENT); 8195 return ret; 8196 } 8197 8198 /* 8199 * this is used by the tree logging recovery code. It records that 8200 * an extent has been allocated and makes sure to clear the free 8201 * space cache bits as well 8202 */ 8203 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 8204 struct btrfs_fs_info *fs_info, 8205 u64 root_objectid, u64 owner, u64 offset, 8206 struct btrfs_key *ins) 8207 { 8208 int ret; 8209 struct btrfs_block_group_cache *block_group; 8210 struct btrfs_space_info *space_info; 8211 8212 /* 8213 * Mixed block groups will exclude before processing the log so we only 8214 * need to do the exclude dance if this fs isn't mixed. 
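 *
 * The exclude step marks the logged extent's range as excluded so the
 * free space caching code will not hand it out again while log replay
 * is still relying on it.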
8215 */ 8216 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 8217 ret = __exclude_logged_extent(fs_info, ins->objectid, 8218 ins->offset); 8219 if (ret) 8220 return ret; 8221 } 8222 8223 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 8224 if (!block_group) 8225 return -EINVAL; 8226 8227 space_info = block_group->space_info; 8228 spin_lock(&space_info->lock); 8229 spin_lock(&block_group->lock); 8230 space_info->bytes_reserved += ins->offset; 8231 block_group->reserved += ins->offset; 8232 spin_unlock(&block_group->lock); 8233 spin_unlock(&space_info->lock); 8234 8235 ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid, 8236 0, owner, offset, ins, 1); 8237 btrfs_put_block_group(block_group); 8238 return ret; 8239 } 8240 8241 static struct extent_buffer * 8242 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 8243 u64 bytenr, int level) 8244 { 8245 struct btrfs_fs_info *fs_info = root->fs_info; 8246 struct extent_buffer *buf; 8247 8248 buf = btrfs_find_create_tree_block(fs_info, bytenr); 8249 if (IS_ERR(buf)) 8250 return buf; 8251 8252 btrfs_set_header_generation(buf, trans->transid); 8253 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 8254 btrfs_tree_lock(buf); 8255 clean_tree_block(fs_info, buf); 8256 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 8257 8258 btrfs_set_lock_blocking(buf); 8259 set_extent_buffer_uptodate(buf); 8260 8261 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 8262 buf->log_index = root->log_transid % 2; 8263 /* 8264 * we allow two log transactions at a time, use different 8265 * EXENT bit to differentiate dirty pages. 8266 */ 8267 if (buf->log_index == 0) 8268 set_extent_dirty(&root->dirty_log_pages, buf->start, 8269 buf->start + buf->len - 1, GFP_NOFS); 8270 else 8271 set_extent_new(&root->dirty_log_pages, buf->start, 8272 buf->start + buf->len - 1); 8273 } else { 8274 buf->log_index = -1; 8275 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 8276 buf->start + buf->len - 1, GFP_NOFS); 8277 } 8278 trans->dirty = true; 8279 /* this returns a buffer locked for blocking */ 8280 return buf; 8281 } 8282 8283 static struct btrfs_block_rsv * 8284 use_block_rsv(struct btrfs_trans_handle *trans, 8285 struct btrfs_root *root, u32 blocksize) 8286 { 8287 struct btrfs_fs_info *fs_info = root->fs_info; 8288 struct btrfs_block_rsv *block_rsv; 8289 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 8290 int ret; 8291 bool global_updated = false; 8292 8293 block_rsv = get_block_rsv(trans, root); 8294 8295 if (unlikely(block_rsv->size == 0)) 8296 goto try_reserve; 8297 again: 8298 ret = block_rsv_use_bytes(block_rsv, blocksize); 8299 if (!ret) 8300 return block_rsv; 8301 8302 if (block_rsv->failfast) 8303 return ERR_PTR(ret); 8304 8305 if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { 8306 global_updated = true; 8307 update_global_block_rsv(fs_info); 8308 goto again; 8309 } 8310 8311 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 8312 static DEFINE_RATELIMIT_STATE(_rs, 8313 DEFAULT_RATELIMIT_INTERVAL * 10, 8314 /*DEFAULT_RATELIMIT_BURST*/ 1); 8315 if (__ratelimit(&_rs)) 8316 WARN(1, KERN_DEBUG 8317 "BTRFS: block rsv returned %d\n", ret); 8318 } 8319 try_reserve: 8320 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 8321 BTRFS_RESERVE_NO_FLUSH); 8322 if (!ret) 8323 return block_rsv; 8324 /* 8325 * If we couldn't reserve metadata bytes try and use some from 8326 * the global reserve if its space type is the same as the global 8327 * reservation. 
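 *
 * This is a last resort: the global reserve is kept for critical
 * updates, so we only borrow from it after every normal reservation
 * path above has failed.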
8328 */ 8329 if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && 8330 block_rsv->space_info == global_rsv->space_info) { 8331 ret = block_rsv_use_bytes(global_rsv, blocksize); 8332 if (!ret) 8333 return global_rsv; 8334 } 8335 return ERR_PTR(ret); 8336 } 8337 8338 static void unuse_block_rsv(struct btrfs_fs_info *fs_info, 8339 struct btrfs_block_rsv *block_rsv, u32 blocksize) 8340 { 8341 block_rsv_add_bytes(block_rsv, blocksize, 0); 8342 block_rsv_release_bytes(fs_info, block_rsv, NULL, 0); 8343 } 8344 8345 /* 8346 * finds a free extent and does all the dirty work required for allocation 8347 * returns the tree buffer or an ERR_PTR on error. 8348 */ 8349 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 8350 struct btrfs_root *root, 8351 u64 parent, u64 root_objectid, 8352 const struct btrfs_disk_key *key, 8353 int level, u64 hint, 8354 u64 empty_size) 8355 { 8356 struct btrfs_fs_info *fs_info = root->fs_info; 8357 struct btrfs_key ins; 8358 struct btrfs_block_rsv *block_rsv; 8359 struct extent_buffer *buf; 8360 struct btrfs_delayed_extent_op *extent_op; 8361 u64 flags = 0; 8362 int ret; 8363 u32 blocksize = fs_info->nodesize; 8364 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 8365 8366 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 8367 if (btrfs_is_testing(fs_info)) { 8368 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 8369 level); 8370 if (!IS_ERR(buf)) 8371 root->alloc_bytenr += blocksize; 8372 return buf; 8373 } 8374 #endif 8375 8376 block_rsv = use_block_rsv(trans, root, blocksize); 8377 if (IS_ERR(block_rsv)) 8378 return ERR_CAST(block_rsv); 8379 8380 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 8381 empty_size, hint, &ins, 0, 0); 8382 if (ret) 8383 goto out_unuse; 8384 8385 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); 8386 if (IS_ERR(buf)) { 8387 ret = PTR_ERR(buf); 8388 goto out_free_reserved; 8389 } 8390 8391 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 8392 if (parent == 0) 8393 parent = ins.objectid; 8394 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 8395 } else 8396 BUG_ON(parent > 0); 8397 8398 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 8399 extent_op = btrfs_alloc_delayed_extent_op(); 8400 if (!extent_op) { 8401 ret = -ENOMEM; 8402 goto out_free_buf; 8403 } 8404 if (key) 8405 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 8406 else 8407 memset(&extent_op->key, 0, sizeof(extent_op->key)); 8408 extent_op->flags_to_set = flags; 8409 extent_op->update_key = skinny_metadata ? 
false : true; 8410 extent_op->update_flags = true; 8411 extent_op->is_data = false; 8412 extent_op->level = level; 8413 8414 ret = btrfs_add_delayed_tree_ref(fs_info, trans, 8415 ins.objectid, ins.offset, 8416 parent, root_objectid, level, 8417 BTRFS_ADD_DELAYED_EXTENT, 8418 extent_op); 8419 if (ret) 8420 goto out_free_delayed; 8421 } 8422 return buf; 8423 8424 out_free_delayed: 8425 btrfs_free_delayed_extent_op(extent_op); 8426 out_free_buf: 8427 free_extent_buffer(buf); 8428 out_free_reserved: 8429 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 8430 out_unuse: 8431 unuse_block_rsv(fs_info, block_rsv, blocksize); 8432 return ERR_PTR(ret); 8433 } 8434 8435 struct walk_control { 8436 u64 refs[BTRFS_MAX_LEVEL]; 8437 u64 flags[BTRFS_MAX_LEVEL]; 8438 struct btrfs_key update_progress; 8439 int stage; 8440 int level; 8441 int shared_level; 8442 int update_ref; 8443 int keep_locks; 8444 int reada_slot; 8445 int reada_count; 8446 int for_reloc; 8447 }; 8448 8449 #define DROP_REFERENCE 1 8450 #define UPDATE_BACKREF 2 8451 8452 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 8453 struct btrfs_root *root, 8454 struct walk_control *wc, 8455 struct btrfs_path *path) 8456 { 8457 struct btrfs_fs_info *fs_info = root->fs_info; 8458 u64 bytenr; 8459 u64 generation; 8460 u64 refs; 8461 u64 flags; 8462 u32 nritems; 8463 struct btrfs_key key; 8464 struct extent_buffer *eb; 8465 int ret; 8466 int slot; 8467 int nread = 0; 8468 8469 if (path->slots[wc->level] < wc->reada_slot) { 8470 wc->reada_count = wc->reada_count * 2 / 3; 8471 wc->reada_count = max(wc->reada_count, 2); 8472 } else { 8473 wc->reada_count = wc->reada_count * 3 / 2; 8474 wc->reada_count = min_t(int, wc->reada_count, 8475 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 8476 } 8477 8478 eb = path->nodes[wc->level]; 8479 nritems = btrfs_header_nritems(eb); 8480 8481 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 8482 if (nread >= wc->reada_count) 8483 break; 8484 8485 cond_resched(); 8486 bytenr = btrfs_node_blockptr(eb, slot); 8487 generation = btrfs_node_ptr_generation(eb, slot); 8488 8489 if (slot == path->slots[wc->level]) 8490 goto reada; 8491 8492 if (wc->stage == UPDATE_BACKREF && 8493 generation <= root->root_key.offset) 8494 continue; 8495 8496 /* We don't lock the tree block, it's OK to be racy here */ 8497 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 8498 wc->level - 1, 1, &refs, 8499 &flags); 8500 /* We don't care about errors in readahead. */ 8501 if (ret < 0) 8502 continue; 8503 BUG_ON(refs == 0); 8504 8505 if (wc->stage == DROP_REFERENCE) { 8506 if (refs == 1) 8507 goto reada; 8508 8509 if (wc->level == 1 && 8510 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8511 continue; 8512 if (!wc->update_ref || 8513 generation <= root->root_key.offset) 8514 continue; 8515 btrfs_node_key_to_cpu(eb, &key, slot); 8516 ret = btrfs_comp_cpu_keys(&key, 8517 &wc->update_progress); 8518 if (ret < 0) 8519 continue; 8520 } else { 8521 if (wc->level == 1 && 8522 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8523 continue; 8524 } 8525 reada: 8526 readahead_tree_block(fs_info, bytenr); 8527 nread++; 8528 } 8529 wc->reada_slot = slot; 8530 } 8531 8532 /* 8533 * helper to process tree block while walking down the tree. 8534 * 8535 * when wc->stage == UPDATE_BACKREF, this function updates 8536 * back refs for pointers in the block. 8537 * 8538 * NOTE: return value 1 means we should stop walking down. 
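 *
 * when wc->stage == DROP_REFERENCE, it checks whether the block is
 * shared (reference count > 1); if so the caller stops descending and
 * the whole subtree keeps its references.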
8539 */ 8540 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 8541 struct btrfs_root *root, 8542 struct btrfs_path *path, 8543 struct walk_control *wc, int lookup_info) 8544 { 8545 struct btrfs_fs_info *fs_info = root->fs_info; 8546 int level = wc->level; 8547 struct extent_buffer *eb = path->nodes[level]; 8548 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 8549 int ret; 8550 8551 if (wc->stage == UPDATE_BACKREF && 8552 btrfs_header_owner(eb) != root->root_key.objectid) 8553 return 1; 8554 8555 /* 8556 * when reference count of tree block is 1, it won't increase 8557 * again. once full backref flag is set, we never clear it. 8558 */ 8559 if (lookup_info && 8560 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 8561 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 8562 BUG_ON(!path->locks[level]); 8563 ret = btrfs_lookup_extent_info(trans, fs_info, 8564 eb->start, level, 1, 8565 &wc->refs[level], 8566 &wc->flags[level]); 8567 BUG_ON(ret == -ENOMEM); 8568 if (ret) 8569 return ret; 8570 BUG_ON(wc->refs[level] == 0); 8571 } 8572 8573 if (wc->stage == DROP_REFERENCE) { 8574 if (wc->refs[level] > 1) 8575 return 1; 8576 8577 if (path->locks[level] && !wc->keep_locks) { 8578 btrfs_tree_unlock_rw(eb, path->locks[level]); 8579 path->locks[level] = 0; 8580 } 8581 return 0; 8582 } 8583 8584 /* wc->stage == UPDATE_BACKREF */ 8585 if (!(wc->flags[level] & flag)) { 8586 BUG_ON(!path->locks[level]); 8587 ret = btrfs_inc_ref(trans, root, eb, 1); 8588 BUG_ON(ret); /* -ENOMEM */ 8589 ret = btrfs_dec_ref(trans, root, eb, 0); 8590 BUG_ON(ret); /* -ENOMEM */ 8591 ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, 8592 eb->len, flag, 8593 btrfs_header_level(eb), 0); 8594 BUG_ON(ret); /* -ENOMEM */ 8595 wc->flags[level] |= flag; 8596 } 8597 8598 /* 8599 * the block is shared by multiple trees, so it's not good to 8600 * keep the tree lock 8601 */ 8602 if (path->locks[level] && level > 0) { 8603 btrfs_tree_unlock_rw(eb, path->locks[level]); 8604 path->locks[level] = 0; 8605 } 8606 return 0; 8607 } 8608 8609 /* 8610 * helper to process tree block pointer. 8611 * 8612 * when wc->stage == DROP_REFERENCE, this function checks 8613 * reference count of the block pointed to. if the block 8614 * is shared and we need update back refs for the subtree 8615 * rooted at the block, this function changes wc->stage to 8616 * UPDATE_BACKREF. if the block is shared and there is no 8617 * need to update back, this function drops the reference 8618 * to the block. 8619 * 8620 * NOTE: return value 1 means we should stop walking down. 
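 *
 * a return value of 0 means the child block was read and locked and
 * wc->level now points at it, so the caller continues walking down.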
8621 */ 8622 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 8623 struct btrfs_root *root, 8624 struct btrfs_path *path, 8625 struct walk_control *wc, int *lookup_info) 8626 { 8627 struct btrfs_fs_info *fs_info = root->fs_info; 8628 u64 bytenr; 8629 u64 generation; 8630 u64 parent; 8631 u32 blocksize; 8632 struct btrfs_key key; 8633 struct extent_buffer *next; 8634 int level = wc->level; 8635 int reada = 0; 8636 int ret = 0; 8637 bool need_account = false; 8638 8639 generation = btrfs_node_ptr_generation(path->nodes[level], 8640 path->slots[level]); 8641 /* 8642 * if the lower level block was created before the snapshot 8643 * was created, we know there is no need to update back refs 8644 * for the subtree 8645 */ 8646 if (wc->stage == UPDATE_BACKREF && 8647 generation <= root->root_key.offset) { 8648 *lookup_info = 1; 8649 return 1; 8650 } 8651 8652 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8653 blocksize = fs_info->nodesize; 8654 8655 next = find_extent_buffer(fs_info, bytenr); 8656 if (!next) { 8657 next = btrfs_find_create_tree_block(fs_info, bytenr); 8658 if (IS_ERR(next)) 8659 return PTR_ERR(next); 8660 8661 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 8662 level - 1); 8663 reada = 1; 8664 } 8665 btrfs_tree_lock(next); 8666 btrfs_set_lock_blocking(next); 8667 8668 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 8669 &wc->refs[level - 1], 8670 &wc->flags[level - 1]); 8671 if (ret < 0) 8672 goto out_unlock; 8673 8674 if (unlikely(wc->refs[level - 1] == 0)) { 8675 btrfs_err(fs_info, "Missing references."); 8676 ret = -EIO; 8677 goto out_unlock; 8678 } 8679 *lookup_info = 0; 8680 8681 if (wc->stage == DROP_REFERENCE) { 8682 if (wc->refs[level - 1] > 1) { 8683 need_account = true; 8684 if (level == 1 && 8685 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8686 goto skip; 8687 8688 if (!wc->update_ref || 8689 generation <= root->root_key.offset) 8690 goto skip; 8691 8692 btrfs_node_key_to_cpu(path->nodes[level], &key, 8693 path->slots[level]); 8694 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 8695 if (ret < 0) 8696 goto skip; 8697 8698 wc->stage = UPDATE_BACKREF; 8699 wc->shared_level = level - 1; 8700 } 8701 } else { 8702 if (level == 1 && 8703 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 8704 goto skip; 8705 } 8706 8707 if (!btrfs_buffer_uptodate(next, generation, 0)) { 8708 btrfs_tree_unlock(next); 8709 free_extent_buffer(next); 8710 next = NULL; 8711 *lookup_info = 1; 8712 } 8713 8714 if (!next) { 8715 if (reada && level == 1) 8716 reada_walk_down(trans, root, wc, path); 8717 next = read_tree_block(fs_info, bytenr, generation); 8718 if (IS_ERR(next)) { 8719 return PTR_ERR(next); 8720 } else if (!extent_buffer_uptodate(next)) { 8721 free_extent_buffer(next); 8722 return -EIO; 8723 } 8724 btrfs_tree_lock(next); 8725 btrfs_set_lock_blocking(next); 8726 } 8727 8728 level--; 8729 ASSERT(level == btrfs_header_level(next)); 8730 if (level != btrfs_header_level(next)) { 8731 btrfs_err(root->fs_info, "mismatched level"); 8732 ret = -EIO; 8733 goto out_unlock; 8734 } 8735 path->nodes[level] = next; 8736 path->slots[level] = 0; 8737 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8738 wc->level = level; 8739 if (wc->level == 1) 8740 wc->reada_slot = 0; 8741 return 0; 8742 skip: 8743 wc->refs[level - 1] = 0; 8744 wc->flags[level - 1] = 0; 8745 if (wc->stage == DROP_REFERENCE) { 8746 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 8747 parent = path->nodes[level]->start; 8748 } else { 8749 
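			/*
			 * Without the full backref flag the block must be
			 * referenced directly by this root, so sanity check
			 * the owner before dropping the ref with parent == 0.
			 */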
ASSERT(root->root_key.objectid == 8750 btrfs_header_owner(path->nodes[level])); 8751 if (root->root_key.objectid != 8752 btrfs_header_owner(path->nodes[level])) { 8753 btrfs_err(root->fs_info, 8754 "mismatched block owner"); 8755 ret = -EIO; 8756 goto out_unlock; 8757 } 8758 parent = 0; 8759 } 8760 8761 if (need_account) { 8762 ret = btrfs_qgroup_trace_subtree(trans, root, next, 8763 generation, level - 1); 8764 if (ret) { 8765 btrfs_err_rl(fs_info, 8766 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 8767 ret); 8768 } 8769 } 8770 ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, 8771 parent, root->root_key.objectid, 8772 level - 1, 0); 8773 if (ret) 8774 goto out_unlock; 8775 } 8776 8777 *lookup_info = 1; 8778 ret = 1; 8779 8780 out_unlock: 8781 btrfs_tree_unlock(next); 8782 free_extent_buffer(next); 8783 8784 return ret; 8785 } 8786 8787 /* 8788 * helper to process tree block while walking up the tree. 8789 * 8790 * when wc->stage == DROP_REFERENCE, this function drops 8791 * reference count on the block. 8792 * 8793 * when wc->stage == UPDATE_BACKREF, this function changes 8794 * wc->stage back to DROP_REFERENCE if we changed wc->stage 8795 * to UPDATE_BACKREF previously while processing the block. 8796 * 8797 * NOTE: return value 1 means we should stop walking up. 8798 */ 8799 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 8800 struct btrfs_root *root, 8801 struct btrfs_path *path, 8802 struct walk_control *wc) 8803 { 8804 struct btrfs_fs_info *fs_info = root->fs_info; 8805 int ret; 8806 int level = wc->level; 8807 struct extent_buffer *eb = path->nodes[level]; 8808 u64 parent = 0; 8809 8810 if (wc->stage == UPDATE_BACKREF) { 8811 BUG_ON(wc->shared_level < level); 8812 if (level < wc->shared_level) 8813 goto out; 8814 8815 ret = find_next_key(path, level + 1, &wc->update_progress); 8816 if (ret > 0) 8817 wc->update_ref = 0; 8818 8819 wc->stage = DROP_REFERENCE; 8820 wc->shared_level = -1; 8821 path->slots[level] = 0; 8822 8823 /* 8824 * check reference count again if the block isn't locked. 8825 * we should start walking down the tree again if reference 8826 * count is one. 8827 */ 8828 if (!path->locks[level]) { 8829 BUG_ON(level == 0); 8830 btrfs_tree_lock(eb); 8831 btrfs_set_lock_blocking(eb); 8832 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8833 8834 ret = btrfs_lookup_extent_info(trans, fs_info, 8835 eb->start, level, 1, 8836 &wc->refs[level], 8837 &wc->flags[level]); 8838 if (ret < 0) { 8839 btrfs_tree_unlock_rw(eb, path->locks[level]); 8840 path->locks[level] = 0; 8841 return ret; 8842 } 8843 BUG_ON(wc->refs[level] == 0); 8844 if (wc->refs[level] == 1) { 8845 btrfs_tree_unlock_rw(eb, path->locks[level]); 8846 path->locks[level] = 0; 8847 return 1; 8848 } 8849 } 8850 } 8851 8852 /* wc->stage == DROP_REFERENCE */ 8853 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 8854 8855 if (wc->refs[level] == 1) { 8856 if (level == 0) { 8857 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8858 ret = btrfs_dec_ref(trans, root, eb, 1); 8859 else 8860 ret = btrfs_dec_ref(trans, root, eb, 0); 8861 BUG_ON(ret); /* -ENOMEM */ 8862 ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb); 8863 if (ret) { 8864 btrfs_err_rl(fs_info, 8865 "error %d accounting leaf items. 
Quota is out of sync, rescan required.", 8866 ret); 8867 } 8868 } 8869 /* make block locked assertion in clean_tree_block happy */ 8870 if (!path->locks[level] && 8871 btrfs_header_generation(eb) == trans->transid) { 8872 btrfs_tree_lock(eb); 8873 btrfs_set_lock_blocking(eb); 8874 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8875 } 8876 clean_tree_block(fs_info, eb); 8877 } 8878 8879 if (eb == root->node) { 8880 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8881 parent = eb->start; 8882 else 8883 BUG_ON(root->root_key.objectid != 8884 btrfs_header_owner(eb)); 8885 } else { 8886 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 8887 parent = path->nodes[level + 1]->start; 8888 else 8889 BUG_ON(root->root_key.objectid != 8890 btrfs_header_owner(path->nodes[level + 1])); 8891 } 8892 8893 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); 8894 out: 8895 wc->refs[level] = 0; 8896 wc->flags[level] = 0; 8897 return 0; 8898 } 8899 8900 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 8901 struct btrfs_root *root, 8902 struct btrfs_path *path, 8903 struct walk_control *wc) 8904 { 8905 int level = wc->level; 8906 int lookup_info = 1; 8907 int ret; 8908 8909 while (level >= 0) { 8910 ret = walk_down_proc(trans, root, path, wc, lookup_info); 8911 if (ret > 0) 8912 break; 8913 8914 if (level == 0) 8915 break; 8916 8917 if (path->slots[level] >= 8918 btrfs_header_nritems(path->nodes[level])) 8919 break; 8920 8921 ret = do_walk_down(trans, root, path, wc, &lookup_info); 8922 if (ret > 0) { 8923 path->slots[level]++; 8924 continue; 8925 } else if (ret < 0) 8926 return ret; 8927 level = wc->level; 8928 } 8929 return 0; 8930 } 8931 8932 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 8933 struct btrfs_root *root, 8934 struct btrfs_path *path, 8935 struct walk_control *wc, int max_level) 8936 { 8937 int level = wc->level; 8938 int ret; 8939 8940 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 8941 while (level < max_level && path->nodes[level]) { 8942 wc->level = level; 8943 if (path->slots[level] + 1 < 8944 btrfs_header_nritems(path->nodes[level])) { 8945 path->slots[level]++; 8946 return 0; 8947 } else { 8948 ret = walk_up_proc(trans, root, path, wc); 8949 if (ret > 0) 8950 return 0; 8951 8952 if (path->locks[level]) { 8953 btrfs_tree_unlock_rw(path->nodes[level], 8954 path->locks[level]); 8955 path->locks[level] = 0; 8956 } 8957 free_extent_buffer(path->nodes[level]); 8958 path->nodes[level] = NULL; 8959 level++; 8960 } 8961 } 8962 return 1; 8963 } 8964 8965 /* 8966 * drop a subvolume tree. 8967 * 8968 * this function traverses the tree freeing any blocks that only 8969 * referenced by the tree. 8970 * 8971 * when a shared tree block is found. this function decreases its 8972 * reference count by one. if update_ref is true, this function 8973 * also make sure backrefs for the shared block and all lower level 8974 * blocks are properly updated. 
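 *
 * progress is recorded in root_item->drop_progress after each
 * transaction, so an interrupted drop can later be resumed from the
 * same key.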
8975 * 8976 * If called with for_reloc == 0, may exit early with -EAGAIN 8977 */ 8978 int btrfs_drop_snapshot(struct btrfs_root *root, 8979 struct btrfs_block_rsv *block_rsv, int update_ref, 8980 int for_reloc) 8981 { 8982 struct btrfs_fs_info *fs_info = root->fs_info; 8983 struct btrfs_path *path; 8984 struct btrfs_trans_handle *trans; 8985 struct btrfs_root *tree_root = fs_info->tree_root; 8986 struct btrfs_root_item *root_item = &root->root_item; 8987 struct walk_control *wc; 8988 struct btrfs_key key; 8989 int err = 0; 8990 int ret; 8991 int level; 8992 bool root_dropped = false; 8993 8994 btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid); 8995 8996 path = btrfs_alloc_path(); 8997 if (!path) { 8998 err = -ENOMEM; 8999 goto out; 9000 } 9001 9002 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9003 if (!wc) { 9004 btrfs_free_path(path); 9005 err = -ENOMEM; 9006 goto out; 9007 } 9008 9009 trans = btrfs_start_transaction(tree_root, 0); 9010 if (IS_ERR(trans)) { 9011 err = PTR_ERR(trans); 9012 goto out_free; 9013 } 9014 9015 if (block_rsv) 9016 trans->block_rsv = block_rsv; 9017 9018 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 9019 level = btrfs_header_level(root->node); 9020 path->nodes[level] = btrfs_lock_root_node(root); 9021 btrfs_set_lock_blocking(path->nodes[level]); 9022 path->slots[level] = 0; 9023 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9024 memset(&wc->update_progress, 0, 9025 sizeof(wc->update_progress)); 9026 } else { 9027 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 9028 memcpy(&wc->update_progress, &key, 9029 sizeof(wc->update_progress)); 9030 9031 level = root_item->drop_level; 9032 BUG_ON(level == 0); 9033 path->lowest_level = level; 9034 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9035 path->lowest_level = 0; 9036 if (ret < 0) { 9037 err = ret; 9038 goto out_end_trans; 9039 } 9040 WARN_ON(ret > 0); 9041 9042 /* 9043 * unlock our path, this is safe because only this 9044 * function is allowed to delete this snapshot 9045 */ 9046 btrfs_unlock_up_safe(path, 0); 9047 9048 level = btrfs_header_level(root->node); 9049 while (1) { 9050 btrfs_tree_lock(path->nodes[level]); 9051 btrfs_set_lock_blocking(path->nodes[level]); 9052 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9053 9054 ret = btrfs_lookup_extent_info(trans, fs_info, 9055 path->nodes[level]->start, 9056 level, 1, &wc->refs[level], 9057 &wc->flags[level]); 9058 if (ret < 0) { 9059 err = ret; 9060 goto out_end_trans; 9061 } 9062 BUG_ON(wc->refs[level] == 0); 9063 9064 if (level == root_item->drop_level) 9065 break; 9066 9067 btrfs_tree_unlock(path->nodes[level]); 9068 path->locks[level] = 0; 9069 WARN_ON(wc->refs[level] != 1); 9070 level--; 9071 } 9072 } 9073 9074 wc->level = level; 9075 wc->shared_level = -1; 9076 wc->stage = DROP_REFERENCE; 9077 wc->update_ref = update_ref; 9078 wc->keep_locks = 0; 9079 wc->for_reloc = for_reloc; 9080 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9081 9082 while (1) { 9083 9084 ret = walk_down_tree(trans, root, path, wc); 9085 if (ret < 0) { 9086 err = ret; 9087 break; 9088 } 9089 9090 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 9091 if (ret < 0) { 9092 err = ret; 9093 break; 9094 } 9095 9096 if (ret > 0) { 9097 BUG_ON(wc->stage != DROP_REFERENCE); 9098 break; 9099 } 9100 9101 if (wc->stage == DROP_REFERENCE) { 9102 level = wc->level; 9103 btrfs_node_key(path->nodes[level], 9104 &root_item->drop_progress, 9105 path->slots[level]); 9106 root_item->drop_level = level; 9107 } 9108 9109 BUG_ON(wc->level == 0); 9110 if 
(btrfs_should_end_transaction(trans) || 9111 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 9112 ret = btrfs_update_root(trans, tree_root, 9113 &root->root_key, 9114 root_item); 9115 if (ret) { 9116 btrfs_abort_transaction(trans, ret); 9117 err = ret; 9118 goto out_end_trans; 9119 } 9120 9121 btrfs_end_transaction_throttle(trans); 9122 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 9123 btrfs_debug(fs_info, 9124 "drop snapshot early exit"); 9125 err = -EAGAIN; 9126 goto out_free; 9127 } 9128 9129 trans = btrfs_start_transaction(tree_root, 0); 9130 if (IS_ERR(trans)) { 9131 err = PTR_ERR(trans); 9132 goto out_free; 9133 } 9134 if (block_rsv) 9135 trans->block_rsv = block_rsv; 9136 } 9137 } 9138 btrfs_release_path(path); 9139 if (err) 9140 goto out_end_trans; 9141 9142 ret = btrfs_del_root(trans, tree_root, &root->root_key); 9143 if (ret) { 9144 btrfs_abort_transaction(trans, ret); 9145 goto out_end_trans; 9146 } 9147 9148 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 9149 ret = btrfs_find_root(tree_root, &root->root_key, path, 9150 NULL, NULL); 9151 if (ret < 0) { 9152 btrfs_abort_transaction(trans, ret); 9153 err = ret; 9154 goto out_end_trans; 9155 } else if (ret > 0) { 9156 /* if we fail to delete the orphan item this time 9157 * around, it'll get picked up the next time. 9158 * 9159 * The most common failure here is just -ENOENT. 9160 */ 9161 btrfs_del_orphan_item(trans, tree_root, 9162 root->root_key.objectid); 9163 } 9164 } 9165 9166 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) { 9167 btrfs_add_dropped_root(trans, root); 9168 } else { 9169 free_extent_buffer(root->node); 9170 free_extent_buffer(root->commit_root); 9171 btrfs_put_fs_root(root); 9172 } 9173 root_dropped = true; 9174 out_end_trans: 9175 btrfs_end_transaction_throttle(trans); 9176 out_free: 9177 kfree(wc); 9178 btrfs_free_path(path); 9179 out: 9180 /* 9181 * So if we need to stop dropping the snapshot for whatever reason we 9182 * need to make sure to add it back to the dead root list so that we 9183 * keep trying to do the work later. This also cleans up roots if we 9184 * don't have it in the radix (like when we recover after a power fail 9185 * or unmount) so we don't leak memory. 9186 */ 9187 if (!for_reloc && root_dropped == false) 9188 btrfs_add_dead_root(root); 9189 if (err && err != -EAGAIN) 9190 btrfs_handle_fs_error(fs_info, err, NULL); 9191 return err; 9192 } 9193 9194 /* 9195 * drop subtree rooted at tree block 'node'. 
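 *
 * this uses the same walk_control machinery as btrfs_drop_snapshot(),
 * but with 'parent' installed as the upper boundary so only blocks
 * below 'node' are walked and freed.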
9196 * 9197 * NOTE: this function will unlock and release tree block 'node' 9198 * only used by relocation code 9199 */ 9200 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 9201 struct btrfs_root *root, 9202 struct extent_buffer *node, 9203 struct extent_buffer *parent) 9204 { 9205 struct btrfs_fs_info *fs_info = root->fs_info; 9206 struct btrfs_path *path; 9207 struct walk_control *wc; 9208 int level; 9209 int parent_level; 9210 int ret = 0; 9211 int wret; 9212 9213 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 9214 9215 path = btrfs_alloc_path(); 9216 if (!path) 9217 return -ENOMEM; 9218 9219 wc = kzalloc(sizeof(*wc), GFP_NOFS); 9220 if (!wc) { 9221 btrfs_free_path(path); 9222 return -ENOMEM; 9223 } 9224 9225 btrfs_assert_tree_locked(parent); 9226 parent_level = btrfs_header_level(parent); 9227 extent_buffer_get(parent); 9228 path->nodes[parent_level] = parent; 9229 path->slots[parent_level] = btrfs_header_nritems(parent); 9230 9231 btrfs_assert_tree_locked(node); 9232 level = btrfs_header_level(node); 9233 path->nodes[level] = node; 9234 path->slots[level] = 0; 9235 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 9236 9237 wc->refs[parent_level] = 1; 9238 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 9239 wc->level = level; 9240 wc->shared_level = -1; 9241 wc->stage = DROP_REFERENCE; 9242 wc->update_ref = 0; 9243 wc->keep_locks = 1; 9244 wc->for_reloc = 1; 9245 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 9246 9247 while (1) { 9248 wret = walk_down_tree(trans, root, path, wc); 9249 if (wret < 0) { 9250 ret = wret; 9251 break; 9252 } 9253 9254 wret = walk_up_tree(trans, root, path, wc, parent_level); 9255 if (wret < 0) 9256 ret = wret; 9257 if (wret != 0) 9258 break; 9259 } 9260 9261 kfree(wc); 9262 btrfs_free_path(path); 9263 return ret; 9264 } 9265 9266 static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) 9267 { 9268 u64 num_devices; 9269 u64 stripped; 9270 9271 /* 9272 * if restripe for this chunk_type is on pick target profile and 9273 * return, otherwise do the usual balance 9274 */ 9275 stripped = get_restripe_target(fs_info, flags); 9276 if (stripped) 9277 return extended_to_chunk(stripped); 9278 9279 num_devices = fs_info->fs_devices->rw_devices; 9280 9281 stripped = BTRFS_BLOCK_GROUP_RAID0 | 9282 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 9283 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 9284 9285 if (num_devices == 1) { 9286 stripped |= BTRFS_BLOCK_GROUP_DUP; 9287 stripped = flags & ~stripped; 9288 9289 /* turn raid0 into single device chunks */ 9290 if (flags & BTRFS_BLOCK_GROUP_RAID0) 9291 return stripped; 9292 9293 /* turn mirroring into duplication */ 9294 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 9295 BTRFS_BLOCK_GROUP_RAID10)) 9296 return stripped | BTRFS_BLOCK_GROUP_DUP; 9297 } else { 9298 /* they already had raid on here, just return */ 9299 if (flags & stripped) 9300 return flags; 9301 9302 stripped |= BTRFS_BLOCK_GROUP_DUP; 9303 stripped = flags & ~stripped; 9304 9305 /* switch duplicated blocks with raid1 */ 9306 if (flags & BTRFS_BLOCK_GROUP_DUP) 9307 return stripped | BTRFS_BLOCK_GROUP_RAID1; 9308 9309 /* this is drive concat, leave it alone */ 9310 } 9311 9312 return flags; 9313 } 9314 9315 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) 9316 { 9317 struct btrfs_space_info *sinfo = cache->space_info; 9318 u64 num_bytes; 9319 u64 min_allocable_bytes; 9320 int ret = -ENOSPC; 9321 9322 /* 9323 * We need some metadata space and system metadata space for 
9324 * allocating chunks in some corner cases until we force to set 9325 * it to be readonly. 9326 */ 9327 if ((sinfo->flags & 9328 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && 9329 !force) 9330 min_allocable_bytes = SZ_1M; 9331 else 9332 min_allocable_bytes = 0; 9333 9334 spin_lock(&sinfo->lock); 9335 spin_lock(&cache->lock); 9336 9337 if (cache->ro) { 9338 cache->ro++; 9339 ret = 0; 9340 goto out; 9341 } 9342 9343 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 9344 cache->bytes_super - btrfs_block_group_used(&cache->item); 9345 9346 if (btrfs_space_info_used(sinfo, true) + num_bytes + 9347 min_allocable_bytes <= sinfo->total_bytes) { 9348 sinfo->bytes_readonly += num_bytes; 9349 cache->ro++; 9350 list_add_tail(&cache->ro_list, &sinfo->ro_bgs); 9351 ret = 0; 9352 } 9353 out: 9354 spin_unlock(&cache->lock); 9355 spin_unlock(&sinfo->lock); 9356 return ret; 9357 } 9358 9359 int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info, 9360 struct btrfs_block_group_cache *cache) 9361 9362 { 9363 struct btrfs_trans_handle *trans; 9364 u64 alloc_flags; 9365 int ret; 9366 9367 again: 9368 trans = btrfs_join_transaction(fs_info->extent_root); 9369 if (IS_ERR(trans)) 9370 return PTR_ERR(trans); 9371 9372 /* 9373 * we're not allowed to set block groups readonly after the dirty 9374 * block groups cache has started writing. If it already started, 9375 * back off and let this transaction commit 9376 */ 9377 mutex_lock(&fs_info->ro_block_group_mutex); 9378 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { 9379 u64 transid = trans->transid; 9380 9381 mutex_unlock(&fs_info->ro_block_group_mutex); 9382 btrfs_end_transaction(trans); 9383 9384 ret = btrfs_wait_for_commit(fs_info, transid); 9385 if (ret) 9386 return ret; 9387 goto again; 9388 } 9389 9390 /* 9391 * if we are changing raid levels, try to allocate a corresponding 9392 * block group with the new raid level. 9393 */ 9394 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9395 if (alloc_flags != cache->flags) { 9396 ret = do_chunk_alloc(trans, fs_info, alloc_flags, 9397 CHUNK_ALLOC_FORCE); 9398 /* 9399 * ENOSPC is allowed here, we may have enough space 9400 * already allocated at the new raid level to 9401 * carry on 9402 */ 9403 if (ret == -ENOSPC) 9404 ret = 0; 9405 if (ret < 0) 9406 goto out; 9407 } 9408 9409 ret = inc_block_group_ro(cache, 0); 9410 if (!ret) 9411 goto out; 9412 alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); 9413 ret = do_chunk_alloc(trans, fs_info, alloc_flags, 9414 CHUNK_ALLOC_FORCE); 9415 if (ret < 0) 9416 goto out; 9417 ret = inc_block_group_ro(cache, 0); 9418 out: 9419 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { 9420 alloc_flags = update_block_group_flags(fs_info, cache->flags); 9421 mutex_lock(&fs_info->chunk_mutex); 9422 check_system_chunk(trans, fs_info, alloc_flags); 9423 mutex_unlock(&fs_info->chunk_mutex); 9424 } 9425 mutex_unlock(&fs_info->ro_block_group_mutex); 9426 9427 btrfs_end_transaction(trans); 9428 return ret; 9429 } 9430 9431 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, 9432 struct btrfs_fs_info *fs_info, u64 type) 9433 { 9434 u64 alloc_flags = get_alloc_profile(fs_info, type); 9435 9436 return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE); 9437 } 9438 9439 /* 9440 * helper to account the unused space of all the readonly block group in the 9441 * space_info. takes mirrors into account. 
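 *
 * for raid1, raid10 and dup the unused bytes are counted twice
 * (factor 2) because each free byte in the block group corresponds to
 * two bytes of raw disk space; all other profiles use a factor of 1.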
9442 */ 9443 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) 9444 { 9445 struct btrfs_block_group_cache *block_group; 9446 u64 free_bytes = 0; 9447 int factor; 9448 9449 /* It's df, we don't care if it's racy */ 9450 if (list_empty(&sinfo->ro_bgs)) 9451 return 0; 9452 9453 spin_lock(&sinfo->lock); 9454 list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { 9455 spin_lock(&block_group->lock); 9456 9457 if (!block_group->ro) { 9458 spin_unlock(&block_group->lock); 9459 continue; 9460 } 9461 9462 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | 9463 BTRFS_BLOCK_GROUP_RAID10 | 9464 BTRFS_BLOCK_GROUP_DUP)) 9465 factor = 2; 9466 else 9467 factor = 1; 9468 9469 free_bytes += (block_group->key.offset - 9470 btrfs_block_group_used(&block_group->item)) * 9471 factor; 9472 9473 spin_unlock(&block_group->lock); 9474 } 9475 spin_unlock(&sinfo->lock); 9476 9477 return free_bytes; 9478 } 9479 9480 void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) 9481 { 9482 struct btrfs_space_info *sinfo = cache->space_info; 9483 u64 num_bytes; 9484 9485 BUG_ON(!cache->ro); 9486 9487 spin_lock(&sinfo->lock); 9488 spin_lock(&cache->lock); 9489 if (!--cache->ro) { 9490 num_bytes = cache->key.offset - cache->reserved - 9491 cache->pinned - cache->bytes_super - 9492 btrfs_block_group_used(&cache->item); 9493 sinfo->bytes_readonly -= num_bytes; 9494 list_del_init(&cache->ro_list); 9495 } 9496 spin_unlock(&cache->lock); 9497 spin_unlock(&sinfo->lock); 9498 } 9499 9500 /* 9501 * checks to see if its even possible to relocate this block group. 9502 * 9503 * @return - -1 if it's not a good idea to relocate this block group, 0 if its 9504 * ok to go ahead and try. 9505 */ 9506 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) 9507 { 9508 struct btrfs_root *root = fs_info->extent_root; 9509 struct btrfs_block_group_cache *block_group; 9510 struct btrfs_space_info *space_info; 9511 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 9512 struct btrfs_device *device; 9513 struct btrfs_trans_handle *trans; 9514 u64 min_free; 9515 u64 dev_min = 1; 9516 u64 dev_nr = 0; 9517 u64 target; 9518 int debug; 9519 int index; 9520 int full = 0; 9521 int ret = 0; 9522 9523 debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); 9524 9525 block_group = btrfs_lookup_block_group(fs_info, bytenr); 9526 9527 /* odd, couldn't find the block group, leave it alone */ 9528 if (!block_group) { 9529 if (debug) 9530 btrfs_warn(fs_info, 9531 "can't find block group for bytenr %llu", 9532 bytenr); 9533 return -1; 9534 } 9535 9536 min_free = btrfs_block_group_used(&block_group->item); 9537 9538 /* no bytes used, we're good */ 9539 if (!min_free) 9540 goto out; 9541 9542 space_info = block_group->space_info; 9543 spin_lock(&space_info->lock); 9544 9545 full = space_info->full; 9546 9547 /* 9548 * if this is the last block group we have in this space, we can't 9549 * relocate it unless we're able to allocate a new chunk below. 9550 * 9551 * Otherwise, we need to make sure we have room in the space to handle 9552 * all of the extents from this block group. 
If we can, we're good 9553 */ 9554 if ((space_info->total_bytes != block_group->key.offset) && 9555 (btrfs_space_info_used(space_info, false) + min_free < 9556 space_info->total_bytes)) { 9557 spin_unlock(&space_info->lock); 9558 goto out; 9559 } 9560 spin_unlock(&space_info->lock); 9561 9562 /* 9563 * ok we don't have enough space, but maybe we have free space on our 9564 * devices to allocate new chunks for relocation, so loop through our 9565 * alloc devices and guess if we have enough space. if this block 9566 * group is going to be restriped, run checks against the target 9567 * profile instead of the current one. 9568 */ 9569 ret = -1; 9570 9571 /* 9572 * index: 9573 * 0: raid10 9574 * 1: raid1 9575 * 2: dup 9576 * 3: raid0 9577 * 4: single 9578 */ 9579 target = get_restripe_target(fs_info, block_group->flags); 9580 if (target) { 9581 index = __get_raid_index(extended_to_chunk(target)); 9582 } else { 9583 /* 9584 * this is just a balance, so if we were marked as full 9585 * we know there is no space for a new chunk 9586 */ 9587 if (full) { 9588 if (debug) 9589 btrfs_warn(fs_info, 9590 "no space to alloc new chunk for block group %llu", 9591 block_group->key.objectid); 9592 goto out; 9593 } 9594 9595 index = get_block_group_index(block_group); 9596 } 9597 9598 if (index == BTRFS_RAID_RAID10) { 9599 dev_min = 4; 9600 /* Divide by 2 */ 9601 min_free >>= 1; 9602 } else if (index == BTRFS_RAID_RAID1) { 9603 dev_min = 2; 9604 } else if (index == BTRFS_RAID_DUP) { 9605 /* Multiply by 2 */ 9606 min_free <<= 1; 9607 } else if (index == BTRFS_RAID_RAID0) { 9608 dev_min = fs_devices->rw_devices; 9609 min_free = div64_u64(min_free, dev_min); 9610 } 9611 9612 /* We need to do this so that we can look at pending chunks */ 9613 trans = btrfs_join_transaction(root); 9614 if (IS_ERR(trans)) { 9615 ret = PTR_ERR(trans); 9616 goto out; 9617 } 9618 9619 mutex_lock(&fs_info->chunk_mutex); 9620 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 9621 u64 dev_offset; 9622 9623 /* 9624 * check to make sure we can actually find a chunk with enough 9625 * space to fit our block group in. 
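 *
 * min_free was already scaled above for the relevant profile (halved
 * for raid10, doubled for dup, split across stripes for raid0), so a
 * plain per-device comparison is enough here.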
9626 */ 9627 if (device->total_bytes > device->bytes_used + min_free && 9628 !device->is_tgtdev_for_dev_replace) { 9629 ret = find_free_dev_extent(trans, device, min_free, 9630 &dev_offset, NULL); 9631 if (!ret) 9632 dev_nr++; 9633 9634 if (dev_nr >= dev_min) 9635 break; 9636 9637 ret = -1; 9638 } 9639 } 9640 if (debug && ret == -1) 9641 btrfs_warn(fs_info, 9642 "no space to allocate a new chunk for block group %llu", 9643 block_group->key.objectid); 9644 mutex_unlock(&fs_info->chunk_mutex); 9645 btrfs_end_transaction(trans); 9646 out: 9647 btrfs_put_block_group(block_group); 9648 return ret; 9649 } 9650 9651 static int find_first_block_group(struct btrfs_fs_info *fs_info, 9652 struct btrfs_path *path, 9653 struct btrfs_key *key) 9654 { 9655 struct btrfs_root *root = fs_info->extent_root; 9656 int ret = 0; 9657 struct btrfs_key found_key; 9658 struct extent_buffer *leaf; 9659 int slot; 9660 9661 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 9662 if (ret < 0) 9663 goto out; 9664 9665 while (1) { 9666 slot = path->slots[0]; 9667 leaf = path->nodes[0]; 9668 if (slot >= btrfs_header_nritems(leaf)) { 9669 ret = btrfs_next_leaf(root, path); 9670 if (ret == 0) 9671 continue; 9672 if (ret < 0) 9673 goto out; 9674 break; 9675 } 9676 btrfs_item_key_to_cpu(leaf, &found_key, slot); 9677 9678 if (found_key.objectid >= key->objectid && 9679 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { 9680 struct extent_map_tree *em_tree; 9681 struct extent_map *em; 9682 9683 em_tree = &root->fs_info->mapping_tree.map_tree; 9684 read_lock(&em_tree->lock); 9685 em = lookup_extent_mapping(em_tree, found_key.objectid, 9686 found_key.offset); 9687 read_unlock(&em_tree->lock); 9688 if (!em) { 9689 btrfs_err(fs_info, 9690 "logical %llu len %llu found bg but no related chunk", 9691 found_key.objectid, found_key.offset); 9692 ret = -ENOENT; 9693 } else { 9694 ret = 0; 9695 } 9696 free_extent_map(em); 9697 goto out; 9698 } 9699 path->slots[0]++; 9700 } 9701 out: 9702 return ret; 9703 } 9704 9705 void btrfs_put_block_group_cache(struct btrfs_fs_info *info) 9706 { 9707 struct btrfs_block_group_cache *block_group; 9708 u64 last = 0; 9709 9710 while (1) { 9711 struct inode *inode; 9712 9713 block_group = btrfs_lookup_first_block_group(info, last); 9714 while (block_group) { 9715 spin_lock(&block_group->lock); 9716 if (block_group->iref) 9717 break; 9718 spin_unlock(&block_group->lock); 9719 block_group = next_block_group(info, block_group); 9720 } 9721 if (!block_group) { 9722 if (last == 0) 9723 break; 9724 last = 0; 9725 continue; 9726 } 9727 9728 inode = block_group->inode; 9729 block_group->iref = 0; 9730 block_group->inode = NULL; 9731 spin_unlock(&block_group->lock); 9732 ASSERT(block_group->io_ctl.inode == NULL); 9733 iput(inode); 9734 last = block_group->key.objectid + block_group->key.offset; 9735 btrfs_put_block_group(block_group); 9736 } 9737 } 9738 9739 /* 9740 * Must be called only after stopping all workers, since we could have block 9741 * group caching kthreads running, and therefore they could race with us if we 9742 * freed the block groups before stopping them. 
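 * The first loop below also drops the references that the
 * fs_info->caching_block_groups list still holds on any leftover
 * caching_control structures.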
9743 */ 9744 int btrfs_free_block_groups(struct btrfs_fs_info *info) 9745 { 9746 struct btrfs_block_group_cache *block_group; 9747 struct btrfs_space_info *space_info; 9748 struct btrfs_caching_control *caching_ctl; 9749 struct rb_node *n; 9750 9751 down_write(&info->commit_root_sem); 9752 while (!list_empty(&info->caching_block_groups)) { 9753 caching_ctl = list_entry(info->caching_block_groups.next, 9754 struct btrfs_caching_control, list); 9755 list_del(&caching_ctl->list); 9756 put_caching_control(caching_ctl); 9757 } 9758 up_write(&info->commit_root_sem); 9759 9760 spin_lock(&info->unused_bgs_lock); 9761 while (!list_empty(&info->unused_bgs)) { 9762 block_group = list_first_entry(&info->unused_bgs, 9763 struct btrfs_block_group_cache, 9764 bg_list); 9765 list_del_init(&block_group->bg_list); 9766 btrfs_put_block_group(block_group); 9767 } 9768 spin_unlock(&info->unused_bgs_lock); 9769 9770 spin_lock(&info->block_group_cache_lock); 9771 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { 9772 block_group = rb_entry(n, struct btrfs_block_group_cache, 9773 cache_node); 9774 rb_erase(&block_group->cache_node, 9775 &info->block_group_cache_tree); 9776 RB_CLEAR_NODE(&block_group->cache_node); 9777 spin_unlock(&info->block_group_cache_lock); 9778 9779 down_write(&block_group->space_info->groups_sem); 9780 list_del(&block_group->list); 9781 up_write(&block_group->space_info->groups_sem); 9782 9783 /* 9784 * We haven't cached this block group, which means we could 9785 * possibly have excluded extents on this block group. 9786 */ 9787 if (block_group->cached == BTRFS_CACHE_NO || 9788 block_group->cached == BTRFS_CACHE_ERROR) 9789 free_excluded_extents(info, block_group); 9790 9791 btrfs_remove_free_space_cache(block_group); 9792 ASSERT(block_group->cached != BTRFS_CACHE_STARTED); 9793 ASSERT(list_empty(&block_group->dirty_list)); 9794 ASSERT(list_empty(&block_group->io_list)); 9795 ASSERT(list_empty(&block_group->bg_list)); 9796 ASSERT(atomic_read(&block_group->count) == 1); 9797 btrfs_put_block_group(block_group); 9798 9799 spin_lock(&info->block_group_cache_lock); 9800 } 9801 spin_unlock(&info->block_group_cache_lock); 9802 9803 /* now that all the block groups are freed, go through and 9804 * free all the space_info structs. This is only called during 9805 * the final stages of unmount, and so we know nobody is 9806 * using them. We call synchronize_rcu() once before we start, 9807 * just to be on the safe side. 9808 */ 9809 synchronize_rcu(); 9810 9811 release_global_block_rsv(info); 9812 9813 while (!list_empty(&info->space_info)) { 9814 int i; 9815 9816 space_info = list_entry(info->space_info.next, 9817 struct btrfs_space_info, 9818 list); 9819 9820 /* 9821 * Do not hide this behind enospc_debug, this is actually 9822 * important and indicates a real bug if this happens. 
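 * Any non-zero bytes_pinned, bytes_reserved or bytes_may_use left at
 * this point means some reservation was never released; the
 * dump_space_info() call below prints the counters to help track the
 * leak down.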
9823 */ 9824 if (WARN_ON(space_info->bytes_pinned > 0 || 9825 space_info->bytes_reserved > 0 || 9826 space_info->bytes_may_use > 0)) 9827 dump_space_info(info, space_info, 0, 0); 9828 list_del(&space_info->list); 9829 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 9830 struct kobject *kobj; 9831 kobj = space_info->block_group_kobjs[i]; 9832 space_info->block_group_kobjs[i] = NULL; 9833 if (kobj) { 9834 kobject_del(kobj); 9835 kobject_put(kobj); 9836 } 9837 } 9838 kobject_del(&space_info->kobj); 9839 kobject_put(&space_info->kobj); 9840 } 9841 return 0; 9842 } 9843 9844 static void __link_block_group(struct btrfs_space_info *space_info, 9845 struct btrfs_block_group_cache *cache) 9846 { 9847 int index = get_block_group_index(cache); 9848 bool first = false; 9849 9850 down_write(&space_info->groups_sem); 9851 if (list_empty(&space_info->block_groups[index])) 9852 first = true; 9853 list_add_tail(&cache->list, &space_info->block_groups[index]); 9854 up_write(&space_info->groups_sem); 9855 9856 if (first) { 9857 struct raid_kobject *rkobj; 9858 int ret; 9859 9860 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); 9861 if (!rkobj) 9862 goto out_err; 9863 rkobj->raid_type = index; 9864 kobject_init(&rkobj->kobj, &btrfs_raid_ktype); 9865 ret = kobject_add(&rkobj->kobj, &space_info->kobj, 9866 "%s", get_raid_name(index)); 9867 if (ret) { 9868 kobject_put(&rkobj->kobj); 9869 goto out_err; 9870 } 9871 space_info->block_group_kobjs[index] = &rkobj->kobj; 9872 } 9873 9874 return; 9875 out_err: 9876 btrfs_warn(cache->fs_info, 9877 "failed to add kobject for block cache, ignoring"); 9878 } 9879 9880 static struct btrfs_block_group_cache * 9881 btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, 9882 u64 start, u64 size) 9883 { 9884 struct btrfs_block_group_cache *cache; 9885 9886 cache = kzalloc(sizeof(*cache), GFP_NOFS); 9887 if (!cache) 9888 return NULL; 9889 9890 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), 9891 GFP_NOFS); 9892 if (!cache->free_space_ctl) { 9893 kfree(cache); 9894 return NULL; 9895 } 9896 9897 cache->key.objectid = start; 9898 cache->key.offset = size; 9899 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9900 9901 cache->sectorsize = fs_info->sectorsize; 9902 cache->fs_info = fs_info; 9903 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, 9904 &fs_info->mapping_tree, 9905 start); 9906 set_free_space_tree_thresholds(cache); 9907 9908 atomic_set(&cache->count, 1); 9909 spin_lock_init(&cache->lock); 9910 init_rwsem(&cache->data_rwsem); 9911 INIT_LIST_HEAD(&cache->list); 9912 INIT_LIST_HEAD(&cache->cluster_list); 9913 INIT_LIST_HEAD(&cache->bg_list); 9914 INIT_LIST_HEAD(&cache->ro_list); 9915 INIT_LIST_HEAD(&cache->dirty_list); 9916 INIT_LIST_HEAD(&cache->io_list); 9917 btrfs_init_free_space_ctl(cache); 9918 atomic_set(&cache->trimming, 0); 9919 mutex_init(&cache->free_space_lock); 9920 9921 return cache; 9922 } 9923 9924 int btrfs_read_block_groups(struct btrfs_fs_info *info) 9925 { 9926 struct btrfs_path *path; 9927 int ret; 9928 struct btrfs_block_group_cache *cache; 9929 struct btrfs_space_info *space_info; 9930 struct btrfs_key key; 9931 struct btrfs_key found_key; 9932 struct extent_buffer *leaf; 9933 int need_clear = 0; 9934 u64 cache_gen; 9935 u64 feature; 9936 int mixed; 9937 9938 feature = btrfs_super_incompat_flags(info->super_copy); 9939 mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); 9940 9941 key.objectid = 0; 9942 key.offset = 0; 9943 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 9944 path = btrfs_alloc_path(); 9945 if (!path) 9946 return -ENOMEM; 
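	/*
	 * If the free space cache is in use but its recorded generation
	 * does not match the super block generation, or clear_cache was
	 * requested, need_clear is set below so that every block group
	 * gets BTRFS_DC_CLEAR and has its space cache rebuilt.
	 */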
9947 path->reada = READA_FORWARD; 9948 9949 cache_gen = btrfs_super_cache_generation(info->super_copy); 9950 if (btrfs_test_opt(info, SPACE_CACHE) && 9951 btrfs_super_generation(info->super_copy) != cache_gen) 9952 need_clear = 1; 9953 if (btrfs_test_opt(info, CLEAR_CACHE)) 9954 need_clear = 1; 9955 9956 while (1) { 9957 ret = find_first_block_group(info, path, &key); 9958 if (ret > 0) 9959 break; 9960 if (ret != 0) 9961 goto error; 9962 9963 leaf = path->nodes[0]; 9964 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 9965 9966 cache = btrfs_create_block_group_cache(info, found_key.objectid, 9967 found_key.offset); 9968 if (!cache) { 9969 ret = -ENOMEM; 9970 goto error; 9971 } 9972 9973 if (need_clear) { 9974 /* 9975 * When we mount with old space cache, we need to 9976 * set BTRFS_DC_CLEAR and set dirty flag. 9977 * 9978 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we 9979 * truncate the old free space cache inode and 9980 * setup a new one. 9981 * b) Setting 'dirty flag' makes sure that we flush 9982 * the new space cache info onto disk. 9983 */ 9984 if (btrfs_test_opt(info, SPACE_CACHE)) 9985 cache->disk_cache_state = BTRFS_DC_CLEAR; 9986 } 9987 9988 read_extent_buffer(leaf, &cache->item, 9989 btrfs_item_ptr_offset(leaf, path->slots[0]), 9990 sizeof(cache->item)); 9991 cache->flags = btrfs_block_group_flags(&cache->item); 9992 if (!mixed && 9993 ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && 9994 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { 9995 btrfs_err(info, 9996 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", 9997 cache->key.objectid); 9998 ret = -EINVAL; 9999 goto error; 10000 } 10001 10002 key.objectid = found_key.objectid + found_key.offset; 10003 btrfs_release_path(path); 10004 10005 /* 10006 * We need to exclude the super stripes now so that the space 10007 * info has super bytes accounted for, otherwise we'll think 10008 * we have more space than we actually do. 10009 */ 10010 ret = exclude_super_stripes(info, cache); 10011 if (ret) { 10012 /* 10013 * We may have excluded something, so call this just in 10014 * case. 10015 */ 10016 free_excluded_extents(info, cache); 10017 btrfs_put_block_group(cache); 10018 goto error; 10019 } 10020 10021 /* 10022 * check for two cases, either we are full, and therefore 10023 * don't need to bother with the caching work since we won't 10024 * find any space, or we are empty, and we can just add all 10025 * the space in and be done with it. This saves us _alot_ of 10026 * time, particularly in the full case. 
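 * In both cases the group is marked BTRFS_CACHE_FINISHED right away
 * and its excluded extents are released, so no caching kthread ever
 * has to scan it.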
10027 */ 10028 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 10029 cache->last_byte_to_unpin = (u64)-1; 10030 cache->cached = BTRFS_CACHE_FINISHED; 10031 free_excluded_extents(info, cache); 10032 } else if (btrfs_block_group_used(&cache->item) == 0) { 10033 cache->last_byte_to_unpin = (u64)-1; 10034 cache->cached = BTRFS_CACHE_FINISHED; 10035 add_new_free_space(cache, info, 10036 found_key.objectid, 10037 found_key.objectid + 10038 found_key.offset); 10039 free_excluded_extents(info, cache); 10040 } 10041 10042 ret = btrfs_add_block_group_cache(info, cache); 10043 if (ret) { 10044 btrfs_remove_free_space_cache(cache); 10045 btrfs_put_block_group(cache); 10046 goto error; 10047 } 10048 10049 trace_btrfs_add_block_group(info, cache, 0); 10050 ret = update_space_info(info, cache->flags, found_key.offset, 10051 btrfs_block_group_used(&cache->item), 10052 cache->bytes_super, &space_info); 10053 if (ret) { 10054 btrfs_remove_free_space_cache(cache); 10055 spin_lock(&info->block_group_cache_lock); 10056 rb_erase(&cache->cache_node, 10057 &info->block_group_cache_tree); 10058 RB_CLEAR_NODE(&cache->cache_node); 10059 spin_unlock(&info->block_group_cache_lock); 10060 btrfs_put_block_group(cache); 10061 goto error; 10062 } 10063 10064 cache->space_info = space_info; 10065 10066 __link_block_group(space_info, cache); 10067 10068 set_avail_alloc_bits(info, cache->flags); 10069 if (btrfs_chunk_readonly(info, cache->key.objectid)) { 10070 inc_block_group_ro(cache, 1); 10071 } else if (btrfs_block_group_used(&cache->item) == 0) { 10072 spin_lock(&info->unused_bgs_lock); 10073 /* Should always be true but just in case. */ 10074 if (list_empty(&cache->bg_list)) { 10075 btrfs_get_block_group(cache); 10076 list_add_tail(&cache->bg_list, 10077 &info->unused_bgs); 10078 } 10079 spin_unlock(&info->unused_bgs_lock); 10080 } 10081 } 10082 10083 list_for_each_entry_rcu(space_info, &info->space_info, list) { 10084 if (!(get_alloc_profile(info, space_info->flags) & 10085 (BTRFS_BLOCK_GROUP_RAID10 | 10086 BTRFS_BLOCK_GROUP_RAID1 | 10087 BTRFS_BLOCK_GROUP_RAID5 | 10088 BTRFS_BLOCK_GROUP_RAID6 | 10089 BTRFS_BLOCK_GROUP_DUP))) 10090 continue; 10091 /* 10092 * avoid allocating from un-mirrored block group if there are 10093 * mirrored block groups. 
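 * For example, after a balance converts data from single to raid1,
 * any leftover single or raid0 block groups are forced read-only here
 * so that new allocations only land in the mirrored groups.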
10094 */ 10095 list_for_each_entry(cache, 10096 &space_info->block_groups[BTRFS_RAID_RAID0], 10097 list) 10098 inc_block_group_ro(cache, 1); 10099 list_for_each_entry(cache, 10100 &space_info->block_groups[BTRFS_RAID_SINGLE], 10101 list) 10102 inc_block_group_ro(cache, 1); 10103 } 10104 10105 init_global_block_rsv(info); 10106 ret = 0; 10107 error: 10108 btrfs_free_path(path); 10109 return ret; 10110 } 10111 10112 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 10113 struct btrfs_fs_info *fs_info) 10114 { 10115 struct btrfs_block_group_cache *block_group, *tmp; 10116 struct btrfs_root *extent_root = fs_info->extent_root; 10117 struct btrfs_block_group_item item; 10118 struct btrfs_key key; 10119 int ret = 0; 10120 bool can_flush_pending_bgs = trans->can_flush_pending_bgs; 10121 10122 trans->can_flush_pending_bgs = false; 10123 list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { 10124 if (ret) 10125 goto next; 10126 10127 spin_lock(&block_group->lock); 10128 memcpy(&item, &block_group->item, sizeof(item)); 10129 memcpy(&key, &block_group->key, sizeof(key)); 10130 spin_unlock(&block_group->lock); 10131 10132 ret = btrfs_insert_item(trans, extent_root, &key, &item, 10133 sizeof(item)); 10134 if (ret) 10135 btrfs_abort_transaction(trans, ret); 10136 ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid, 10137 key.offset); 10138 if (ret) 10139 btrfs_abort_transaction(trans, ret); 10140 add_block_group_free_space(trans, fs_info, block_group); 10141 /* already aborted the transaction if it failed. */ 10142 next: 10143 list_del_init(&block_group->bg_list); 10144 } 10145 trans->can_flush_pending_bgs = can_flush_pending_bgs; 10146 } 10147 10148 int btrfs_make_block_group(struct btrfs_trans_handle *trans, 10149 struct btrfs_fs_info *fs_info, u64 bytes_used, 10150 u64 type, u64 chunk_objectid, u64 chunk_offset, 10151 u64 size) 10152 { 10153 struct btrfs_block_group_cache *cache; 10154 int ret; 10155 10156 btrfs_set_log_full_commit(fs_info, trans); 10157 10158 cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); 10159 if (!cache) 10160 return -ENOMEM; 10161 10162 btrfs_set_block_group_used(&cache->item, bytes_used); 10163 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 10164 btrfs_set_block_group_flags(&cache->item, type); 10165 10166 cache->flags = type; 10167 cache->last_byte_to_unpin = (u64)-1; 10168 cache->cached = BTRFS_CACHE_FINISHED; 10169 cache->needs_free_space = 1; 10170 ret = exclude_super_stripes(fs_info, cache); 10171 if (ret) { 10172 /* 10173 * We may have excluded something, so call this just in 10174 * case. 10175 */ 10176 free_excluded_extents(fs_info, cache); 10177 btrfs_put_block_group(cache); 10178 return ret; 10179 } 10180 10181 add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size); 10182 10183 free_excluded_extents(fs_info, cache); 10184 10185 #ifdef CONFIG_BTRFS_DEBUG 10186 if (btrfs_should_fragment_free_space(cache)) { 10187 u64 new_bytes_used = size - bytes_used; 10188 10189 bytes_used += new_bytes_used >> 1; 10190 fragment_free_space(cache); 10191 } 10192 #endif 10193 /* 10194 * Call to ensure the corresponding space_info object is created and 10195 * assigned to our block group, but don't update its counters just yet. 10196 * We want our bg to be added to the rbtree with its ->space_info set. 
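 * The second update_space_info() call further down, made once the
 * group is in the rbtree, is the one that accounts its size,
 * bytes_used and bytes_super.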
10197 */ 10198 ret = update_space_info(fs_info, cache->flags, 0, 0, 0, 10199 &cache->space_info); 10200 if (ret) { 10201 btrfs_remove_free_space_cache(cache); 10202 btrfs_put_block_group(cache); 10203 return ret; 10204 } 10205 10206 ret = btrfs_add_block_group_cache(fs_info, cache); 10207 if (ret) { 10208 btrfs_remove_free_space_cache(cache); 10209 btrfs_put_block_group(cache); 10210 return ret; 10211 } 10212 10213 /* 10214 * Now that our block group has its ->space_info set and is inserted in 10215 * the rbtree, update the space info's counters. 10216 */ 10217 trace_btrfs_add_block_group(fs_info, cache, 1); 10218 ret = update_space_info(fs_info, cache->flags, size, bytes_used, 10219 cache->bytes_super, &cache->space_info); 10220 if (ret) { 10221 btrfs_remove_free_space_cache(cache); 10222 spin_lock(&fs_info->block_group_cache_lock); 10223 rb_erase(&cache->cache_node, 10224 &fs_info->block_group_cache_tree); 10225 RB_CLEAR_NODE(&cache->cache_node); 10226 spin_unlock(&fs_info->block_group_cache_lock); 10227 btrfs_put_block_group(cache); 10228 return ret; 10229 } 10230 update_global_block_rsv(fs_info); 10231 10232 __link_block_group(cache->space_info, cache); 10233 10234 list_add_tail(&cache->bg_list, &trans->new_bgs); 10235 10236 set_avail_alloc_bits(fs_info, type); 10237 return 0; 10238 } 10239 10240 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 10241 { 10242 u64 extra_flags = chunk_to_extended(flags) & 10243 BTRFS_EXTENDED_PROFILE_MASK; 10244 10245 write_seqlock(&fs_info->profiles_lock); 10246 if (flags & BTRFS_BLOCK_GROUP_DATA) 10247 fs_info->avail_data_alloc_bits &= ~extra_flags; 10248 if (flags & BTRFS_BLOCK_GROUP_METADATA) 10249 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 10250 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 10251 fs_info->avail_system_alloc_bits &= ~extra_flags; 10252 write_sequnlock(&fs_info->profiles_lock); 10253 } 10254 10255 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 10256 struct btrfs_fs_info *fs_info, u64 group_start, 10257 struct extent_map *em) 10258 { 10259 struct btrfs_root *root = fs_info->extent_root; 10260 struct btrfs_path *path; 10261 struct btrfs_block_group_cache *block_group; 10262 struct btrfs_free_cluster *cluster; 10263 struct btrfs_root *tree_root = fs_info->tree_root; 10264 struct btrfs_key key; 10265 struct inode *inode; 10266 struct kobject *kobj = NULL; 10267 int ret; 10268 int index; 10269 int factor; 10270 struct btrfs_caching_control *caching_ctl = NULL; 10271 bool remove_em; 10272 10273 block_group = btrfs_lookup_block_group(fs_info, group_start); 10274 BUG_ON(!block_group); 10275 BUG_ON(!block_group->ro); 10276 10277 /* 10278 * Free the reserved super bytes from this block group before 10279 * remove it. 
10280 */ 10281 free_excluded_extents(fs_info, block_group); 10282 10283 memcpy(&key, &block_group->key, sizeof(key)); 10284 index = get_block_group_index(block_group); 10285 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 10286 BTRFS_BLOCK_GROUP_RAID1 | 10287 BTRFS_BLOCK_GROUP_RAID10)) 10288 factor = 2; 10289 else 10290 factor = 1; 10291 10292 /* make sure this block group isn't part of an allocation cluster */ 10293 cluster = &fs_info->data_alloc_cluster; 10294 spin_lock(&cluster->refill_lock); 10295 btrfs_return_cluster_to_free_space(block_group, cluster); 10296 spin_unlock(&cluster->refill_lock); 10297 10298 /* 10299 * make sure this block group isn't part of a metadata 10300 * allocation cluster 10301 */ 10302 cluster = &fs_info->meta_alloc_cluster; 10303 spin_lock(&cluster->refill_lock); 10304 btrfs_return_cluster_to_free_space(block_group, cluster); 10305 spin_unlock(&cluster->refill_lock); 10306 10307 path = btrfs_alloc_path(); 10308 if (!path) { 10309 ret = -ENOMEM; 10310 goto out; 10311 } 10312 10313 /* 10314 * get the inode first so any iput calls done for the io_list 10315 * aren't the final iput (no unlinks allowed now) 10316 */ 10317 inode = lookup_free_space_inode(fs_info, block_group, path); 10318 10319 mutex_lock(&trans->transaction->cache_write_mutex); 10320 /* 10321 * make sure our free spache cache IO is done before remove the 10322 * free space inode 10323 */ 10324 spin_lock(&trans->transaction->dirty_bgs_lock); 10325 if (!list_empty(&block_group->io_list)) { 10326 list_del_init(&block_group->io_list); 10327 10328 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); 10329 10330 spin_unlock(&trans->transaction->dirty_bgs_lock); 10331 btrfs_wait_cache_io(trans, block_group, path); 10332 btrfs_put_block_group(block_group); 10333 spin_lock(&trans->transaction->dirty_bgs_lock); 10334 } 10335 10336 if (!list_empty(&block_group->dirty_list)) { 10337 list_del_init(&block_group->dirty_list); 10338 btrfs_put_block_group(block_group); 10339 } 10340 spin_unlock(&trans->transaction->dirty_bgs_lock); 10341 mutex_unlock(&trans->transaction->cache_write_mutex); 10342 10343 if (!IS_ERR(inode)) { 10344 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 10345 if (ret) { 10346 btrfs_add_delayed_iput(inode); 10347 goto out; 10348 } 10349 clear_nlink(inode); 10350 /* One for the block groups ref */ 10351 spin_lock(&block_group->lock); 10352 if (block_group->iref) { 10353 block_group->iref = 0; 10354 block_group->inode = NULL; 10355 spin_unlock(&block_group->lock); 10356 iput(inode); 10357 } else { 10358 spin_unlock(&block_group->lock); 10359 } 10360 /* One for our lookup ref */ 10361 btrfs_add_delayed_iput(inode); 10362 } 10363 10364 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 10365 key.offset = block_group->key.objectid; 10366 key.type = 0; 10367 10368 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); 10369 if (ret < 0) 10370 goto out; 10371 if (ret > 0) 10372 btrfs_release_path(path); 10373 if (ret == 0) { 10374 ret = btrfs_del_item(trans, tree_root, path); 10375 if (ret) 10376 goto out; 10377 btrfs_release_path(path); 10378 } 10379 10380 spin_lock(&fs_info->block_group_cache_lock); 10381 rb_erase(&block_group->cache_node, 10382 &fs_info->block_group_cache_tree); 10383 RB_CLEAR_NODE(&block_group->cache_node); 10384 10385 if (fs_info->first_logical_byte == block_group->key.objectid) 10386 fs_info->first_logical_byte = (u64)-1; 10387 spin_unlock(&fs_info->block_group_cache_lock); 10388 10389 down_write(&block_group->space_info->groups_sem); 10390 /* 10391 * we must use 
list_del_init so people can check to see if they 10392 * are still on the list after taking the semaphore 10393 */ 10394 list_del_init(&block_group->list); 10395 if (list_empty(&block_group->space_info->block_groups[index])) { 10396 kobj = block_group->space_info->block_group_kobjs[index]; 10397 block_group->space_info->block_group_kobjs[index] = NULL; 10398 clear_avail_alloc_bits(fs_info, block_group->flags); 10399 } 10400 up_write(&block_group->space_info->groups_sem); 10401 if (kobj) { 10402 kobject_del(kobj); 10403 kobject_put(kobj); 10404 } 10405 10406 if (block_group->has_caching_ctl) 10407 caching_ctl = get_caching_control(block_group); 10408 if (block_group->cached == BTRFS_CACHE_STARTED) 10409 wait_block_group_cache_done(block_group); 10410 if (block_group->has_caching_ctl) { 10411 down_write(&fs_info->commit_root_sem); 10412 if (!caching_ctl) { 10413 struct btrfs_caching_control *ctl; 10414 10415 list_for_each_entry(ctl, 10416 &fs_info->caching_block_groups, list) 10417 if (ctl->block_group == block_group) { 10418 caching_ctl = ctl; 10419 atomic_inc(&caching_ctl->count); 10420 break; 10421 } 10422 } 10423 if (caching_ctl) 10424 list_del_init(&caching_ctl->list); 10425 up_write(&fs_info->commit_root_sem); 10426 if (caching_ctl) { 10427 /* Once for the caching bgs list and once for us. */ 10428 put_caching_control(caching_ctl); 10429 put_caching_control(caching_ctl); 10430 } 10431 } 10432 10433 spin_lock(&trans->transaction->dirty_bgs_lock); 10434 if (!list_empty(&block_group->dirty_list)) { 10435 WARN_ON(1); 10436 } 10437 if (!list_empty(&block_group->io_list)) { 10438 WARN_ON(1); 10439 } 10440 spin_unlock(&trans->transaction->dirty_bgs_lock); 10441 btrfs_remove_free_space_cache(block_group); 10442 10443 spin_lock(&block_group->space_info->lock); 10444 list_del_init(&block_group->ro_list); 10445 10446 if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 10447 WARN_ON(block_group->space_info->total_bytes 10448 < block_group->key.offset); 10449 WARN_ON(block_group->space_info->bytes_readonly 10450 < block_group->key.offset); 10451 WARN_ON(block_group->space_info->disk_total 10452 < block_group->key.offset * factor); 10453 } 10454 block_group->space_info->total_bytes -= block_group->key.offset; 10455 block_group->space_info->bytes_readonly -= block_group->key.offset; 10456 block_group->space_info->disk_total -= block_group->key.offset * factor; 10457 10458 spin_unlock(&block_group->space_info->lock); 10459 10460 memcpy(&key, &block_group->key, sizeof(key)); 10461 10462 mutex_lock(&fs_info->chunk_mutex); 10463 if (!list_empty(&em->list)) { 10464 /* We're in the transaction->pending_chunks list. */ 10465 free_extent_map(em); 10466 } 10467 spin_lock(&block_group->lock); 10468 block_group->removed = 1; 10469 /* 10470 * At this point trimming can't start on this block group, because we 10471 * removed the block group from the tree fs_info->block_group_cache_tree 10472 * so no one can't find it anymore and even if someone already got this 10473 * block group before we removed it from the rbtree, they have already 10474 * incremented block_group->trimming - if they didn't, they won't find 10475 * any free space entries because we already removed them all when we 10476 * called btrfs_remove_free_space_cache(). 10477 * 10478 * And we must not remove the extent map from the fs_info->mapping_tree 10479 * to prevent the same logical address range and physical device space 10480 * ranges from being reused for a new block group. 
This is because our 10481 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is 10482 * completely transactionless, so while it is trimming a range the 10483 * currently running transaction might finish and a new one start, 10484 * allowing for new block groups to be created that can reuse the same 10485 * physical device locations unless we take this special care. 10486 * 10487 * There may also be an implicit trim operation if the file system 10488 * is mounted with -odiscard. The same protections must remain 10489 * in place until the extents have been discarded completely when 10490 * the transaction commit has completed. 10491 */ 10492 remove_em = (atomic_read(&block_group->trimming) == 0); 10493 /* 10494 * Make sure a trimmer task always sees the em in the pinned_chunks list 10495 * if it sees block_group->removed == 1 (needs to lock block_group->lock 10496 * before checking block_group->removed). 10497 */ 10498 if (!remove_em) { 10499 /* 10500 * Our em might be in trans->transaction->pending_chunks which 10501 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), 10502 * and so is the fs_info->pinned_chunks list. 10503 * 10504 * So at this point we must be holding the chunk_mutex to avoid 10505 * any races with chunk allocation (more specifically at 10506 * volumes.c:contains_pending_extent()), to ensure it always 10507 * sees the em, either in the pending_chunks list or in the 10508 * pinned_chunks list. 10509 */ 10510 list_move_tail(&em->list, &fs_info->pinned_chunks); 10511 } 10512 spin_unlock(&block_group->lock); 10513 10514 if (remove_em) { 10515 struct extent_map_tree *em_tree; 10516 10517 em_tree = &fs_info->mapping_tree.map_tree; 10518 write_lock(&em_tree->lock); 10519 /* 10520 * The em might be in the pending_chunks list, so make sure the 10521 * chunk mutex is locked, since remove_extent_mapping() will 10522 * delete us from that list. 10523 */ 10524 remove_extent_mapping(em_tree, em); 10525 write_unlock(&em_tree->lock); 10526 /* once for the tree */ 10527 free_extent_map(em); 10528 } 10529 10530 mutex_unlock(&fs_info->chunk_mutex); 10531 10532 ret = remove_block_group_free_space(trans, fs_info, block_group); 10533 if (ret) 10534 goto out; 10535 10536 btrfs_put_block_group(block_group); 10537 btrfs_put_block_group(block_group); 10538 10539 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 10540 if (ret > 0) 10541 ret = -EIO; 10542 if (ret < 0) 10543 goto out; 10544 10545 ret = btrfs_del_item(trans, root, path); 10546 out: 10547 btrfs_free_path(path); 10548 return ret; 10549 } 10550 10551 struct btrfs_trans_handle * 10552 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, 10553 const u64 chunk_offset) 10554 { 10555 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; 10556 struct extent_map *em; 10557 struct map_lookup *map; 10558 unsigned int num_items; 10559 10560 read_lock(&em_tree->lock); 10561 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 10562 read_unlock(&em_tree->lock); 10563 ASSERT(em && em->start == chunk_offset); 10564 10565 /* 10566 * We need to reserve 3 + N units from the metadata space info in order 10567 * to remove a block group (done at btrfs_remove_chunk() and at 10568 * btrfs_remove_block_group()), which are used for: 10569 * 10570 * 1 unit for adding the free space inode's orphan (located in the tree 10571 * of tree roots). 10572 * 1 unit for deleting the block group item (located in the extent 10573 * tree). 
10574 * 1 unit for deleting the free space item (located in tree of tree 10575 * roots). 10576 * N units for deleting N device extent items corresponding to each 10577 * stripe (located in the device tree). 10578 * 10579 * In order to remove a block group we also need to reserve units in the 10580 * system space info in order to update the chunk tree (update one or 10581 * more device items and remove one chunk item), but this is done at 10582 * btrfs_remove_chunk() through a call to check_system_chunk(). 10583 */ 10584 map = em->map_lookup; 10585 num_items = 3 + map->num_stripes; 10586 free_extent_map(em); 10587 10588 return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, 10589 num_items, 1); 10590 } 10591 10592 /* 10593 * Process the unused_bgs list and remove any that don't have any allocated 10594 * space inside of them. 10595 */ 10596 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) 10597 { 10598 struct btrfs_block_group_cache *block_group; 10599 struct btrfs_space_info *space_info; 10600 struct btrfs_trans_handle *trans; 10601 int ret = 0; 10602 10603 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 10604 return; 10605 10606 spin_lock(&fs_info->unused_bgs_lock); 10607 while (!list_empty(&fs_info->unused_bgs)) { 10608 u64 start, end; 10609 int trimming; 10610 10611 block_group = list_first_entry(&fs_info->unused_bgs, 10612 struct btrfs_block_group_cache, 10613 bg_list); 10614 list_del_init(&block_group->bg_list); 10615 10616 space_info = block_group->space_info; 10617 10618 if (ret || btrfs_mixed_space_info(space_info)) { 10619 btrfs_put_block_group(block_group); 10620 continue; 10621 } 10622 spin_unlock(&fs_info->unused_bgs_lock); 10623 10624 mutex_lock(&fs_info->delete_unused_bgs_mutex); 10625 10626 /* Don't want to race with allocators so take the groups_sem */ 10627 down_write(&space_info->groups_sem); 10628 spin_lock(&block_group->lock); 10629 if (block_group->reserved || 10630 btrfs_block_group_used(&block_group->item) || 10631 block_group->ro || 10632 list_is_singular(&block_group->list)) { 10633 /* 10634 * We want to bail if we made new allocations or have 10635 * outstanding allocations in this block group. We do 10636 * the ro check in case balance is currently acting on 10637 * this block group. 10638 */ 10639 spin_unlock(&block_group->lock); 10640 up_write(&space_info->groups_sem); 10641 goto next; 10642 } 10643 spin_unlock(&block_group->lock); 10644 10645 /* We don't want to force the issue, only flip if it's ok. */ 10646 ret = inc_block_group_ro(block_group, 0); 10647 up_write(&space_info->groups_sem); 10648 if (ret < 0) { 10649 ret = 0; 10650 goto next; 10651 } 10652 10653 /* 10654 * Want to do this before we do anything else so we can recover 10655 * properly if we fail to join the transaction. 10656 */ 10657 trans = btrfs_start_trans_remove_block_group(fs_info, 10658 block_group->key.objectid); 10659 if (IS_ERR(trans)) { 10660 btrfs_dec_block_group_ro(block_group); 10661 ret = PTR_ERR(trans); 10662 goto next; 10663 } 10664 10665 /* 10666 * We could have pending pinned extents for this block group, 10667 * just delete them, we don't care about them anymore. 10668 */ 10669 start = block_group->key.objectid; 10670 end = start + block_group->key.offset - 1; 10671 /* 10672 * Hold the unused_bg_unpin_mutex lock to avoid racing with 10673 * btrfs_finish_extent_commit(). 
If we are at transaction N, 10674 * another task might be running finish_extent_commit() for the 10675 * previous transaction N - 1, and have seen a range belonging 10676 * to the block group in freed_extents[] before we were able to 10677 * clear the whole block group range from freed_extents[]. This 10678 * means that task can lookup for the block group after we 10679 * unpinned it from freed_extents[] and removed it, leading to 10680 * a BUG_ON() at btrfs_unpin_extent_range(). 10681 */ 10682 mutex_lock(&fs_info->unused_bg_unpin_mutex); 10683 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 10684 EXTENT_DIRTY); 10685 if (ret) { 10686 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10687 btrfs_dec_block_group_ro(block_group); 10688 goto end_trans; 10689 } 10690 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 10691 EXTENT_DIRTY); 10692 if (ret) { 10693 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10694 btrfs_dec_block_group_ro(block_group); 10695 goto end_trans; 10696 } 10697 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 10698 10699 /* Reset pinned so btrfs_put_block_group doesn't complain */ 10700 spin_lock(&space_info->lock); 10701 spin_lock(&block_group->lock); 10702 10703 space_info->bytes_pinned -= block_group->pinned; 10704 space_info->bytes_readonly += block_group->pinned; 10705 percpu_counter_add(&space_info->total_bytes_pinned, 10706 -block_group->pinned); 10707 block_group->pinned = 0; 10708 10709 spin_unlock(&block_group->lock); 10710 spin_unlock(&space_info->lock); 10711 10712 /* DISCARD can flip during remount */ 10713 trimming = btrfs_test_opt(fs_info, DISCARD); 10714 10715 /* Implicit trim during transaction commit. */ 10716 if (trimming) 10717 btrfs_get_block_group_trimming(block_group); 10718 10719 /* 10720 * Btrfs_remove_chunk will abort the transaction if things go 10721 * horribly wrong. 10722 */ 10723 ret = btrfs_remove_chunk(trans, fs_info, 10724 block_group->key.objectid); 10725 10726 if (ret) { 10727 if (trimming) 10728 btrfs_put_block_group_trimming(block_group); 10729 goto end_trans; 10730 } 10731 10732 /* 10733 * If we're not mounted with -odiscard, we can just forget 10734 * about this block group. Otherwise we'll need to wait 10735 * until transaction commit to do the actual discard. 10736 */ 10737 if (trimming) { 10738 spin_lock(&fs_info->unused_bgs_lock); 10739 /* 10740 * A concurrent scrub might have added us to the list 10741 * fs_info->unused_bgs, so use a list_move operation 10742 * to add the block group to the deleted_bgs list. 
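 * list_move() removes the entry from whatever list it is on before
 * re-adding it, so this is correct whether or not scrub put us back
 * on unused_bgs in the meantime.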
10743 */ 10744 list_move(&block_group->bg_list, 10745 &trans->transaction->deleted_bgs); 10746 spin_unlock(&fs_info->unused_bgs_lock); 10747 btrfs_get_block_group(block_group); 10748 } 10749 end_trans: 10750 btrfs_end_transaction(trans); 10751 next: 10752 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 10753 btrfs_put_block_group(block_group); 10754 spin_lock(&fs_info->unused_bgs_lock); 10755 } 10756 spin_unlock(&fs_info->unused_bgs_lock); 10757 } 10758 10759 int btrfs_init_space_info(struct btrfs_fs_info *fs_info) 10760 { 10761 struct btrfs_space_info *space_info; 10762 struct btrfs_super_block *disk_super; 10763 u64 features; 10764 u64 flags; 10765 int mixed = 0; 10766 int ret; 10767 10768 disk_super = fs_info->super_copy; 10769 if (!btrfs_super_root(disk_super)) 10770 return -EINVAL; 10771 10772 features = btrfs_super_incompat_flags(disk_super); 10773 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 10774 mixed = 1; 10775 10776 flags = BTRFS_BLOCK_GROUP_SYSTEM; 10777 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10778 if (ret) 10779 goto out; 10780 10781 if (mixed) { 10782 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 10783 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10784 } else { 10785 flags = BTRFS_BLOCK_GROUP_METADATA; 10786 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10787 if (ret) 10788 goto out; 10789 10790 flags = BTRFS_BLOCK_GROUP_DATA; 10791 ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); 10792 } 10793 out: 10794 return ret; 10795 } 10796 10797 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 10798 u64 start, u64 end) 10799 { 10800 return unpin_extent_range(fs_info, start, end, false); 10801 } 10802 10803 /* 10804 * It used to be that old block groups would be left around forever. 10805 * Iterating over them would be enough to trim unused space. Since we 10806 * now automatically remove them, we also need to iterate over unallocated 10807 * space. 10808 * 10809 * We don't want a transaction for this since the discard may take a 10810 * substantial amount of time. We don't require that a transaction be 10811 * running, but we do need to take a running transaction into account 10812 * to ensure that we're not discarding chunks that were released in 10813 * the current transaction. 10814 * 10815 * Holding the chunks lock will prevent other threads from allocating 10816 * or releasing chunks, but it won't prevent a running transaction 10817 * from committing and releasing the memory that the pending chunks 10818 * list head uses. For that, we need to take a reference to the 10819 * transaction. 10820 */ 10821 static int btrfs_trim_free_extents(struct btrfs_device *device, 10822 u64 minlen, u64 *trimmed) 10823 { 10824 u64 start = 0, len = 0; 10825 int ret; 10826 10827 *trimmed = 0; 10828 10829 /* Not writeable = nothing to do. */ 10830 if (!device->writeable) 10831 return 0; 10832 10833 /* No free space = nothing to do. 
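 * (device->bytes_used is the space already allocated to chunks, so
 * total_bytes <= bytes_used means there is no unallocated hole left
 * to discard.)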
*/ 10834 if (device->total_bytes <= device->bytes_used) 10835 return 0; 10836 10837 ret = 0; 10838 10839 while (1) { 10840 struct btrfs_fs_info *fs_info = device->fs_info; 10841 struct btrfs_transaction *trans; 10842 u64 bytes; 10843 10844 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 10845 if (ret) 10846 return ret; 10847 10848 down_read(&fs_info->commit_root_sem); 10849 10850 spin_lock(&fs_info->trans_lock); 10851 trans = fs_info->running_transaction; 10852 if (trans) 10853 atomic_inc(&trans->use_count); 10854 spin_unlock(&fs_info->trans_lock); 10855 10856 ret = find_free_dev_extent_start(trans, device, minlen, start, 10857 &start, &len); 10858 if (trans) 10859 btrfs_put_transaction(trans); 10860 10861 if (ret) { 10862 up_read(&fs_info->commit_root_sem); 10863 mutex_unlock(&fs_info->chunk_mutex); 10864 if (ret == -ENOSPC) 10865 ret = 0; 10866 break; 10867 } 10868 10869 ret = btrfs_issue_discard(device->bdev, start, len, &bytes); 10870 up_read(&fs_info->commit_root_sem); 10871 mutex_unlock(&fs_info->chunk_mutex); 10872 10873 if (ret) 10874 break; 10875 10876 start += len; 10877 *trimmed += bytes; 10878 10879 if (fatal_signal_pending(current)) { 10880 ret = -ERESTARTSYS; 10881 break; 10882 } 10883 10884 cond_resched(); 10885 } 10886 10887 return ret; 10888 } 10889 10890 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 10891 { 10892 struct btrfs_block_group_cache *cache = NULL; 10893 struct btrfs_device *device; 10894 struct list_head *devices; 10895 u64 group_trimmed; 10896 u64 start; 10897 u64 end; 10898 u64 trimmed = 0; 10899 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 10900 int ret = 0; 10901 10902 /* 10903 * try to trim all FS space, our block group may start from non-zero. 10904 */ 10905 if (range->len == total_bytes) 10906 cache = btrfs_lookup_first_block_group(fs_info, range->start); 10907 else 10908 cache = btrfs_lookup_block_group(fs_info, range->start); 10909 10910 while (cache) { 10911 if (cache->key.objectid >= (range->start + range->len)) { 10912 btrfs_put_block_group(cache); 10913 break; 10914 } 10915 10916 start = max(range->start, cache->key.objectid); 10917 end = min(range->start + range->len, 10918 cache->key.objectid + cache->key.offset); 10919 10920 if (end - start >= range->minlen) { 10921 if (!block_group_cache_done(cache)) { 10922 ret = cache_block_group(cache, 0); 10923 if (ret) { 10924 btrfs_put_block_group(cache); 10925 break; 10926 } 10927 ret = wait_block_group_cache_done(cache); 10928 if (ret) { 10929 btrfs_put_block_group(cache); 10930 break; 10931 } 10932 } 10933 ret = btrfs_trim_block_group(cache, 10934 &group_trimmed, 10935 start, 10936 end, 10937 range->minlen); 10938 10939 trimmed += group_trimmed; 10940 if (ret) { 10941 btrfs_put_block_group(cache); 10942 break; 10943 } 10944 } 10945 10946 cache = next_block_group(fs_info, cache); 10947 } 10948 10949 mutex_lock(&fs_info->fs_devices->device_list_mutex); 10950 devices = &fs_info->fs_devices->alloc_list; 10951 list_for_each_entry(device, devices, dev_alloc_list) { 10952 ret = btrfs_trim_free_extents(device, range->minlen, 10953 &group_trimmed); 10954 if (ret) 10955 break; 10956 10957 trimmed += group_trimmed; 10958 } 10959 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 10960 10961 range->len = trimmed; 10962 return ret; 10963 } 10964 10965 /* 10966 * btrfs_{start,end}_write_no_snapshoting() are similar to 10967 * mnt_{want,drop}_write(), they are used to prevent some tasks from writing 10968 * data into the page cache through nocow before the 
subvolume is snapshoted, 10969 * but flush the data into disk after the snapshot creation, or to prevent 10970 * operations while snapshoting is ongoing and that cause the snapshot to be 10971 * inconsistent (writes followed by expanding truncates for example). 10972 */ 10973 void btrfs_end_write_no_snapshoting(struct btrfs_root *root) 10974 { 10975 percpu_counter_dec(&root->subv_writers->counter); 10976 /* 10977 * Make sure counter is updated before we wake up waiters. 10978 */ 10979 smp_mb(); 10980 if (waitqueue_active(&root->subv_writers->wait)) 10981 wake_up(&root->subv_writers->wait); 10982 } 10983 10984 int btrfs_start_write_no_snapshoting(struct btrfs_root *root) 10985 { 10986 if (atomic_read(&root->will_be_snapshoted)) 10987 return 0; 10988 10989 percpu_counter_inc(&root->subv_writers->counter); 10990 /* 10991 * Make sure counter is updated before we check for snapshot creation. 10992 */ 10993 smp_mb(); 10994 if (atomic_read(&root->will_be_snapshoted)) { 10995 btrfs_end_write_no_snapshoting(root); 10996 return 0; 10997 } 10998 return 1; 10999 } 11000 11001 static int wait_snapshoting_atomic_t(atomic_t *a) 11002 { 11003 schedule(); 11004 return 0; 11005 } 11006 11007 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) 11008 { 11009 while (true) { 11010 int ret; 11011 11012 ret = btrfs_start_write_no_snapshoting(root); 11013 if (ret) 11014 break; 11015 wait_on_atomic_t(&root->will_be_snapshoted, 11016 wait_snapshoting_atomic_t, 11017 TASK_UNINTERRUPTIBLE); 11018 } 11019 } 11020
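
/*
 * Illustrative usage sketch (an assumption for documentation purposes, not a
 * caller defined in this file): a nocow writer is expected to bracket its
 * work with the pair above and fall back to the COW path when a snapshot is
 * pending:
 *
 *	if (!btrfs_start_write_no_snapshoting(root))
 *		return 0;		(snapshot pending, write via COW)
 *	... do the nocow write ...
 *	btrfs_end_write_no_snapshoting(root);
 */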