1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/signal.h> 8 #include <linux/pagemap.h> 9 #include <linux/writeback.h> 10 #include <linux/blkdev.h> 11 #include <linux/sort.h> 12 #include <linux/rcupdate.h> 13 #include <linux/kthread.h> 14 #include <linux/slab.h> 15 #include <linux/ratelimit.h> 16 #include <linux/percpu_counter.h> 17 #include <linux/lockdep.h> 18 #include <linux/crc32c.h> 19 #include "ctree.h" 20 #include "extent-tree.h" 21 #include "tree-log.h" 22 #include "disk-io.h" 23 #include "print-tree.h" 24 #include "volumes.h" 25 #include "raid56.h" 26 #include "locking.h" 27 #include "free-space-cache.h" 28 #include "free-space-tree.h" 29 #include "sysfs.h" 30 #include "qgroup.h" 31 #include "ref-verify.h" 32 #include "space-info.h" 33 #include "block-rsv.h" 34 #include "delalloc-space.h" 35 #include "discard.h" 36 #include "rcu-string.h" 37 #include "zoned.h" 38 #include "dev-replace.h" 39 #include "fs.h" 40 #include "accessors.h" 41 #include "root-tree.h" 42 #include "file-item.h" 43 #include "orphan.h" 44 #include "tree-checker.h" 45 46 #undef SCRAMBLE_DELAYED_REFS 47 48 49 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 50 struct btrfs_delayed_ref_node *node, u64 parent, 51 u64 root_objectid, u64 owner_objectid, 52 u64 owner_offset, int refs_to_drop, 53 struct btrfs_delayed_extent_op *extra_op); 54 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 55 struct extent_buffer *leaf, 56 struct btrfs_extent_item *ei); 57 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 58 u64 parent, u64 root_objectid, 59 u64 flags, u64 owner, u64 offset, 60 struct btrfs_key *ins, int ref_mod); 61 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 62 struct btrfs_delayed_ref_node *node, 63 struct btrfs_delayed_extent_op *extent_op); 64 static int find_next_key(struct btrfs_path *path, int level, 65 struct btrfs_key *key); 66 67 static int block_group_bits(struct btrfs_block_group *cache, u64 bits) 68 { 69 return (cache->flags & bits) == bits; 70 } 71 72 int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, 73 u64 start, u64 num_bytes) 74 { 75 u64 end = start + num_bytes - 1; 76 set_extent_bit(&fs_info->excluded_extents, start, end, 77 EXTENT_UPTODATE, NULL); 78 return 0; 79 } 80 81 void btrfs_free_excluded_extents(struct btrfs_block_group *cache) 82 { 83 struct btrfs_fs_info *fs_info = cache->fs_info; 84 u64 start, end; 85 86 start = cache->start; 87 end = start + cache->length - 1; 88 89 clear_extent_bits(&fs_info->excluded_extents, start, end, 90 EXTENT_UPTODATE); 91 } 92 93 /* simple helper to search for an existing data extent at a given offset */ 94 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) 95 { 96 struct btrfs_root *root = btrfs_extent_root(fs_info, start); 97 int ret; 98 struct btrfs_key key; 99 struct btrfs_path *path; 100 101 path = btrfs_alloc_path(); 102 if (!path) 103 return -ENOMEM; 104 105 key.objectid = start; 106 key.offset = len; 107 key.type = BTRFS_EXTENT_ITEM_KEY; 108 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 109 btrfs_free_path(path); 110 return ret; 111 } 112 113 /* 114 * helper function to lookup reference count and flags of a tree block. 115 * 116 * the head node for delayed ref is used to store the sum of all the 117 * reference count modifications queued up in the rbtree. the head 118 * node may also store the extent flags to set. This way you can check 119 * to see what the reference count and extent flags would be if all of 120 * the delayed refs are not processed. 121 */ 122 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, 123 struct btrfs_fs_info *fs_info, u64 bytenr, 124 u64 offset, int metadata, u64 *refs, u64 *flags) 125 { 126 struct btrfs_root *extent_root; 127 struct btrfs_delayed_ref_head *head; 128 struct btrfs_delayed_ref_root *delayed_refs; 129 struct btrfs_path *path; 130 struct btrfs_extent_item *ei; 131 struct extent_buffer *leaf; 132 struct btrfs_key key; 133 u32 item_size; 134 u64 num_refs; 135 u64 extent_flags; 136 int ret; 137 138 /* 139 * If we don't have skinny metadata, don't bother doing anything 140 * different 141 */ 142 if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { 143 offset = fs_info->nodesize; 144 metadata = 0; 145 } 146 147 path = btrfs_alloc_path(); 148 if (!path) 149 return -ENOMEM; 150 151 if (!trans) { 152 path->skip_locking = 1; 153 path->search_commit_root = 1; 154 } 155 156 search_again: 157 key.objectid = bytenr; 158 key.offset = offset; 159 if (metadata) 160 key.type = BTRFS_METADATA_ITEM_KEY; 161 else 162 key.type = BTRFS_EXTENT_ITEM_KEY; 163 164 extent_root = btrfs_extent_root(fs_info, bytenr); 165 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 166 if (ret < 0) 167 goto out_free; 168 169 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 170 if (path->slots[0]) { 171 path->slots[0]--; 172 btrfs_item_key_to_cpu(path->nodes[0], &key, 173 path->slots[0]); 174 if (key.objectid == bytenr && 175 key.type == BTRFS_EXTENT_ITEM_KEY && 176 key.offset == fs_info->nodesize) 177 ret = 0; 178 } 179 } 180 181 if (ret == 0) { 182 leaf = path->nodes[0]; 183 item_size = btrfs_item_size(leaf, path->slots[0]); 184 if (item_size >= sizeof(*ei)) { 185 ei = btrfs_item_ptr(leaf, path->slots[0], 186 struct btrfs_extent_item); 187 num_refs = btrfs_extent_refs(leaf, ei); 188 extent_flags = btrfs_extent_flags(leaf, ei); 189 } else { 190 ret = -EINVAL; 191 btrfs_print_v0_err(fs_info); 192 if (trans) 193 btrfs_abort_transaction(trans, ret); 194 else 195 btrfs_handle_fs_error(fs_info, ret, NULL); 196 197 goto out_free; 198 } 199 200 BUG_ON(num_refs == 0); 201 } else { 202 num_refs = 0; 203 extent_flags = 0; 204 ret = 0; 205 } 206 207 if (!trans) 208 goto out; 209 210 delayed_refs = &trans->transaction->delayed_refs; 211 spin_lock(&delayed_refs->lock); 212 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 213 if (head) { 214 if (!mutex_trylock(&head->mutex)) { 215 refcount_inc(&head->refs); 216 spin_unlock(&delayed_refs->lock); 217 218 btrfs_release_path(path); 219 220 /* 221 * Mutex was contended, block until it's released and try 222 * again 223 */ 224 mutex_lock(&head->mutex); 225 mutex_unlock(&head->mutex); 226 btrfs_put_delayed_ref_head(head); 227 goto search_again; 228 } 229 spin_lock(&head->lock); 230 if (head->extent_op && head->extent_op->update_flags) 231 extent_flags |= head->extent_op->flags_to_set; 232 else 233 BUG_ON(num_refs == 0); 234 235 num_refs += head->ref_mod; 236 spin_unlock(&head->lock); 237 mutex_unlock(&head->mutex); 238 } 239 spin_unlock(&delayed_refs->lock); 240 out: 241 WARN_ON(num_refs == 0); 242 if (refs) 243 *refs = num_refs; 244 if (flags) 245 *flags = extent_flags; 246 out_free: 247 btrfs_free_path(path); 248 return ret; 249 } 250 251 /* 252 * Back reference rules. Back refs have three main goals: 253 * 254 * 1) differentiate between all holders of references to an extent so that 255 * when a reference is dropped we can make sure it was a valid reference 256 * before freeing the extent. 257 * 258 * 2) Provide enough information to quickly find the holders of an extent 259 * if we notice a given block is corrupted or bad. 260 * 261 * 3) Make it easy to migrate blocks for FS shrinking or storage pool 262 * maintenance. This is actually the same as #2, but with a slightly 263 * different use case. 264 * 265 * There are two kinds of back refs. The implicit back refs is optimized 266 * for pointers in non-shared tree blocks. For a given pointer in a block, 267 * back refs of this kind provide information about the block's owner tree 268 * and the pointer's key. These information allow us to find the block by 269 * b-tree searching. The full back refs is for pointers in tree blocks not 270 * referenced by their owner trees. The location of tree block is recorded 271 * in the back refs. Actually the full back refs is generic, and can be 272 * used in all cases the implicit back refs is used. The major shortcoming 273 * of the full back refs is its overhead. Every time a tree block gets 274 * COWed, we have to update back refs entry for all pointers in it. 275 * 276 * For a newly allocated tree block, we use implicit back refs for 277 * pointers in it. This means most tree related operations only involve 278 * implicit back refs. For a tree block created in old transaction, the 279 * only way to drop a reference to it is COW it. So we can detect the 280 * event that tree block loses its owner tree's reference and do the 281 * back refs conversion. 282 * 283 * When a tree block is COWed through a tree, there are four cases: 284 * 285 * The reference count of the block is one and the tree is the block's 286 * owner tree. Nothing to do in this case. 287 * 288 * The reference count of the block is one and the tree is not the 289 * block's owner tree. In this case, full back refs is used for pointers 290 * in the block. Remove these full back refs, add implicit back refs for 291 * every pointers in the new block. 292 * 293 * The reference count of the block is greater than one and the tree is 294 * the block's owner tree. In this case, implicit back refs is used for 295 * pointers in the block. Add full back refs for every pointers in the 296 * block, increase lower level extents' reference counts. The original 297 * implicit back refs are entailed to the new block. 298 * 299 * The reference count of the block is greater than one and the tree is 300 * not the block's owner tree. Add implicit back refs for every pointer in 301 * the new block, increase lower level extents' reference count. 302 * 303 * Back Reference Key composing: 304 * 305 * The key objectid corresponds to the first byte in the extent, 306 * The key type is used to differentiate between types of back refs. 307 * There are different meanings of the key offset for different types 308 * of back refs. 309 * 310 * File extents can be referenced by: 311 * 312 * - multiple snapshots, subvolumes, or different generations in one subvol 313 * - different files inside a single subvolume 314 * - different offsets inside a file (bookend extents in file.c) 315 * 316 * The extent ref structure for the implicit back refs has fields for: 317 * 318 * - Objectid of the subvolume root 319 * - objectid of the file holding the reference 320 * - original offset in the file 321 * - how many bookend extents 322 * 323 * The key offset for the implicit back refs is hash of the first 324 * three fields. 325 * 326 * The extent ref structure for the full back refs has field for: 327 * 328 * - number of pointers in the tree leaf 329 * 330 * The key offset for the implicit back refs is the first byte of 331 * the tree leaf 332 * 333 * When a file extent is allocated, The implicit back refs is used. 334 * the fields are filled in: 335 * 336 * (root_key.objectid, inode objectid, offset in file, 1) 337 * 338 * When a file extent is removed file truncation, we find the 339 * corresponding implicit back refs and check the following fields: 340 * 341 * (btrfs_header_owner(leaf), inode objectid, offset in file) 342 * 343 * Btree extents can be referenced by: 344 * 345 * - Different subvolumes 346 * 347 * Both the implicit back refs and the full back refs for tree blocks 348 * only consist of key. The key offset for the implicit back refs is 349 * objectid of block's owner tree. The key offset for the full back refs 350 * is the first byte of parent block. 351 * 352 * When implicit back refs is used, information about the lowest key and 353 * level of the tree block are required. These information are stored in 354 * tree block info structure. 355 */ 356 357 /* 358 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, 359 * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, 360 * is_data == BTRFS_REF_TYPE_ANY, either type is OK. 361 */ 362 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, 363 struct btrfs_extent_inline_ref *iref, 364 enum btrfs_inline_ref_type is_data) 365 { 366 int type = btrfs_extent_inline_ref_type(eb, iref); 367 u64 offset = btrfs_extent_inline_ref_offset(eb, iref); 368 369 if (type == BTRFS_TREE_BLOCK_REF_KEY || 370 type == BTRFS_SHARED_BLOCK_REF_KEY || 371 type == BTRFS_SHARED_DATA_REF_KEY || 372 type == BTRFS_EXTENT_DATA_REF_KEY) { 373 if (is_data == BTRFS_REF_TYPE_BLOCK) { 374 if (type == BTRFS_TREE_BLOCK_REF_KEY) 375 return type; 376 if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 377 ASSERT(eb->fs_info); 378 /* 379 * Every shared one has parent tree block, 380 * which must be aligned to sector size. 381 */ 382 if (offset && 383 IS_ALIGNED(offset, eb->fs_info->sectorsize)) 384 return type; 385 } 386 } else if (is_data == BTRFS_REF_TYPE_DATA) { 387 if (type == BTRFS_EXTENT_DATA_REF_KEY) 388 return type; 389 if (type == BTRFS_SHARED_DATA_REF_KEY) { 390 ASSERT(eb->fs_info); 391 /* 392 * Every shared one has parent tree block, 393 * which must be aligned to sector size. 394 */ 395 if (offset && 396 IS_ALIGNED(offset, eb->fs_info->sectorsize)) 397 return type; 398 } 399 } else { 400 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 401 return type; 402 } 403 } 404 405 btrfs_print_leaf(eb); 406 btrfs_err(eb->fs_info, 407 "eb %llu iref 0x%lx invalid extent inline ref type %d", 408 eb->start, (unsigned long)iref, type); 409 WARN_ON(1); 410 411 return BTRFS_REF_TYPE_INVALID; 412 } 413 414 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 415 { 416 u32 high_crc = ~(u32)0; 417 u32 low_crc = ~(u32)0; 418 __le64 lenum; 419 420 lenum = cpu_to_le64(root_objectid); 421 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 422 lenum = cpu_to_le64(owner); 423 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 424 lenum = cpu_to_le64(offset); 425 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 426 427 return ((u64)high_crc << 31) ^ (u64)low_crc; 428 } 429 430 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 431 struct btrfs_extent_data_ref *ref) 432 { 433 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 434 btrfs_extent_data_ref_objectid(leaf, ref), 435 btrfs_extent_data_ref_offset(leaf, ref)); 436 } 437 438 static int match_extent_data_ref(struct extent_buffer *leaf, 439 struct btrfs_extent_data_ref *ref, 440 u64 root_objectid, u64 owner, u64 offset) 441 { 442 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 443 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 444 btrfs_extent_data_ref_offset(leaf, ref) != offset) 445 return 0; 446 return 1; 447 } 448 449 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 450 struct btrfs_path *path, 451 u64 bytenr, u64 parent, 452 u64 root_objectid, 453 u64 owner, u64 offset) 454 { 455 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 456 struct btrfs_key key; 457 struct btrfs_extent_data_ref *ref; 458 struct extent_buffer *leaf; 459 u32 nritems; 460 int ret; 461 int recow; 462 int err = -ENOENT; 463 464 key.objectid = bytenr; 465 if (parent) { 466 key.type = BTRFS_SHARED_DATA_REF_KEY; 467 key.offset = parent; 468 } else { 469 key.type = BTRFS_EXTENT_DATA_REF_KEY; 470 key.offset = hash_extent_data_ref(root_objectid, 471 owner, offset); 472 } 473 again: 474 recow = 0; 475 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 476 if (ret < 0) { 477 err = ret; 478 goto fail; 479 } 480 481 if (parent) { 482 if (!ret) 483 return 0; 484 goto fail; 485 } 486 487 leaf = path->nodes[0]; 488 nritems = btrfs_header_nritems(leaf); 489 while (1) { 490 if (path->slots[0] >= nritems) { 491 ret = btrfs_next_leaf(root, path); 492 if (ret < 0) 493 err = ret; 494 if (ret) 495 goto fail; 496 497 leaf = path->nodes[0]; 498 nritems = btrfs_header_nritems(leaf); 499 recow = 1; 500 } 501 502 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 503 if (key.objectid != bytenr || 504 key.type != BTRFS_EXTENT_DATA_REF_KEY) 505 goto fail; 506 507 ref = btrfs_item_ptr(leaf, path->slots[0], 508 struct btrfs_extent_data_ref); 509 510 if (match_extent_data_ref(leaf, ref, root_objectid, 511 owner, offset)) { 512 if (recow) { 513 btrfs_release_path(path); 514 goto again; 515 } 516 err = 0; 517 break; 518 } 519 path->slots[0]++; 520 } 521 fail: 522 return err; 523 } 524 525 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 526 struct btrfs_path *path, 527 u64 bytenr, u64 parent, 528 u64 root_objectid, u64 owner, 529 u64 offset, int refs_to_add) 530 { 531 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 532 struct btrfs_key key; 533 struct extent_buffer *leaf; 534 u32 size; 535 u32 num_refs; 536 int ret; 537 538 key.objectid = bytenr; 539 if (parent) { 540 key.type = BTRFS_SHARED_DATA_REF_KEY; 541 key.offset = parent; 542 size = sizeof(struct btrfs_shared_data_ref); 543 } else { 544 key.type = BTRFS_EXTENT_DATA_REF_KEY; 545 key.offset = hash_extent_data_ref(root_objectid, 546 owner, offset); 547 size = sizeof(struct btrfs_extent_data_ref); 548 } 549 550 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 551 if (ret && ret != -EEXIST) 552 goto fail; 553 554 leaf = path->nodes[0]; 555 if (parent) { 556 struct btrfs_shared_data_ref *ref; 557 ref = btrfs_item_ptr(leaf, path->slots[0], 558 struct btrfs_shared_data_ref); 559 if (ret == 0) { 560 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 561 } else { 562 num_refs = btrfs_shared_data_ref_count(leaf, ref); 563 num_refs += refs_to_add; 564 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 565 } 566 } else { 567 struct btrfs_extent_data_ref *ref; 568 while (ret == -EEXIST) { 569 ref = btrfs_item_ptr(leaf, path->slots[0], 570 struct btrfs_extent_data_ref); 571 if (match_extent_data_ref(leaf, ref, root_objectid, 572 owner, offset)) 573 break; 574 btrfs_release_path(path); 575 key.offset++; 576 ret = btrfs_insert_empty_item(trans, root, path, &key, 577 size); 578 if (ret && ret != -EEXIST) 579 goto fail; 580 581 leaf = path->nodes[0]; 582 } 583 ref = btrfs_item_ptr(leaf, path->slots[0], 584 struct btrfs_extent_data_ref); 585 if (ret == 0) { 586 btrfs_set_extent_data_ref_root(leaf, ref, 587 root_objectid); 588 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 589 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 590 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 591 } else { 592 num_refs = btrfs_extent_data_ref_count(leaf, ref); 593 num_refs += refs_to_add; 594 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 595 } 596 } 597 btrfs_mark_buffer_dirty(leaf); 598 ret = 0; 599 fail: 600 btrfs_release_path(path); 601 return ret; 602 } 603 604 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 605 struct btrfs_root *root, 606 struct btrfs_path *path, 607 int refs_to_drop) 608 { 609 struct btrfs_key key; 610 struct btrfs_extent_data_ref *ref1 = NULL; 611 struct btrfs_shared_data_ref *ref2 = NULL; 612 struct extent_buffer *leaf; 613 u32 num_refs = 0; 614 int ret = 0; 615 616 leaf = path->nodes[0]; 617 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 618 619 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 620 ref1 = btrfs_item_ptr(leaf, path->slots[0], 621 struct btrfs_extent_data_ref); 622 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 623 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 624 ref2 = btrfs_item_ptr(leaf, path->slots[0], 625 struct btrfs_shared_data_ref); 626 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 627 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 628 btrfs_print_v0_err(trans->fs_info); 629 btrfs_abort_transaction(trans, -EINVAL); 630 return -EINVAL; 631 } else { 632 BUG(); 633 } 634 635 BUG_ON(num_refs < refs_to_drop); 636 num_refs -= refs_to_drop; 637 638 if (num_refs == 0) { 639 ret = btrfs_del_item(trans, root, path); 640 } else { 641 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 642 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 643 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 644 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 645 btrfs_mark_buffer_dirty(leaf); 646 } 647 return ret; 648 } 649 650 static noinline u32 extent_data_ref_count(struct btrfs_path *path, 651 struct btrfs_extent_inline_ref *iref) 652 { 653 struct btrfs_key key; 654 struct extent_buffer *leaf; 655 struct btrfs_extent_data_ref *ref1; 656 struct btrfs_shared_data_ref *ref2; 657 u32 num_refs = 0; 658 int type; 659 660 leaf = path->nodes[0]; 661 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 662 663 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 664 if (iref) { 665 /* 666 * If type is invalid, we should have bailed out earlier than 667 * this call. 668 */ 669 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 670 ASSERT(type != BTRFS_REF_TYPE_INVALID); 671 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 672 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 673 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 674 } else { 675 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 676 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 677 } 678 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 679 ref1 = btrfs_item_ptr(leaf, path->slots[0], 680 struct btrfs_extent_data_ref); 681 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 682 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 683 ref2 = btrfs_item_ptr(leaf, path->slots[0], 684 struct btrfs_shared_data_ref); 685 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 686 } else { 687 WARN_ON(1); 688 } 689 return num_refs; 690 } 691 692 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 693 struct btrfs_path *path, 694 u64 bytenr, u64 parent, 695 u64 root_objectid) 696 { 697 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 698 struct btrfs_key key; 699 int ret; 700 701 key.objectid = bytenr; 702 if (parent) { 703 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 704 key.offset = parent; 705 } else { 706 key.type = BTRFS_TREE_BLOCK_REF_KEY; 707 key.offset = root_objectid; 708 } 709 710 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 711 if (ret > 0) 712 ret = -ENOENT; 713 return ret; 714 } 715 716 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 717 struct btrfs_path *path, 718 u64 bytenr, u64 parent, 719 u64 root_objectid) 720 { 721 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 722 struct btrfs_key key; 723 int ret; 724 725 key.objectid = bytenr; 726 if (parent) { 727 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 728 key.offset = parent; 729 } else { 730 key.type = BTRFS_TREE_BLOCK_REF_KEY; 731 key.offset = root_objectid; 732 } 733 734 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 735 btrfs_release_path(path); 736 return ret; 737 } 738 739 static inline int extent_ref_type(u64 parent, u64 owner) 740 { 741 int type; 742 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 743 if (parent > 0) 744 type = BTRFS_SHARED_BLOCK_REF_KEY; 745 else 746 type = BTRFS_TREE_BLOCK_REF_KEY; 747 } else { 748 if (parent > 0) 749 type = BTRFS_SHARED_DATA_REF_KEY; 750 else 751 type = BTRFS_EXTENT_DATA_REF_KEY; 752 } 753 return type; 754 } 755 756 static int find_next_key(struct btrfs_path *path, int level, 757 struct btrfs_key *key) 758 759 { 760 for (; level < BTRFS_MAX_LEVEL; level++) { 761 if (!path->nodes[level]) 762 break; 763 if (path->slots[level] + 1 >= 764 btrfs_header_nritems(path->nodes[level])) 765 continue; 766 if (level == 0) 767 btrfs_item_key_to_cpu(path->nodes[level], key, 768 path->slots[level] + 1); 769 else 770 btrfs_node_key_to_cpu(path->nodes[level], key, 771 path->slots[level] + 1); 772 return 0; 773 } 774 return 1; 775 } 776 777 /* 778 * look for inline back ref. if back ref is found, *ref_ret is set 779 * to the address of inline back ref, and 0 is returned. 780 * 781 * if back ref isn't found, *ref_ret is set to the address where it 782 * should be inserted, and -ENOENT is returned. 783 * 784 * if insert is true and there are too many inline back refs, the path 785 * points to the extent item, and -EAGAIN is returned. 786 * 787 * NOTE: inline back refs are ordered in the same way that back ref 788 * items in the tree are ordered. 789 */ 790 static noinline_for_stack 791 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 792 struct btrfs_path *path, 793 struct btrfs_extent_inline_ref **ref_ret, 794 u64 bytenr, u64 num_bytes, 795 u64 parent, u64 root_objectid, 796 u64 owner, u64 offset, int insert) 797 { 798 struct btrfs_fs_info *fs_info = trans->fs_info; 799 struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); 800 struct btrfs_key key; 801 struct extent_buffer *leaf; 802 struct btrfs_extent_item *ei; 803 struct btrfs_extent_inline_ref *iref; 804 u64 flags; 805 u64 item_size; 806 unsigned long ptr; 807 unsigned long end; 808 int extra_size; 809 int type; 810 int want; 811 int ret; 812 int err = 0; 813 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 814 int needed; 815 816 key.objectid = bytenr; 817 key.type = BTRFS_EXTENT_ITEM_KEY; 818 key.offset = num_bytes; 819 820 want = extent_ref_type(parent, owner); 821 if (insert) { 822 extra_size = btrfs_extent_inline_ref_size(want); 823 path->search_for_extension = 1; 824 path->keep_locks = 1; 825 } else 826 extra_size = -1; 827 828 /* 829 * Owner is our level, so we can just add one to get the level for the 830 * block we are interested in. 831 */ 832 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 833 key.type = BTRFS_METADATA_ITEM_KEY; 834 key.offset = owner; 835 } 836 837 again: 838 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 839 if (ret < 0) { 840 err = ret; 841 goto out; 842 } 843 844 /* 845 * We may be a newly converted file system which still has the old fat 846 * extent entries for metadata, so try and see if we have one of those. 847 */ 848 if (ret > 0 && skinny_metadata) { 849 skinny_metadata = false; 850 if (path->slots[0]) { 851 path->slots[0]--; 852 btrfs_item_key_to_cpu(path->nodes[0], &key, 853 path->slots[0]); 854 if (key.objectid == bytenr && 855 key.type == BTRFS_EXTENT_ITEM_KEY && 856 key.offset == num_bytes) 857 ret = 0; 858 } 859 if (ret) { 860 key.objectid = bytenr; 861 key.type = BTRFS_EXTENT_ITEM_KEY; 862 key.offset = num_bytes; 863 btrfs_release_path(path); 864 goto again; 865 } 866 } 867 868 if (ret && !insert) { 869 err = -ENOENT; 870 goto out; 871 } else if (WARN_ON(ret)) { 872 err = -EIO; 873 goto out; 874 } 875 876 leaf = path->nodes[0]; 877 item_size = btrfs_item_size(leaf, path->slots[0]); 878 if (unlikely(item_size < sizeof(*ei))) { 879 err = -EINVAL; 880 btrfs_print_v0_err(fs_info); 881 btrfs_abort_transaction(trans, err); 882 goto out; 883 } 884 885 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 886 flags = btrfs_extent_flags(leaf, ei); 887 888 ptr = (unsigned long)(ei + 1); 889 end = (unsigned long)ei + item_size; 890 891 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 892 ptr += sizeof(struct btrfs_tree_block_info); 893 BUG_ON(ptr > end); 894 } 895 896 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 897 needed = BTRFS_REF_TYPE_DATA; 898 else 899 needed = BTRFS_REF_TYPE_BLOCK; 900 901 err = -ENOENT; 902 while (1) { 903 if (ptr >= end) { 904 if (ptr > end) { 905 err = -EUCLEAN; 906 btrfs_print_leaf(path->nodes[0]); 907 btrfs_crit(fs_info, 908 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", 909 path->slots[0], root_objectid, owner, offset, parent); 910 } 911 break; 912 } 913 iref = (struct btrfs_extent_inline_ref *)ptr; 914 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 915 if (type == BTRFS_REF_TYPE_INVALID) { 916 err = -EUCLEAN; 917 goto out; 918 } 919 920 if (want < type) 921 break; 922 if (want > type) { 923 ptr += btrfs_extent_inline_ref_size(type); 924 continue; 925 } 926 927 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 928 struct btrfs_extent_data_ref *dref; 929 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 930 if (match_extent_data_ref(leaf, dref, root_objectid, 931 owner, offset)) { 932 err = 0; 933 break; 934 } 935 if (hash_extent_data_ref_item(leaf, dref) < 936 hash_extent_data_ref(root_objectid, owner, offset)) 937 break; 938 } else { 939 u64 ref_offset; 940 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 941 if (parent > 0) { 942 if (parent == ref_offset) { 943 err = 0; 944 break; 945 } 946 if (ref_offset < parent) 947 break; 948 } else { 949 if (root_objectid == ref_offset) { 950 err = 0; 951 break; 952 } 953 if (ref_offset < root_objectid) 954 break; 955 } 956 } 957 ptr += btrfs_extent_inline_ref_size(type); 958 } 959 if (err == -ENOENT && insert) { 960 if (item_size + extra_size >= 961 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 962 err = -EAGAIN; 963 goto out; 964 } 965 /* 966 * To add new inline back ref, we have to make sure 967 * there is no corresponding back ref item. 968 * For simplicity, we just do not add new inline back 969 * ref if there is any kind of item for this block 970 */ 971 if (find_next_key(path, 0, &key) == 0 && 972 key.objectid == bytenr && 973 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 974 err = -EAGAIN; 975 goto out; 976 } 977 } 978 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 979 out: 980 if (insert) { 981 path->keep_locks = 0; 982 path->search_for_extension = 0; 983 btrfs_unlock_up_safe(path, 1); 984 } 985 return err; 986 } 987 988 /* 989 * helper to add new inline back ref 990 */ 991 static noinline_for_stack 992 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 993 struct btrfs_path *path, 994 struct btrfs_extent_inline_ref *iref, 995 u64 parent, u64 root_objectid, 996 u64 owner, u64 offset, int refs_to_add, 997 struct btrfs_delayed_extent_op *extent_op) 998 { 999 struct extent_buffer *leaf; 1000 struct btrfs_extent_item *ei; 1001 unsigned long ptr; 1002 unsigned long end; 1003 unsigned long item_offset; 1004 u64 refs; 1005 int size; 1006 int type; 1007 1008 leaf = path->nodes[0]; 1009 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1010 item_offset = (unsigned long)iref - (unsigned long)ei; 1011 1012 type = extent_ref_type(parent, owner); 1013 size = btrfs_extent_inline_ref_size(type); 1014 1015 btrfs_extend_item(path, size); 1016 1017 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1018 refs = btrfs_extent_refs(leaf, ei); 1019 refs += refs_to_add; 1020 btrfs_set_extent_refs(leaf, ei, refs); 1021 if (extent_op) 1022 __run_delayed_extent_op(extent_op, leaf, ei); 1023 1024 ptr = (unsigned long)ei + item_offset; 1025 end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); 1026 if (ptr < end - size) 1027 memmove_extent_buffer(leaf, ptr + size, ptr, 1028 end - size - ptr); 1029 1030 iref = (struct btrfs_extent_inline_ref *)ptr; 1031 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1032 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1033 struct btrfs_extent_data_ref *dref; 1034 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1035 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1036 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1037 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1038 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1039 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1040 struct btrfs_shared_data_ref *sref; 1041 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1042 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1043 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1044 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1045 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1046 } else { 1047 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1048 } 1049 btrfs_mark_buffer_dirty(leaf); 1050 } 1051 1052 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1053 struct btrfs_path *path, 1054 struct btrfs_extent_inline_ref **ref_ret, 1055 u64 bytenr, u64 num_bytes, u64 parent, 1056 u64 root_objectid, u64 owner, u64 offset) 1057 { 1058 int ret; 1059 1060 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1061 num_bytes, parent, root_objectid, 1062 owner, offset, 0); 1063 if (ret != -ENOENT) 1064 return ret; 1065 1066 btrfs_release_path(path); 1067 *ref_ret = NULL; 1068 1069 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1070 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1071 root_objectid); 1072 } else { 1073 ret = lookup_extent_data_ref(trans, path, bytenr, parent, 1074 root_objectid, owner, offset); 1075 } 1076 return ret; 1077 } 1078 1079 /* 1080 * helper to update/remove inline back ref 1081 */ 1082 static noinline_for_stack 1083 void update_inline_extent_backref(struct btrfs_path *path, 1084 struct btrfs_extent_inline_ref *iref, 1085 int refs_to_mod, 1086 struct btrfs_delayed_extent_op *extent_op) 1087 { 1088 struct extent_buffer *leaf = path->nodes[0]; 1089 struct btrfs_extent_item *ei; 1090 struct btrfs_extent_data_ref *dref = NULL; 1091 struct btrfs_shared_data_ref *sref = NULL; 1092 unsigned long ptr; 1093 unsigned long end; 1094 u32 item_size; 1095 int size; 1096 int type; 1097 u64 refs; 1098 1099 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1100 refs = btrfs_extent_refs(leaf, ei); 1101 WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); 1102 refs += refs_to_mod; 1103 btrfs_set_extent_refs(leaf, ei, refs); 1104 if (extent_op) 1105 __run_delayed_extent_op(extent_op, leaf, ei); 1106 1107 /* 1108 * If type is invalid, we should have bailed out after 1109 * lookup_inline_extent_backref(). 1110 */ 1111 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1112 ASSERT(type != BTRFS_REF_TYPE_INVALID); 1113 1114 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1115 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1116 refs = btrfs_extent_data_ref_count(leaf, dref); 1117 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1118 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1119 refs = btrfs_shared_data_ref_count(leaf, sref); 1120 } else { 1121 refs = 1; 1122 BUG_ON(refs_to_mod != -1); 1123 } 1124 1125 BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); 1126 refs += refs_to_mod; 1127 1128 if (refs > 0) { 1129 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1130 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1131 else 1132 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1133 } else { 1134 size = btrfs_extent_inline_ref_size(type); 1135 item_size = btrfs_item_size(leaf, path->slots[0]); 1136 ptr = (unsigned long)iref; 1137 end = (unsigned long)ei + item_size; 1138 if (ptr + size < end) 1139 memmove_extent_buffer(leaf, ptr, ptr + size, 1140 end - ptr - size); 1141 item_size -= size; 1142 btrfs_truncate_item(path, item_size, 1); 1143 } 1144 btrfs_mark_buffer_dirty(leaf); 1145 } 1146 1147 static noinline_for_stack 1148 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1149 struct btrfs_path *path, 1150 u64 bytenr, u64 num_bytes, u64 parent, 1151 u64 root_objectid, u64 owner, 1152 u64 offset, int refs_to_add, 1153 struct btrfs_delayed_extent_op *extent_op) 1154 { 1155 struct btrfs_extent_inline_ref *iref; 1156 int ret; 1157 1158 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1159 num_bytes, parent, root_objectid, 1160 owner, offset, 1); 1161 if (ret == 0) { 1162 /* 1163 * We're adding refs to a tree block we already own, this 1164 * should not happen at all. 1165 */ 1166 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1167 btrfs_print_leaf(path->nodes[0]); 1168 btrfs_crit(trans->fs_info, 1169 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", 1170 bytenr, num_bytes, root_objectid, path->slots[0]); 1171 return -EUCLEAN; 1172 } 1173 update_inline_extent_backref(path, iref, refs_to_add, extent_op); 1174 } else if (ret == -ENOENT) { 1175 setup_inline_extent_backref(trans->fs_info, path, iref, parent, 1176 root_objectid, owner, offset, 1177 refs_to_add, extent_op); 1178 ret = 0; 1179 } 1180 return ret; 1181 } 1182 1183 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1184 struct btrfs_root *root, 1185 struct btrfs_path *path, 1186 struct btrfs_extent_inline_ref *iref, 1187 int refs_to_drop, int is_data) 1188 { 1189 int ret = 0; 1190 1191 BUG_ON(!is_data && refs_to_drop != 1); 1192 if (iref) 1193 update_inline_extent_backref(path, iref, -refs_to_drop, NULL); 1194 else if (is_data) 1195 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1196 else 1197 ret = btrfs_del_item(trans, root, path); 1198 return ret; 1199 } 1200 1201 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1202 u64 *discarded_bytes) 1203 { 1204 int j, ret = 0; 1205 u64 bytes_left, end; 1206 u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); 1207 1208 if (WARN_ON(start != aligned_start)) { 1209 len -= aligned_start - start; 1210 len = round_down(len, 1 << SECTOR_SHIFT); 1211 start = aligned_start; 1212 } 1213 1214 *discarded_bytes = 0; 1215 1216 if (!len) 1217 return 0; 1218 1219 end = start + len; 1220 bytes_left = len; 1221 1222 /* Skip any superblocks on this device. */ 1223 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1224 u64 sb_start = btrfs_sb_offset(j); 1225 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1226 u64 size = sb_start - start; 1227 1228 if (!in_range(sb_start, start, bytes_left) && 1229 !in_range(sb_end, start, bytes_left) && 1230 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1231 continue; 1232 1233 /* 1234 * Superblock spans beginning of range. Adjust start and 1235 * try again. 1236 */ 1237 if (sb_start <= start) { 1238 start += sb_end - start; 1239 if (start > end) { 1240 bytes_left = 0; 1241 break; 1242 } 1243 bytes_left = end - start; 1244 continue; 1245 } 1246 1247 if (size) { 1248 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1249 size >> SECTOR_SHIFT, 1250 GFP_NOFS); 1251 if (!ret) 1252 *discarded_bytes += size; 1253 else if (ret != -EOPNOTSUPP) 1254 return ret; 1255 } 1256 1257 start = sb_end; 1258 if (start > end) { 1259 bytes_left = 0; 1260 break; 1261 } 1262 bytes_left = end - start; 1263 } 1264 1265 if (bytes_left) { 1266 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1267 bytes_left >> SECTOR_SHIFT, 1268 GFP_NOFS); 1269 if (!ret) 1270 *discarded_bytes += bytes_left; 1271 } 1272 return ret; 1273 } 1274 1275 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) 1276 { 1277 struct btrfs_device *dev = stripe->dev; 1278 struct btrfs_fs_info *fs_info = dev->fs_info; 1279 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1280 u64 phys = stripe->physical; 1281 u64 len = stripe->length; 1282 u64 discarded = 0; 1283 int ret = 0; 1284 1285 /* Zone reset on a zoned filesystem */ 1286 if (btrfs_can_zone_reset(dev, phys, len)) { 1287 u64 src_disc; 1288 1289 ret = btrfs_reset_device_zone(dev, phys, len, &discarded); 1290 if (ret) 1291 goto out; 1292 1293 if (!btrfs_dev_replace_is_ongoing(dev_replace) || 1294 dev != dev_replace->srcdev) 1295 goto out; 1296 1297 src_disc = discarded; 1298 1299 /* Send to replace target as well */ 1300 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, 1301 &discarded); 1302 discarded += src_disc; 1303 } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { 1304 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); 1305 } else { 1306 ret = 0; 1307 *bytes = 0; 1308 } 1309 1310 out: 1311 *bytes = discarded; 1312 return ret; 1313 } 1314 1315 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1316 u64 num_bytes, u64 *actual_bytes) 1317 { 1318 int ret = 0; 1319 u64 discarded_bytes = 0; 1320 u64 end = bytenr + num_bytes; 1321 u64 cur = bytenr; 1322 1323 /* 1324 * Avoid races with device replace and make sure the devices in the 1325 * stripes don't go away while we are discarding. 1326 */ 1327 btrfs_bio_counter_inc_blocked(fs_info); 1328 while (cur < end) { 1329 struct btrfs_discard_stripe *stripes; 1330 unsigned int num_stripes; 1331 int i; 1332 1333 num_bytes = end - cur; 1334 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); 1335 if (IS_ERR(stripes)) { 1336 ret = PTR_ERR(stripes); 1337 if (ret == -EOPNOTSUPP) 1338 ret = 0; 1339 break; 1340 } 1341 1342 for (i = 0; i < num_stripes; i++) { 1343 struct btrfs_discard_stripe *stripe = stripes + i; 1344 u64 bytes; 1345 1346 if (!stripe->dev->bdev) { 1347 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 1348 continue; 1349 } 1350 1351 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 1352 &stripe->dev->dev_state)) 1353 continue; 1354 1355 ret = do_discard_extent(stripe, &bytes); 1356 if (ret) { 1357 /* 1358 * Keep going if discard is not supported by the 1359 * device. 1360 */ 1361 if (ret != -EOPNOTSUPP) 1362 break; 1363 ret = 0; 1364 } else { 1365 discarded_bytes += bytes; 1366 } 1367 } 1368 kfree(stripes); 1369 if (ret) 1370 break; 1371 cur += num_bytes; 1372 } 1373 btrfs_bio_counter_dec(fs_info); 1374 if (actual_bytes) 1375 *actual_bytes = discarded_bytes; 1376 return ret; 1377 } 1378 1379 /* Can return -ENOMEM */ 1380 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1381 struct btrfs_ref *generic_ref) 1382 { 1383 struct btrfs_fs_info *fs_info = trans->fs_info; 1384 int ret; 1385 1386 ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && 1387 generic_ref->action); 1388 BUG_ON(generic_ref->type == BTRFS_REF_METADATA && 1389 generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); 1390 1391 if (generic_ref->type == BTRFS_REF_METADATA) 1392 ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); 1393 else 1394 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); 1395 1396 btrfs_ref_tree_mod(fs_info, generic_ref); 1397 1398 return ret; 1399 } 1400 1401 /* 1402 * __btrfs_inc_extent_ref - insert backreference for a given extent 1403 * 1404 * The counterpart is in __btrfs_free_extent(), with examples and more details 1405 * how it works. 1406 * 1407 * @trans: Handle of transaction 1408 * 1409 * @node: The delayed ref node used to get the bytenr/length for 1410 * extent whose references are incremented. 1411 * 1412 * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ 1413 * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical 1414 * bytenr of the parent block. Since new extents are always 1415 * created with indirect references, this will only be the case 1416 * when relocating a shared extent. In that case, root_objectid 1417 * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must 1418 * be 0 1419 * 1420 * @root_objectid: The id of the root where this modification has originated, 1421 * this can be either one of the well-known metadata trees or 1422 * the subvolume id which references this extent. 1423 * 1424 * @owner: For data extents it is the inode number of the owning file. 1425 * For metadata extents this parameter holds the level in the 1426 * tree of the extent. 1427 * 1428 * @offset: For metadata extents the offset is ignored and is currently 1429 * always passed as 0. For data extents it is the fileoffset 1430 * this extent belongs to. 1431 * 1432 * @refs_to_add Number of references to add 1433 * 1434 * @extent_op Pointer to a structure, holding information necessary when 1435 * updating a tree block's flags 1436 * 1437 */ 1438 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1439 struct btrfs_delayed_ref_node *node, 1440 u64 parent, u64 root_objectid, 1441 u64 owner, u64 offset, int refs_to_add, 1442 struct btrfs_delayed_extent_op *extent_op) 1443 { 1444 struct btrfs_path *path; 1445 struct extent_buffer *leaf; 1446 struct btrfs_extent_item *item; 1447 struct btrfs_key key; 1448 u64 bytenr = node->bytenr; 1449 u64 num_bytes = node->num_bytes; 1450 u64 refs; 1451 int ret; 1452 1453 path = btrfs_alloc_path(); 1454 if (!path) 1455 return -ENOMEM; 1456 1457 /* this will setup the path even if it fails to insert the back ref */ 1458 ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, 1459 parent, root_objectid, owner, 1460 offset, refs_to_add, extent_op); 1461 if ((ret < 0 && ret != -EAGAIN) || !ret) 1462 goto out; 1463 1464 /* 1465 * Ok we had -EAGAIN which means we didn't have space to insert and 1466 * inline extent ref, so just update the reference count and add a 1467 * normal backref. 1468 */ 1469 leaf = path->nodes[0]; 1470 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1471 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1472 refs = btrfs_extent_refs(leaf, item); 1473 btrfs_set_extent_refs(leaf, item, refs + refs_to_add); 1474 if (extent_op) 1475 __run_delayed_extent_op(extent_op, leaf, item); 1476 1477 btrfs_mark_buffer_dirty(leaf); 1478 btrfs_release_path(path); 1479 1480 /* now insert the actual backref */ 1481 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1482 BUG_ON(refs_to_add != 1); 1483 ret = insert_tree_block_ref(trans, path, bytenr, parent, 1484 root_objectid); 1485 } else { 1486 ret = insert_extent_data_ref(trans, path, bytenr, parent, 1487 root_objectid, owner, offset, 1488 refs_to_add); 1489 } 1490 if (ret) 1491 btrfs_abort_transaction(trans, ret); 1492 out: 1493 btrfs_free_path(path); 1494 return ret; 1495 } 1496 1497 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, 1498 struct btrfs_delayed_ref_node *node, 1499 struct btrfs_delayed_extent_op *extent_op, 1500 bool insert_reserved) 1501 { 1502 int ret = 0; 1503 struct btrfs_delayed_data_ref *ref; 1504 struct btrfs_key ins; 1505 u64 parent = 0; 1506 u64 ref_root = 0; 1507 u64 flags = 0; 1508 1509 ins.objectid = node->bytenr; 1510 ins.offset = node->num_bytes; 1511 ins.type = BTRFS_EXTENT_ITEM_KEY; 1512 1513 ref = btrfs_delayed_node_to_data_ref(node); 1514 trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); 1515 1516 if (node->type == BTRFS_SHARED_DATA_REF_KEY) 1517 parent = ref->parent; 1518 ref_root = ref->root; 1519 1520 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1521 if (extent_op) 1522 flags |= extent_op->flags_to_set; 1523 ret = alloc_reserved_file_extent(trans, parent, ref_root, 1524 flags, ref->objectid, 1525 ref->offset, &ins, 1526 node->ref_mod); 1527 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1528 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 1529 ref->objectid, ref->offset, 1530 node->ref_mod, extent_op); 1531 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1532 ret = __btrfs_free_extent(trans, node, parent, 1533 ref_root, ref->objectid, 1534 ref->offset, node->ref_mod, 1535 extent_op); 1536 } else { 1537 BUG(); 1538 } 1539 return ret; 1540 } 1541 1542 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, 1543 struct extent_buffer *leaf, 1544 struct btrfs_extent_item *ei) 1545 { 1546 u64 flags = btrfs_extent_flags(leaf, ei); 1547 if (extent_op->update_flags) { 1548 flags |= extent_op->flags_to_set; 1549 btrfs_set_extent_flags(leaf, ei, flags); 1550 } 1551 1552 if (extent_op->update_key) { 1553 struct btrfs_tree_block_info *bi; 1554 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 1555 bi = (struct btrfs_tree_block_info *)(ei + 1); 1556 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 1557 } 1558 } 1559 1560 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 1561 struct btrfs_delayed_ref_head *head, 1562 struct btrfs_delayed_extent_op *extent_op) 1563 { 1564 struct btrfs_fs_info *fs_info = trans->fs_info; 1565 struct btrfs_root *root; 1566 struct btrfs_key key; 1567 struct btrfs_path *path; 1568 struct btrfs_extent_item *ei; 1569 struct extent_buffer *leaf; 1570 u32 item_size; 1571 int ret; 1572 int err = 0; 1573 int metadata = 1; 1574 1575 if (TRANS_ABORTED(trans)) 1576 return 0; 1577 1578 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1579 metadata = 0; 1580 1581 path = btrfs_alloc_path(); 1582 if (!path) 1583 return -ENOMEM; 1584 1585 key.objectid = head->bytenr; 1586 1587 if (metadata) { 1588 key.type = BTRFS_METADATA_ITEM_KEY; 1589 key.offset = extent_op->level; 1590 } else { 1591 key.type = BTRFS_EXTENT_ITEM_KEY; 1592 key.offset = head->num_bytes; 1593 } 1594 1595 root = btrfs_extent_root(fs_info, key.objectid); 1596 again: 1597 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1598 if (ret < 0) { 1599 err = ret; 1600 goto out; 1601 } 1602 if (ret > 0) { 1603 if (metadata) { 1604 if (path->slots[0] > 0) { 1605 path->slots[0]--; 1606 btrfs_item_key_to_cpu(path->nodes[0], &key, 1607 path->slots[0]); 1608 if (key.objectid == head->bytenr && 1609 key.type == BTRFS_EXTENT_ITEM_KEY && 1610 key.offset == head->num_bytes) 1611 ret = 0; 1612 } 1613 if (ret > 0) { 1614 btrfs_release_path(path); 1615 metadata = 0; 1616 1617 key.objectid = head->bytenr; 1618 key.offset = head->num_bytes; 1619 key.type = BTRFS_EXTENT_ITEM_KEY; 1620 goto again; 1621 } 1622 } else { 1623 err = -EIO; 1624 goto out; 1625 } 1626 } 1627 1628 leaf = path->nodes[0]; 1629 item_size = btrfs_item_size(leaf, path->slots[0]); 1630 1631 if (unlikely(item_size < sizeof(*ei))) { 1632 err = -EINVAL; 1633 btrfs_print_v0_err(fs_info); 1634 btrfs_abort_transaction(trans, err); 1635 goto out; 1636 } 1637 1638 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1639 __run_delayed_extent_op(extent_op, leaf, ei); 1640 1641 btrfs_mark_buffer_dirty(leaf); 1642 out: 1643 btrfs_free_path(path); 1644 return err; 1645 } 1646 1647 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 1648 struct btrfs_delayed_ref_node *node, 1649 struct btrfs_delayed_extent_op *extent_op, 1650 bool insert_reserved) 1651 { 1652 int ret = 0; 1653 struct btrfs_delayed_tree_ref *ref; 1654 u64 parent = 0; 1655 u64 ref_root = 0; 1656 1657 ref = btrfs_delayed_node_to_tree_ref(node); 1658 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); 1659 1660 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1661 parent = ref->parent; 1662 ref_root = ref->root; 1663 1664 if (node->ref_mod != 1) { 1665 btrfs_err(trans->fs_info, 1666 "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", 1667 node->bytenr, node->ref_mod, node->action, ref_root, 1668 parent); 1669 return -EIO; 1670 } 1671 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1672 BUG_ON(!extent_op || !extent_op->update_flags); 1673 ret = alloc_reserved_tree_block(trans, node, extent_op); 1674 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1675 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 1676 ref->level, 0, 1, extent_op); 1677 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1678 ret = __btrfs_free_extent(trans, node, parent, ref_root, 1679 ref->level, 0, 1, extent_op); 1680 } else { 1681 BUG(); 1682 } 1683 return ret; 1684 } 1685 1686 /* helper function to actually process a single delayed ref entry */ 1687 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1688 struct btrfs_delayed_ref_node *node, 1689 struct btrfs_delayed_extent_op *extent_op, 1690 bool insert_reserved) 1691 { 1692 int ret = 0; 1693 1694 if (TRANS_ABORTED(trans)) { 1695 if (insert_reserved) 1696 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1697 return 0; 1698 } 1699 1700 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1701 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1702 ret = run_delayed_tree_ref(trans, node, extent_op, 1703 insert_reserved); 1704 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1705 node->type == BTRFS_SHARED_DATA_REF_KEY) 1706 ret = run_delayed_data_ref(trans, node, extent_op, 1707 insert_reserved); 1708 else 1709 BUG(); 1710 if (ret && insert_reserved) 1711 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1712 if (ret < 0) 1713 btrfs_err(trans->fs_info, 1714 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", 1715 node->bytenr, node->num_bytes, node->type, 1716 node->action, node->ref_mod, ret); 1717 return ret; 1718 } 1719 1720 static inline struct btrfs_delayed_ref_node * 1721 select_delayed_ref(struct btrfs_delayed_ref_head *head) 1722 { 1723 struct btrfs_delayed_ref_node *ref; 1724 1725 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 1726 return NULL; 1727 1728 /* 1729 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 1730 * This is to prevent a ref count from going down to zero, which deletes 1731 * the extent item from the extent tree, when there still are references 1732 * to add, which would fail because they would not find the extent item. 1733 */ 1734 if (!list_empty(&head->ref_add_list)) 1735 return list_first_entry(&head->ref_add_list, 1736 struct btrfs_delayed_ref_node, add_list); 1737 1738 ref = rb_entry(rb_first_cached(&head->ref_tree), 1739 struct btrfs_delayed_ref_node, ref_node); 1740 ASSERT(list_empty(&ref->add_list)); 1741 return ref; 1742 } 1743 1744 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 1745 struct btrfs_delayed_ref_head *head) 1746 { 1747 spin_lock(&delayed_refs->lock); 1748 head->processing = false; 1749 delayed_refs->num_heads_ready++; 1750 spin_unlock(&delayed_refs->lock); 1751 btrfs_delayed_ref_unlock(head); 1752 } 1753 1754 static struct btrfs_delayed_extent_op *cleanup_extent_op( 1755 struct btrfs_delayed_ref_head *head) 1756 { 1757 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 1758 1759 if (!extent_op) 1760 return NULL; 1761 1762 if (head->must_insert_reserved) { 1763 head->extent_op = NULL; 1764 btrfs_free_delayed_extent_op(extent_op); 1765 return NULL; 1766 } 1767 return extent_op; 1768 } 1769 1770 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 1771 struct btrfs_delayed_ref_head *head) 1772 { 1773 struct btrfs_delayed_extent_op *extent_op; 1774 int ret; 1775 1776 extent_op = cleanup_extent_op(head); 1777 if (!extent_op) 1778 return 0; 1779 head->extent_op = NULL; 1780 spin_unlock(&head->lock); 1781 ret = run_delayed_extent_op(trans, head, extent_op); 1782 btrfs_free_delayed_extent_op(extent_op); 1783 return ret ? ret : 1; 1784 } 1785 1786 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, 1787 struct btrfs_delayed_ref_root *delayed_refs, 1788 struct btrfs_delayed_ref_head *head) 1789 { 1790 int nr_items = 1; /* Dropping this ref head update. */ 1791 1792 /* 1793 * We had csum deletions accounted for in our delayed refs rsv, we need 1794 * to drop the csum leaves for this update from our delayed_refs_rsv. 1795 */ 1796 if (head->total_ref_mod < 0 && head->is_data) { 1797 spin_lock(&delayed_refs->lock); 1798 delayed_refs->pending_csums -= head->num_bytes; 1799 spin_unlock(&delayed_refs->lock); 1800 nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); 1801 } 1802 1803 btrfs_delayed_refs_rsv_release(fs_info, nr_items); 1804 } 1805 1806 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 1807 struct btrfs_delayed_ref_head *head) 1808 { 1809 1810 struct btrfs_fs_info *fs_info = trans->fs_info; 1811 struct btrfs_delayed_ref_root *delayed_refs; 1812 int ret; 1813 1814 delayed_refs = &trans->transaction->delayed_refs; 1815 1816 ret = run_and_cleanup_extent_op(trans, head); 1817 if (ret < 0) { 1818 unselect_delayed_ref_head(delayed_refs, head); 1819 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 1820 return ret; 1821 } else if (ret) { 1822 return ret; 1823 } 1824 1825 /* 1826 * Need to drop our head ref lock and re-acquire the delayed ref lock 1827 * and then re-check to make sure nobody got added. 1828 */ 1829 spin_unlock(&head->lock); 1830 spin_lock(&delayed_refs->lock); 1831 spin_lock(&head->lock); 1832 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 1833 spin_unlock(&head->lock); 1834 spin_unlock(&delayed_refs->lock); 1835 return 1; 1836 } 1837 btrfs_delete_ref_head(delayed_refs, head); 1838 spin_unlock(&head->lock); 1839 spin_unlock(&delayed_refs->lock); 1840 1841 if (head->must_insert_reserved) { 1842 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); 1843 if (head->is_data) { 1844 struct btrfs_root *csum_root; 1845 1846 csum_root = btrfs_csum_root(fs_info, head->bytenr); 1847 ret = btrfs_del_csums(trans, csum_root, head->bytenr, 1848 head->num_bytes); 1849 } 1850 } 1851 1852 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 1853 1854 trace_run_delayed_ref_head(fs_info, head, 0); 1855 btrfs_delayed_ref_unlock(head); 1856 btrfs_put_delayed_ref_head(head); 1857 return ret; 1858 } 1859 1860 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( 1861 struct btrfs_trans_handle *trans) 1862 { 1863 struct btrfs_delayed_ref_root *delayed_refs = 1864 &trans->transaction->delayed_refs; 1865 struct btrfs_delayed_ref_head *head = NULL; 1866 int ret; 1867 1868 spin_lock(&delayed_refs->lock); 1869 head = btrfs_select_ref_head(delayed_refs); 1870 if (!head) { 1871 spin_unlock(&delayed_refs->lock); 1872 return head; 1873 } 1874 1875 /* 1876 * Grab the lock that says we are going to process all the refs for 1877 * this head 1878 */ 1879 ret = btrfs_delayed_ref_lock(delayed_refs, head); 1880 spin_unlock(&delayed_refs->lock); 1881 1882 /* 1883 * We may have dropped the spin lock to get the head mutex lock, and 1884 * that might have given someone else time to free the head. If that's 1885 * true, it has been removed from our list and we can move on. 1886 */ 1887 if (ret == -EAGAIN) 1888 head = ERR_PTR(-EAGAIN); 1889 1890 return head; 1891 } 1892 1893 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 1894 struct btrfs_delayed_ref_head *locked_ref) 1895 { 1896 struct btrfs_fs_info *fs_info = trans->fs_info; 1897 struct btrfs_delayed_ref_root *delayed_refs; 1898 struct btrfs_delayed_extent_op *extent_op; 1899 struct btrfs_delayed_ref_node *ref; 1900 bool must_insert_reserved; 1901 int ret; 1902 1903 delayed_refs = &trans->transaction->delayed_refs; 1904 1905 lockdep_assert_held(&locked_ref->mutex); 1906 lockdep_assert_held(&locked_ref->lock); 1907 1908 while ((ref = select_delayed_ref(locked_ref))) { 1909 if (ref->seq && 1910 btrfs_check_delayed_seq(fs_info, ref->seq)) { 1911 spin_unlock(&locked_ref->lock); 1912 unselect_delayed_ref_head(delayed_refs, locked_ref); 1913 return -EAGAIN; 1914 } 1915 1916 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 1917 RB_CLEAR_NODE(&ref->ref_node); 1918 if (!list_empty(&ref->add_list)) 1919 list_del(&ref->add_list); 1920 /* 1921 * When we play the delayed ref, also correct the ref_mod on 1922 * head 1923 */ 1924 switch (ref->action) { 1925 case BTRFS_ADD_DELAYED_REF: 1926 case BTRFS_ADD_DELAYED_EXTENT: 1927 locked_ref->ref_mod -= ref->ref_mod; 1928 break; 1929 case BTRFS_DROP_DELAYED_REF: 1930 locked_ref->ref_mod += ref->ref_mod; 1931 break; 1932 default: 1933 WARN_ON(1); 1934 } 1935 atomic_dec(&delayed_refs->num_entries); 1936 1937 /* 1938 * Record the must_insert_reserved flag before we drop the 1939 * spin lock. 1940 */ 1941 must_insert_reserved = locked_ref->must_insert_reserved; 1942 locked_ref->must_insert_reserved = false; 1943 1944 extent_op = locked_ref->extent_op; 1945 locked_ref->extent_op = NULL; 1946 spin_unlock(&locked_ref->lock); 1947 1948 ret = run_one_delayed_ref(trans, ref, extent_op, 1949 must_insert_reserved); 1950 1951 btrfs_free_delayed_extent_op(extent_op); 1952 if (ret) { 1953 unselect_delayed_ref_head(delayed_refs, locked_ref); 1954 btrfs_put_delayed_ref(ref); 1955 return ret; 1956 } 1957 1958 btrfs_put_delayed_ref(ref); 1959 cond_resched(); 1960 1961 spin_lock(&locked_ref->lock); 1962 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 1963 } 1964 1965 return 0; 1966 } 1967 1968 /* 1969 * Returns 0 on success or if called with an already aborted transaction. 1970 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 1971 */ 1972 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1973 unsigned long nr) 1974 { 1975 struct btrfs_fs_info *fs_info = trans->fs_info; 1976 struct btrfs_delayed_ref_root *delayed_refs; 1977 struct btrfs_delayed_ref_head *locked_ref = NULL; 1978 int ret; 1979 unsigned long count = 0; 1980 1981 delayed_refs = &trans->transaction->delayed_refs; 1982 do { 1983 if (!locked_ref) { 1984 locked_ref = btrfs_obtain_ref_head(trans); 1985 if (IS_ERR_OR_NULL(locked_ref)) { 1986 if (PTR_ERR(locked_ref) == -EAGAIN) { 1987 continue; 1988 } else { 1989 break; 1990 } 1991 } 1992 count++; 1993 } 1994 /* 1995 * We need to try and merge add/drops of the same ref since we 1996 * can run into issues with relocate dropping the implicit ref 1997 * and then it being added back again before the drop can 1998 * finish. If we merged anything we need to re-loop so we can 1999 * get a good ref. 2000 * Or we can get node references of the same type that weren't 2001 * merged when created due to bumps in the tree mod seq, and 2002 * we need to merge them to prevent adding an inline extent 2003 * backref before dropping it (triggering a BUG_ON at 2004 * insert_inline_extent_backref()). 2005 */ 2006 spin_lock(&locked_ref->lock); 2007 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 2008 2009 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); 2010 if (ret < 0 && ret != -EAGAIN) { 2011 /* 2012 * Error, btrfs_run_delayed_refs_for_head already 2013 * unlocked everything so just bail out 2014 */ 2015 return ret; 2016 } else if (!ret) { 2017 /* 2018 * Success, perform the usual cleanup of a processed 2019 * head 2020 */ 2021 ret = cleanup_ref_head(trans, locked_ref); 2022 if (ret > 0 ) { 2023 /* We dropped our lock, we need to loop. */ 2024 ret = 0; 2025 continue; 2026 } else if (ret) { 2027 return ret; 2028 } 2029 } 2030 2031 /* 2032 * Either success case or btrfs_run_delayed_refs_for_head 2033 * returned -EAGAIN, meaning we need to select another head 2034 */ 2035 2036 locked_ref = NULL; 2037 cond_resched(); 2038 } while ((nr != -1 && count < nr) || locked_ref); 2039 2040 return 0; 2041 } 2042 2043 #ifdef SCRAMBLE_DELAYED_REFS 2044 /* 2045 * Normally delayed refs get processed in ascending bytenr order. This 2046 * correlates in most cases to the order added. To expose dependencies on this 2047 * order, we start to process the tree in the middle instead of the beginning 2048 */ 2049 static u64 find_middle(struct rb_root *root) 2050 { 2051 struct rb_node *n = root->rb_node; 2052 struct btrfs_delayed_ref_node *entry; 2053 int alt = 1; 2054 u64 middle; 2055 u64 first = 0, last = 0; 2056 2057 n = rb_first(root); 2058 if (n) { 2059 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2060 first = entry->bytenr; 2061 } 2062 n = rb_last(root); 2063 if (n) { 2064 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2065 last = entry->bytenr; 2066 } 2067 n = root->rb_node; 2068 2069 while (n) { 2070 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2071 WARN_ON(!entry->in_tree); 2072 2073 middle = entry->bytenr; 2074 2075 if (alt) 2076 n = n->rb_left; 2077 else 2078 n = n->rb_right; 2079 2080 alt = 1 - alt; 2081 } 2082 return middle; 2083 } 2084 #endif 2085 2086 /* 2087 * this starts processing the delayed reference count updates and 2088 * extent insertions we have queued up so far. count can be 2089 * 0, which means to process everything in the tree at the start 2090 * of the run (but not newly added entries), or it can be some target 2091 * number you'd like to process. 2092 * 2093 * Returns 0 on success or if called with an aborted transaction 2094 * Returns <0 on error and aborts the transaction 2095 */ 2096 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2097 unsigned long count) 2098 { 2099 struct btrfs_fs_info *fs_info = trans->fs_info; 2100 struct rb_node *node; 2101 struct btrfs_delayed_ref_root *delayed_refs; 2102 struct btrfs_delayed_ref_head *head; 2103 int ret; 2104 int run_all = count == (unsigned long)-1; 2105 2106 /* We'll clean this up in btrfs_cleanup_transaction */ 2107 if (TRANS_ABORTED(trans)) 2108 return 0; 2109 2110 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2111 return 0; 2112 2113 delayed_refs = &trans->transaction->delayed_refs; 2114 if (count == 0) 2115 count = delayed_refs->num_heads_ready; 2116 2117 again: 2118 #ifdef SCRAMBLE_DELAYED_REFS 2119 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2120 #endif 2121 ret = __btrfs_run_delayed_refs(trans, count); 2122 if (ret < 0) { 2123 btrfs_abort_transaction(trans, ret); 2124 return ret; 2125 } 2126 2127 if (run_all) { 2128 btrfs_create_pending_block_groups(trans); 2129 2130 spin_lock(&delayed_refs->lock); 2131 node = rb_first_cached(&delayed_refs->href_root); 2132 if (!node) { 2133 spin_unlock(&delayed_refs->lock); 2134 goto out; 2135 } 2136 head = rb_entry(node, struct btrfs_delayed_ref_head, 2137 href_node); 2138 refcount_inc(&head->refs); 2139 spin_unlock(&delayed_refs->lock); 2140 2141 /* Mutex was contended, block until it's released and retry. */ 2142 mutex_lock(&head->mutex); 2143 mutex_unlock(&head->mutex); 2144 2145 btrfs_put_delayed_ref_head(head); 2146 cond_resched(); 2147 goto again; 2148 } 2149 out: 2150 return 0; 2151 } 2152 2153 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2154 struct extent_buffer *eb, u64 flags) 2155 { 2156 struct btrfs_delayed_extent_op *extent_op; 2157 int level = btrfs_header_level(eb); 2158 int ret; 2159 2160 extent_op = btrfs_alloc_delayed_extent_op(); 2161 if (!extent_op) 2162 return -ENOMEM; 2163 2164 extent_op->flags_to_set = flags; 2165 extent_op->update_flags = true; 2166 extent_op->update_key = false; 2167 extent_op->level = level; 2168 2169 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); 2170 if (ret) 2171 btrfs_free_delayed_extent_op(extent_op); 2172 return ret; 2173 } 2174 2175 static noinline int check_delayed_ref(struct btrfs_root *root, 2176 struct btrfs_path *path, 2177 u64 objectid, u64 offset, u64 bytenr) 2178 { 2179 struct btrfs_delayed_ref_head *head; 2180 struct btrfs_delayed_ref_node *ref; 2181 struct btrfs_delayed_data_ref *data_ref; 2182 struct btrfs_delayed_ref_root *delayed_refs; 2183 struct btrfs_transaction *cur_trans; 2184 struct rb_node *node; 2185 int ret = 0; 2186 2187 spin_lock(&root->fs_info->trans_lock); 2188 cur_trans = root->fs_info->running_transaction; 2189 if (cur_trans) 2190 refcount_inc(&cur_trans->use_count); 2191 spin_unlock(&root->fs_info->trans_lock); 2192 if (!cur_trans) 2193 return 0; 2194 2195 delayed_refs = &cur_trans->delayed_refs; 2196 spin_lock(&delayed_refs->lock); 2197 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 2198 if (!head) { 2199 spin_unlock(&delayed_refs->lock); 2200 btrfs_put_transaction(cur_trans); 2201 return 0; 2202 } 2203 2204 if (!mutex_trylock(&head->mutex)) { 2205 if (path->nowait) { 2206 spin_unlock(&delayed_refs->lock); 2207 btrfs_put_transaction(cur_trans); 2208 return -EAGAIN; 2209 } 2210 2211 refcount_inc(&head->refs); 2212 spin_unlock(&delayed_refs->lock); 2213 2214 btrfs_release_path(path); 2215 2216 /* 2217 * Mutex was contended, block until it's released and let 2218 * caller try again 2219 */ 2220 mutex_lock(&head->mutex); 2221 mutex_unlock(&head->mutex); 2222 btrfs_put_delayed_ref_head(head); 2223 btrfs_put_transaction(cur_trans); 2224 return -EAGAIN; 2225 } 2226 spin_unlock(&delayed_refs->lock); 2227 2228 spin_lock(&head->lock); 2229 /* 2230 * XXX: We should replace this with a proper search function in the 2231 * future. 2232 */ 2233 for (node = rb_first_cached(&head->ref_tree); node; 2234 node = rb_next(node)) { 2235 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 2236 /* If it's a shared ref we know a cross reference exists */ 2237 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2238 ret = 1; 2239 break; 2240 } 2241 2242 data_ref = btrfs_delayed_node_to_data_ref(ref); 2243 2244 /* 2245 * If our ref doesn't match the one we're currently looking at 2246 * then we have a cross reference. 2247 */ 2248 if (data_ref->root != root->root_key.objectid || 2249 data_ref->objectid != objectid || 2250 data_ref->offset != offset) { 2251 ret = 1; 2252 break; 2253 } 2254 } 2255 spin_unlock(&head->lock); 2256 mutex_unlock(&head->mutex); 2257 btrfs_put_transaction(cur_trans); 2258 return ret; 2259 } 2260 2261 static noinline int check_committed_ref(struct btrfs_root *root, 2262 struct btrfs_path *path, 2263 u64 objectid, u64 offset, u64 bytenr, 2264 bool strict) 2265 { 2266 struct btrfs_fs_info *fs_info = root->fs_info; 2267 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); 2268 struct extent_buffer *leaf; 2269 struct btrfs_extent_data_ref *ref; 2270 struct btrfs_extent_inline_ref *iref; 2271 struct btrfs_extent_item *ei; 2272 struct btrfs_key key; 2273 u32 item_size; 2274 int type; 2275 int ret; 2276 2277 key.objectid = bytenr; 2278 key.offset = (u64)-1; 2279 key.type = BTRFS_EXTENT_ITEM_KEY; 2280 2281 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2282 if (ret < 0) 2283 goto out; 2284 BUG_ON(ret == 0); /* Corruption */ 2285 2286 ret = -ENOENT; 2287 if (path->slots[0] == 0) 2288 goto out; 2289 2290 path->slots[0]--; 2291 leaf = path->nodes[0]; 2292 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2293 2294 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2295 goto out; 2296 2297 ret = 1; 2298 item_size = btrfs_item_size(leaf, path->slots[0]); 2299 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2300 2301 /* If extent item has more than 1 inline ref then it's shared */ 2302 if (item_size != sizeof(*ei) + 2303 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2304 goto out; 2305 2306 /* 2307 * If extent created before last snapshot => it's shared unless the 2308 * snapshot has been deleted. Use the heuristic if strict is false. 2309 */ 2310 if (!strict && 2311 (btrfs_extent_generation(leaf, ei) <= 2312 btrfs_root_last_snapshot(&root->root_item))) 2313 goto out; 2314 2315 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2316 2317 /* If this extent has SHARED_DATA_REF then it's shared */ 2318 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 2319 if (type != BTRFS_EXTENT_DATA_REF_KEY) 2320 goto out; 2321 2322 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2323 if (btrfs_extent_refs(leaf, ei) != 2324 btrfs_extent_data_ref_count(leaf, ref) || 2325 btrfs_extent_data_ref_root(leaf, ref) != 2326 root->root_key.objectid || 2327 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2328 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2329 goto out; 2330 2331 ret = 0; 2332 out: 2333 return ret; 2334 } 2335 2336 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 2337 u64 bytenr, bool strict, struct btrfs_path *path) 2338 { 2339 int ret; 2340 2341 do { 2342 ret = check_committed_ref(root, path, objectid, 2343 offset, bytenr, strict); 2344 if (ret && ret != -ENOENT) 2345 goto out; 2346 2347 ret = check_delayed_ref(root, path, objectid, offset, bytenr); 2348 } while (ret == -EAGAIN); 2349 2350 out: 2351 btrfs_release_path(path); 2352 if (btrfs_is_data_reloc_root(root)) 2353 WARN_ON(ret > 0); 2354 return ret; 2355 } 2356 2357 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2358 struct btrfs_root *root, 2359 struct extent_buffer *buf, 2360 int full_backref, int inc) 2361 { 2362 struct btrfs_fs_info *fs_info = root->fs_info; 2363 u64 bytenr; 2364 u64 num_bytes; 2365 u64 parent; 2366 u64 ref_root; 2367 u32 nritems; 2368 struct btrfs_key key; 2369 struct btrfs_file_extent_item *fi; 2370 struct btrfs_ref generic_ref = { 0 }; 2371 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); 2372 int i; 2373 int action; 2374 int level; 2375 int ret = 0; 2376 2377 if (btrfs_is_testing(fs_info)) 2378 return 0; 2379 2380 ref_root = btrfs_header_owner(buf); 2381 nritems = btrfs_header_nritems(buf); 2382 level = btrfs_header_level(buf); 2383 2384 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) 2385 return 0; 2386 2387 if (full_backref) 2388 parent = buf->start; 2389 else 2390 parent = 0; 2391 if (inc) 2392 action = BTRFS_ADD_DELAYED_REF; 2393 else 2394 action = BTRFS_DROP_DELAYED_REF; 2395 2396 for (i = 0; i < nritems; i++) { 2397 if (level == 0) { 2398 btrfs_item_key_to_cpu(buf, &key, i); 2399 if (key.type != BTRFS_EXTENT_DATA_KEY) 2400 continue; 2401 fi = btrfs_item_ptr(buf, i, 2402 struct btrfs_file_extent_item); 2403 if (btrfs_file_extent_type(buf, fi) == 2404 BTRFS_FILE_EXTENT_INLINE) 2405 continue; 2406 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2407 if (bytenr == 0) 2408 continue; 2409 2410 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2411 key.offset -= btrfs_file_extent_offset(buf, fi); 2412 btrfs_init_generic_ref(&generic_ref, action, bytenr, 2413 num_bytes, parent); 2414 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, 2415 key.offset, root->root_key.objectid, 2416 for_reloc); 2417 if (inc) 2418 ret = btrfs_inc_extent_ref(trans, &generic_ref); 2419 else 2420 ret = btrfs_free_extent(trans, &generic_ref); 2421 if (ret) 2422 goto fail; 2423 } else { 2424 bytenr = btrfs_node_blockptr(buf, i); 2425 num_bytes = fs_info->nodesize; 2426 btrfs_init_generic_ref(&generic_ref, action, bytenr, 2427 num_bytes, parent); 2428 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, 2429 root->root_key.objectid, for_reloc); 2430 if (inc) 2431 ret = btrfs_inc_extent_ref(trans, &generic_ref); 2432 else 2433 ret = btrfs_free_extent(trans, &generic_ref); 2434 if (ret) 2435 goto fail; 2436 } 2437 } 2438 return 0; 2439 fail: 2440 return ret; 2441 } 2442 2443 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2444 struct extent_buffer *buf, int full_backref) 2445 { 2446 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2447 } 2448 2449 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2450 struct extent_buffer *buf, int full_backref) 2451 { 2452 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2453 } 2454 2455 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 2456 { 2457 struct btrfs_fs_info *fs_info = root->fs_info; 2458 u64 flags; 2459 u64 ret; 2460 2461 if (data) 2462 flags = BTRFS_BLOCK_GROUP_DATA; 2463 else if (root == fs_info->chunk_root) 2464 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2465 else 2466 flags = BTRFS_BLOCK_GROUP_METADATA; 2467 2468 ret = btrfs_get_alloc_profile(fs_info, flags); 2469 return ret; 2470 } 2471 2472 static u64 first_logical_byte(struct btrfs_fs_info *fs_info) 2473 { 2474 struct rb_node *leftmost; 2475 u64 bytenr = 0; 2476 2477 read_lock(&fs_info->block_group_cache_lock); 2478 /* Get the block group with the lowest logical start address. */ 2479 leftmost = rb_first_cached(&fs_info->block_group_cache_tree); 2480 if (leftmost) { 2481 struct btrfs_block_group *bg; 2482 2483 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); 2484 bytenr = bg->start; 2485 } 2486 read_unlock(&fs_info->block_group_cache_lock); 2487 2488 return bytenr; 2489 } 2490 2491 static int pin_down_extent(struct btrfs_trans_handle *trans, 2492 struct btrfs_block_group *cache, 2493 u64 bytenr, u64 num_bytes, int reserved) 2494 { 2495 struct btrfs_fs_info *fs_info = cache->fs_info; 2496 2497 spin_lock(&cache->space_info->lock); 2498 spin_lock(&cache->lock); 2499 cache->pinned += num_bytes; 2500 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, 2501 num_bytes); 2502 if (reserved) { 2503 cache->reserved -= num_bytes; 2504 cache->space_info->bytes_reserved -= num_bytes; 2505 } 2506 spin_unlock(&cache->lock); 2507 spin_unlock(&cache->space_info->lock); 2508 2509 set_extent_bit(&trans->transaction->pinned_extents, bytenr, 2510 bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); 2511 return 0; 2512 } 2513 2514 int btrfs_pin_extent(struct btrfs_trans_handle *trans, 2515 u64 bytenr, u64 num_bytes, int reserved) 2516 { 2517 struct btrfs_block_group *cache; 2518 2519 cache = btrfs_lookup_block_group(trans->fs_info, bytenr); 2520 BUG_ON(!cache); /* Logic error */ 2521 2522 pin_down_extent(trans, cache, bytenr, num_bytes, reserved); 2523 2524 btrfs_put_block_group(cache); 2525 return 0; 2526 } 2527 2528 /* 2529 * this function must be called within transaction 2530 */ 2531 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 2532 u64 bytenr, u64 num_bytes) 2533 { 2534 struct btrfs_block_group *cache; 2535 int ret; 2536 2537 cache = btrfs_lookup_block_group(trans->fs_info, bytenr); 2538 if (!cache) 2539 return -EINVAL; 2540 2541 /* 2542 * Fully cache the free space first so that our pin removes the free space 2543 * from the cache. 2544 */ 2545 ret = btrfs_cache_block_group(cache, true); 2546 if (ret) 2547 goto out; 2548 2549 pin_down_extent(trans, cache, bytenr, num_bytes, 0); 2550 2551 /* remove us from the free space cache (if we're there at all) */ 2552 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 2553 out: 2554 btrfs_put_block_group(cache); 2555 return ret; 2556 } 2557 2558 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 2559 u64 start, u64 num_bytes) 2560 { 2561 int ret; 2562 struct btrfs_block_group *block_group; 2563 2564 block_group = btrfs_lookup_block_group(fs_info, start); 2565 if (!block_group) 2566 return -EINVAL; 2567 2568 ret = btrfs_cache_block_group(block_group, true); 2569 if (ret) 2570 goto out; 2571 2572 ret = btrfs_remove_free_space(block_group, start, num_bytes); 2573 out: 2574 btrfs_put_block_group(block_group); 2575 return ret; 2576 } 2577 2578 int btrfs_exclude_logged_extents(struct extent_buffer *eb) 2579 { 2580 struct btrfs_fs_info *fs_info = eb->fs_info; 2581 struct btrfs_file_extent_item *item; 2582 struct btrfs_key key; 2583 int found_type; 2584 int i; 2585 int ret = 0; 2586 2587 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 2588 return 0; 2589 2590 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2591 btrfs_item_key_to_cpu(eb, &key, i); 2592 if (key.type != BTRFS_EXTENT_DATA_KEY) 2593 continue; 2594 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 2595 found_type = btrfs_file_extent_type(eb, item); 2596 if (found_type == BTRFS_FILE_EXTENT_INLINE) 2597 continue; 2598 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 2599 continue; 2600 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 2601 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 2602 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 2603 if (ret) 2604 break; 2605 } 2606 2607 return ret; 2608 } 2609 2610 static void 2611 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) 2612 { 2613 atomic_inc(&bg->reservations); 2614 } 2615 2616 /* 2617 * Returns the free cluster for the given space info and sets empty_cluster to 2618 * what it should be based on the mount options. 2619 */ 2620 static struct btrfs_free_cluster * 2621 fetch_cluster_info(struct btrfs_fs_info *fs_info, 2622 struct btrfs_space_info *space_info, u64 *empty_cluster) 2623 { 2624 struct btrfs_free_cluster *ret = NULL; 2625 2626 *empty_cluster = 0; 2627 if (btrfs_mixed_space_info(space_info)) 2628 return ret; 2629 2630 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 2631 ret = &fs_info->meta_alloc_cluster; 2632 if (btrfs_test_opt(fs_info, SSD)) 2633 *empty_cluster = SZ_2M; 2634 else 2635 *empty_cluster = SZ_64K; 2636 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 2637 btrfs_test_opt(fs_info, SSD_SPREAD)) { 2638 *empty_cluster = SZ_2M; 2639 ret = &fs_info->data_alloc_cluster; 2640 } 2641 2642 return ret; 2643 } 2644 2645 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 2646 u64 start, u64 end, 2647 const bool return_free_space) 2648 { 2649 struct btrfs_block_group *cache = NULL; 2650 struct btrfs_space_info *space_info; 2651 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2652 struct btrfs_free_cluster *cluster = NULL; 2653 u64 len; 2654 u64 total_unpinned = 0; 2655 u64 empty_cluster = 0; 2656 bool readonly; 2657 2658 while (start <= end) { 2659 readonly = false; 2660 if (!cache || 2661 start >= cache->start + cache->length) { 2662 if (cache) 2663 btrfs_put_block_group(cache); 2664 total_unpinned = 0; 2665 cache = btrfs_lookup_block_group(fs_info, start); 2666 BUG_ON(!cache); /* Logic error */ 2667 2668 cluster = fetch_cluster_info(fs_info, 2669 cache->space_info, 2670 &empty_cluster); 2671 empty_cluster <<= 1; 2672 } 2673 2674 len = cache->start + cache->length - start; 2675 len = min(len, end + 1 - start); 2676 2677 if (return_free_space) 2678 btrfs_add_free_space(cache, start, len); 2679 2680 start += len; 2681 total_unpinned += len; 2682 space_info = cache->space_info; 2683 2684 /* 2685 * If this space cluster has been marked as fragmented and we've 2686 * unpinned enough in this block group to potentially allow a 2687 * cluster to be created inside of it go ahead and clear the 2688 * fragmented check. 2689 */ 2690 if (cluster && cluster->fragmented && 2691 total_unpinned > empty_cluster) { 2692 spin_lock(&cluster->lock); 2693 cluster->fragmented = 0; 2694 spin_unlock(&cluster->lock); 2695 } 2696 2697 spin_lock(&space_info->lock); 2698 spin_lock(&cache->lock); 2699 cache->pinned -= len; 2700 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); 2701 space_info->max_extent_size = 0; 2702 if (cache->ro) { 2703 space_info->bytes_readonly += len; 2704 readonly = true; 2705 } else if (btrfs_is_zoned(fs_info)) { 2706 /* Need reset before reusing in a zoned block group */ 2707 space_info->bytes_zone_unusable += len; 2708 readonly = true; 2709 } 2710 spin_unlock(&cache->lock); 2711 if (!readonly && return_free_space && 2712 global_rsv->space_info == space_info) { 2713 spin_lock(&global_rsv->lock); 2714 if (!global_rsv->full) { 2715 u64 to_add = min(len, global_rsv->size - 2716 global_rsv->reserved); 2717 2718 global_rsv->reserved += to_add; 2719 btrfs_space_info_update_bytes_may_use(fs_info, 2720 space_info, to_add); 2721 if (global_rsv->reserved >= global_rsv->size) 2722 global_rsv->full = 1; 2723 len -= to_add; 2724 } 2725 spin_unlock(&global_rsv->lock); 2726 } 2727 /* Add to any tickets we may have */ 2728 if (!readonly && return_free_space && len) 2729 btrfs_try_granting_tickets(fs_info, space_info); 2730 spin_unlock(&space_info->lock); 2731 } 2732 2733 if (cache) 2734 btrfs_put_block_group(cache); 2735 return 0; 2736 } 2737 2738 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 2739 { 2740 struct btrfs_fs_info *fs_info = trans->fs_info; 2741 struct btrfs_block_group *block_group, *tmp; 2742 struct list_head *deleted_bgs; 2743 struct extent_io_tree *unpin; 2744 u64 start; 2745 u64 end; 2746 int ret; 2747 2748 unpin = &trans->transaction->pinned_extents; 2749 2750 while (!TRANS_ABORTED(trans)) { 2751 struct extent_state *cached_state = NULL; 2752 2753 mutex_lock(&fs_info->unused_bg_unpin_mutex); 2754 ret = find_first_extent_bit(unpin, 0, &start, &end, 2755 EXTENT_DIRTY, &cached_state); 2756 if (ret) { 2757 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2758 break; 2759 } 2760 2761 if (btrfs_test_opt(fs_info, DISCARD_SYNC)) 2762 ret = btrfs_discard_extent(fs_info, start, 2763 end + 1 - start, NULL); 2764 2765 clear_extent_dirty(unpin, start, end, &cached_state); 2766 unpin_extent_range(fs_info, start, end, true); 2767 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2768 free_extent_state(cached_state); 2769 cond_resched(); 2770 } 2771 2772 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { 2773 btrfs_discard_calc_delay(&fs_info->discard_ctl); 2774 btrfs_discard_schedule_work(&fs_info->discard_ctl, true); 2775 } 2776 2777 /* 2778 * Transaction is finished. We don't need the lock anymore. We 2779 * do need to clean up the block groups in case of a transaction 2780 * abort. 2781 */ 2782 deleted_bgs = &trans->transaction->deleted_bgs; 2783 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 2784 u64 trimmed = 0; 2785 2786 ret = -EROFS; 2787 if (!TRANS_ABORTED(trans)) 2788 ret = btrfs_discard_extent(fs_info, 2789 block_group->start, 2790 block_group->length, 2791 &trimmed); 2792 2793 list_del_init(&block_group->bg_list); 2794 btrfs_unfreeze_block_group(block_group); 2795 btrfs_put_block_group(block_group); 2796 2797 if (ret) { 2798 const char *errstr = btrfs_decode_error(ret); 2799 btrfs_warn(fs_info, 2800 "discard failed while removing blockgroup: errno=%d %s", 2801 ret, errstr); 2802 } 2803 } 2804 2805 return 0; 2806 } 2807 2808 static int do_free_extent_accounting(struct btrfs_trans_handle *trans, 2809 u64 bytenr, u64 num_bytes, bool is_data) 2810 { 2811 int ret; 2812 2813 if (is_data) { 2814 struct btrfs_root *csum_root; 2815 2816 csum_root = btrfs_csum_root(trans->fs_info, bytenr); 2817 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); 2818 if (ret) { 2819 btrfs_abort_transaction(trans, ret); 2820 return ret; 2821 } 2822 } 2823 2824 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 2825 if (ret) { 2826 btrfs_abort_transaction(trans, ret); 2827 return ret; 2828 } 2829 2830 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); 2831 if (ret) 2832 btrfs_abort_transaction(trans, ret); 2833 2834 return ret; 2835 } 2836 2837 #define abort_and_dump(trans, path, fmt, args...) \ 2838 ({ \ 2839 btrfs_abort_transaction(trans, -EUCLEAN); \ 2840 btrfs_print_leaf(path->nodes[0]); \ 2841 btrfs_crit(trans->fs_info, fmt, ##args); \ 2842 }) 2843 2844 /* 2845 * Drop one or more refs of @node. 2846 * 2847 * 1. Locate the extent refs. 2848 * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. 2849 * Locate it, then reduce the refs number or remove the ref line completely. 2850 * 2851 * 2. Update the refs count in EXTENT/METADATA_ITEM 2852 * 2853 * Inline backref case: 2854 * 2855 * in extent tree we have: 2856 * 2857 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 2858 * refs 2 gen 6 flags DATA 2859 * extent data backref root FS_TREE objectid 258 offset 0 count 1 2860 * extent data backref root FS_TREE objectid 257 offset 0 count 1 2861 * 2862 * This function gets called with: 2863 * 2864 * node->bytenr = 13631488 2865 * node->num_bytes = 1048576 2866 * root_objectid = FS_TREE 2867 * owner_objectid = 257 2868 * owner_offset = 0 2869 * refs_to_drop = 1 2870 * 2871 * Then we should get some like: 2872 * 2873 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 2874 * refs 1 gen 6 flags DATA 2875 * extent data backref root FS_TREE objectid 258 offset 0 count 1 2876 * 2877 * Keyed backref case: 2878 * 2879 * in extent tree we have: 2880 * 2881 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 2882 * refs 754 gen 6 flags DATA 2883 * [...] 2884 * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 2885 * extent data backref root FS_TREE objectid 866 offset 0 count 1 2886 * 2887 * This function get called with: 2888 * 2889 * node->bytenr = 13631488 2890 * node->num_bytes = 1048576 2891 * root_objectid = FS_TREE 2892 * owner_objectid = 866 2893 * owner_offset = 0 2894 * refs_to_drop = 1 2895 * 2896 * Then we should get some like: 2897 * 2898 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 2899 * refs 753 gen 6 flags DATA 2900 * 2901 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. 2902 */ 2903 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2904 struct btrfs_delayed_ref_node *node, u64 parent, 2905 u64 root_objectid, u64 owner_objectid, 2906 u64 owner_offset, int refs_to_drop, 2907 struct btrfs_delayed_extent_op *extent_op) 2908 { 2909 struct btrfs_fs_info *info = trans->fs_info; 2910 struct btrfs_key key; 2911 struct btrfs_path *path; 2912 struct btrfs_root *extent_root; 2913 struct extent_buffer *leaf; 2914 struct btrfs_extent_item *ei; 2915 struct btrfs_extent_inline_ref *iref; 2916 int ret; 2917 int is_data; 2918 int extent_slot = 0; 2919 int found_extent = 0; 2920 int num_to_del = 1; 2921 u32 item_size; 2922 u64 refs; 2923 u64 bytenr = node->bytenr; 2924 u64 num_bytes = node->num_bytes; 2925 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 2926 2927 extent_root = btrfs_extent_root(info, bytenr); 2928 ASSERT(extent_root); 2929 2930 path = btrfs_alloc_path(); 2931 if (!path) 2932 return -ENOMEM; 2933 2934 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 2935 2936 if (!is_data && refs_to_drop != 1) { 2937 btrfs_crit(info, 2938 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", 2939 node->bytenr, refs_to_drop); 2940 ret = -EINVAL; 2941 btrfs_abort_transaction(trans, ret); 2942 goto out; 2943 } 2944 2945 if (is_data) 2946 skinny_metadata = false; 2947 2948 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 2949 parent, root_objectid, owner_objectid, 2950 owner_offset); 2951 if (ret == 0) { 2952 /* 2953 * Either the inline backref or the SHARED_DATA_REF/ 2954 * SHARED_BLOCK_REF is found 2955 * 2956 * Here is a quick path to locate EXTENT/METADATA_ITEM. 2957 * It's possible the EXTENT/METADATA_ITEM is near current slot. 2958 */ 2959 extent_slot = path->slots[0]; 2960 while (extent_slot >= 0) { 2961 btrfs_item_key_to_cpu(path->nodes[0], &key, 2962 extent_slot); 2963 if (key.objectid != bytenr) 2964 break; 2965 if (key.type == BTRFS_EXTENT_ITEM_KEY && 2966 key.offset == num_bytes) { 2967 found_extent = 1; 2968 break; 2969 } 2970 if (key.type == BTRFS_METADATA_ITEM_KEY && 2971 key.offset == owner_objectid) { 2972 found_extent = 1; 2973 break; 2974 } 2975 2976 /* Quick path didn't find the EXTEMT/METADATA_ITEM */ 2977 if (path->slots[0] - extent_slot > 5) 2978 break; 2979 extent_slot--; 2980 } 2981 2982 if (!found_extent) { 2983 if (iref) { 2984 abort_and_dump(trans, path, 2985 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", 2986 path->slots[0]); 2987 ret = -EUCLEAN; 2988 goto out; 2989 } 2990 /* Must be SHARED_* item, remove the backref first */ 2991 ret = remove_extent_backref(trans, extent_root, path, 2992 NULL, refs_to_drop, is_data); 2993 if (ret) { 2994 btrfs_abort_transaction(trans, ret); 2995 goto out; 2996 } 2997 btrfs_release_path(path); 2998 2999 /* Slow path to locate EXTENT/METADATA_ITEM */ 3000 key.objectid = bytenr; 3001 key.type = BTRFS_EXTENT_ITEM_KEY; 3002 key.offset = num_bytes; 3003 3004 if (!is_data && skinny_metadata) { 3005 key.type = BTRFS_METADATA_ITEM_KEY; 3006 key.offset = owner_objectid; 3007 } 3008 3009 ret = btrfs_search_slot(trans, extent_root, 3010 &key, path, -1, 1); 3011 if (ret > 0 && skinny_metadata && path->slots[0]) { 3012 /* 3013 * Couldn't find our skinny metadata item, 3014 * see if we have ye olde extent item. 3015 */ 3016 path->slots[0]--; 3017 btrfs_item_key_to_cpu(path->nodes[0], &key, 3018 path->slots[0]); 3019 if (key.objectid == bytenr && 3020 key.type == BTRFS_EXTENT_ITEM_KEY && 3021 key.offset == num_bytes) 3022 ret = 0; 3023 } 3024 3025 if (ret > 0 && skinny_metadata) { 3026 skinny_metadata = false; 3027 key.objectid = bytenr; 3028 key.type = BTRFS_EXTENT_ITEM_KEY; 3029 key.offset = num_bytes; 3030 btrfs_release_path(path); 3031 ret = btrfs_search_slot(trans, extent_root, 3032 &key, path, -1, 1); 3033 } 3034 3035 if (ret) { 3036 if (ret > 0) 3037 btrfs_print_leaf(path->nodes[0]); 3038 btrfs_err(info, 3039 "umm, got %d back from search, was looking for %llu, slot %d", 3040 ret, bytenr, path->slots[0]); 3041 } 3042 if (ret < 0) { 3043 btrfs_abort_transaction(trans, ret); 3044 goto out; 3045 } 3046 extent_slot = path->slots[0]; 3047 } 3048 } else if (WARN_ON(ret == -ENOENT)) { 3049 abort_and_dump(trans, path, 3050 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", 3051 bytenr, parent, root_objectid, owner_objectid, 3052 owner_offset, path->slots[0]); 3053 goto out; 3054 } else { 3055 btrfs_abort_transaction(trans, ret); 3056 goto out; 3057 } 3058 3059 leaf = path->nodes[0]; 3060 item_size = btrfs_item_size(leaf, extent_slot); 3061 if (unlikely(item_size < sizeof(*ei))) { 3062 ret = -EINVAL; 3063 btrfs_print_v0_err(info); 3064 btrfs_abort_transaction(trans, ret); 3065 goto out; 3066 } 3067 ei = btrfs_item_ptr(leaf, extent_slot, 3068 struct btrfs_extent_item); 3069 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 3070 key.type == BTRFS_EXTENT_ITEM_KEY) { 3071 struct btrfs_tree_block_info *bi; 3072 3073 if (item_size < sizeof(*ei) + sizeof(*bi)) { 3074 abort_and_dump(trans, path, 3075 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", 3076 key.objectid, key.type, key.offset, 3077 path->slots[0], owner_objectid, item_size, 3078 sizeof(*ei) + sizeof(*bi)); 3079 ret = -EUCLEAN; 3080 goto out; 3081 } 3082 bi = (struct btrfs_tree_block_info *)(ei + 1); 3083 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 3084 } 3085 3086 refs = btrfs_extent_refs(leaf, ei); 3087 if (refs < refs_to_drop) { 3088 abort_and_dump(trans, path, 3089 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", 3090 refs_to_drop, refs, bytenr, path->slots[0]); 3091 ret = -EUCLEAN; 3092 goto out; 3093 } 3094 refs -= refs_to_drop; 3095 3096 if (refs > 0) { 3097 if (extent_op) 3098 __run_delayed_extent_op(extent_op, leaf, ei); 3099 /* 3100 * In the case of inline back ref, reference count will 3101 * be updated by remove_extent_backref 3102 */ 3103 if (iref) { 3104 if (!found_extent) { 3105 abort_and_dump(trans, path, 3106 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", 3107 path->slots[0]); 3108 ret = -EUCLEAN; 3109 goto out; 3110 } 3111 } else { 3112 btrfs_set_extent_refs(leaf, ei, refs); 3113 btrfs_mark_buffer_dirty(leaf); 3114 } 3115 if (found_extent) { 3116 ret = remove_extent_backref(trans, extent_root, path, 3117 iref, refs_to_drop, is_data); 3118 if (ret) { 3119 btrfs_abort_transaction(trans, ret); 3120 goto out; 3121 } 3122 } 3123 } else { 3124 /* In this branch refs == 1 */ 3125 if (found_extent) { 3126 if (is_data && refs_to_drop != 3127 extent_data_ref_count(path, iref)) { 3128 abort_and_dump(trans, path, 3129 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", 3130 extent_data_ref_count(path, iref), 3131 refs_to_drop, path->slots[0]); 3132 ret = -EUCLEAN; 3133 goto out; 3134 } 3135 if (iref) { 3136 if (path->slots[0] != extent_slot) { 3137 abort_and_dump(trans, path, 3138 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", 3139 key.objectid, key.type, 3140 key.offset, path->slots[0]); 3141 ret = -EUCLEAN; 3142 goto out; 3143 } 3144 } else { 3145 /* 3146 * No inline ref, we must be at SHARED_* item, 3147 * And it's single ref, it must be: 3148 * | extent_slot ||extent_slot + 1| 3149 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] 3150 */ 3151 if (path->slots[0] != extent_slot + 1) { 3152 abort_and_dump(trans, path, 3153 "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", 3154 path->slots[0]); 3155 ret = -EUCLEAN; 3156 goto out; 3157 } 3158 path->slots[0] = extent_slot; 3159 num_to_del = 2; 3160 } 3161 } 3162 3163 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 3164 num_to_del); 3165 if (ret) { 3166 btrfs_abort_transaction(trans, ret); 3167 goto out; 3168 } 3169 btrfs_release_path(path); 3170 3171 ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); 3172 } 3173 btrfs_release_path(path); 3174 3175 out: 3176 btrfs_free_path(path); 3177 return ret; 3178 } 3179 3180 /* 3181 * when we free an block, it is possible (and likely) that we free the last 3182 * delayed ref for that extent as well. This searches the delayed ref tree for 3183 * a given extent, and if there are no other delayed refs to be processed, it 3184 * removes it from the tree. 3185 */ 3186 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 3187 u64 bytenr) 3188 { 3189 struct btrfs_delayed_ref_head *head; 3190 struct btrfs_delayed_ref_root *delayed_refs; 3191 int ret = 0; 3192 3193 delayed_refs = &trans->transaction->delayed_refs; 3194 spin_lock(&delayed_refs->lock); 3195 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3196 if (!head) 3197 goto out_delayed_unlock; 3198 3199 spin_lock(&head->lock); 3200 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 3201 goto out; 3202 3203 if (cleanup_extent_op(head) != NULL) 3204 goto out; 3205 3206 /* 3207 * waiting for the lock here would deadlock. If someone else has it 3208 * locked they are already in the process of dropping it anyway 3209 */ 3210 if (!mutex_trylock(&head->mutex)) 3211 goto out; 3212 3213 btrfs_delete_ref_head(delayed_refs, head); 3214 head->processing = false; 3215 3216 spin_unlock(&head->lock); 3217 spin_unlock(&delayed_refs->lock); 3218 3219 BUG_ON(head->extent_op); 3220 if (head->must_insert_reserved) 3221 ret = 1; 3222 3223 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); 3224 mutex_unlock(&head->mutex); 3225 btrfs_put_delayed_ref_head(head); 3226 return ret; 3227 out: 3228 spin_unlock(&head->lock); 3229 3230 out_delayed_unlock: 3231 spin_unlock(&delayed_refs->lock); 3232 return 0; 3233 } 3234 3235 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3236 u64 root_id, 3237 struct extent_buffer *buf, 3238 u64 parent, int last_ref) 3239 { 3240 struct btrfs_fs_info *fs_info = trans->fs_info; 3241 struct btrfs_ref generic_ref = { 0 }; 3242 int ret; 3243 3244 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, 3245 buf->start, buf->len, parent); 3246 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 3247 root_id, 0, false); 3248 3249 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3250 btrfs_ref_tree_mod(fs_info, &generic_ref); 3251 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); 3252 BUG_ON(ret); /* -ENOMEM */ 3253 } 3254 3255 if (last_ref && btrfs_header_generation(buf) == trans->transid) { 3256 struct btrfs_block_group *cache; 3257 bool must_pin = false; 3258 3259 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3260 ret = check_ref_cleanup(trans, buf->start); 3261 if (!ret) { 3262 btrfs_redirty_list_add(trans->transaction, buf); 3263 goto out; 3264 } 3265 } 3266 3267 cache = btrfs_lookup_block_group(fs_info, buf->start); 3268 3269 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3270 pin_down_extent(trans, cache, buf->start, buf->len, 1); 3271 btrfs_put_block_group(cache); 3272 goto out; 3273 } 3274 3275 /* 3276 * If there are tree mod log users we may have recorded mod log 3277 * operations for this node. If we re-allocate this node we 3278 * could replay operations on this node that happened when it 3279 * existed in a completely different root. For example if it 3280 * was part of root A, then was reallocated to root B, and we 3281 * are doing a btrfs_old_search_slot(root b), we could replay 3282 * operations that happened when the block was part of root A, 3283 * giving us an inconsistent view of the btree. 3284 * 3285 * We are safe from races here because at this point no other 3286 * node or root points to this extent buffer, so if after this 3287 * check a new tree mod log user joins we will not have an 3288 * existing log of operations on this node that we have to 3289 * contend with. 3290 */ 3291 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) 3292 must_pin = true; 3293 3294 if (must_pin || btrfs_is_zoned(fs_info)) { 3295 btrfs_redirty_list_add(trans->transaction, buf); 3296 pin_down_extent(trans, cache, buf->start, buf->len, 1); 3297 btrfs_put_block_group(cache); 3298 goto out; 3299 } 3300 3301 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 3302 3303 btrfs_add_free_space(cache, buf->start, buf->len); 3304 btrfs_free_reserved_bytes(cache, buf->len, 0); 3305 btrfs_put_block_group(cache); 3306 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 3307 } 3308 out: 3309 if (last_ref) { 3310 /* 3311 * Deleting the buffer, clear the corrupt flag since it doesn't 3312 * matter anymore. 3313 */ 3314 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 3315 } 3316 } 3317 3318 /* Can return -ENOMEM */ 3319 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) 3320 { 3321 struct btrfs_fs_info *fs_info = trans->fs_info; 3322 int ret; 3323 3324 if (btrfs_is_testing(fs_info)) 3325 return 0; 3326 3327 /* 3328 * tree log blocks never actually go into the extent allocation 3329 * tree, just update pinning info and exit early. 3330 */ 3331 if ((ref->type == BTRFS_REF_METADATA && 3332 ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || 3333 (ref->type == BTRFS_REF_DATA && 3334 ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { 3335 /* unlocks the pinned mutex */ 3336 btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); 3337 ret = 0; 3338 } else if (ref->type == BTRFS_REF_METADATA) { 3339 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); 3340 } else { 3341 ret = btrfs_add_delayed_data_ref(trans, ref, 0); 3342 } 3343 3344 if (!((ref->type == BTRFS_REF_METADATA && 3345 ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || 3346 (ref->type == BTRFS_REF_DATA && 3347 ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) 3348 btrfs_ref_tree_mod(fs_info, ref); 3349 3350 return ret; 3351 } 3352 3353 enum btrfs_loop_type { 3354 LOOP_CACHING_NOWAIT, 3355 LOOP_CACHING_WAIT, 3356 LOOP_UNSET_SIZE_CLASS, 3357 LOOP_ALLOC_CHUNK, 3358 LOOP_WRONG_SIZE_CLASS, 3359 LOOP_NO_EMPTY_SIZE, 3360 }; 3361 3362 static inline void 3363 btrfs_lock_block_group(struct btrfs_block_group *cache, 3364 int delalloc) 3365 { 3366 if (delalloc) 3367 down_read(&cache->data_rwsem); 3368 } 3369 3370 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, 3371 int delalloc) 3372 { 3373 btrfs_get_block_group(cache); 3374 if (delalloc) 3375 down_read(&cache->data_rwsem); 3376 } 3377 3378 static struct btrfs_block_group *btrfs_lock_cluster( 3379 struct btrfs_block_group *block_group, 3380 struct btrfs_free_cluster *cluster, 3381 int delalloc) 3382 __acquires(&cluster->refill_lock) 3383 { 3384 struct btrfs_block_group *used_bg = NULL; 3385 3386 spin_lock(&cluster->refill_lock); 3387 while (1) { 3388 used_bg = cluster->block_group; 3389 if (!used_bg) 3390 return NULL; 3391 3392 if (used_bg == block_group) 3393 return used_bg; 3394 3395 btrfs_get_block_group(used_bg); 3396 3397 if (!delalloc) 3398 return used_bg; 3399 3400 if (down_read_trylock(&used_bg->data_rwsem)) 3401 return used_bg; 3402 3403 spin_unlock(&cluster->refill_lock); 3404 3405 /* We should only have one-level nested. */ 3406 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 3407 3408 spin_lock(&cluster->refill_lock); 3409 if (used_bg == cluster->block_group) 3410 return used_bg; 3411 3412 up_read(&used_bg->data_rwsem); 3413 btrfs_put_block_group(used_bg); 3414 } 3415 } 3416 3417 static inline void 3418 btrfs_release_block_group(struct btrfs_block_group *cache, 3419 int delalloc) 3420 { 3421 if (delalloc) 3422 up_read(&cache->data_rwsem); 3423 btrfs_put_block_group(cache); 3424 } 3425 3426 /* 3427 * Helper function for find_free_extent(). 3428 * 3429 * Return -ENOENT to inform caller that we need fallback to unclustered mode. 3430 * Return -EAGAIN to inform caller that we need to re-search this block group 3431 * Return >0 to inform caller that we find nothing 3432 * Return 0 means we have found a location and set ffe_ctl->found_offset. 3433 */ 3434 static int find_free_extent_clustered(struct btrfs_block_group *bg, 3435 struct find_free_extent_ctl *ffe_ctl, 3436 struct btrfs_block_group **cluster_bg_ret) 3437 { 3438 struct btrfs_block_group *cluster_bg; 3439 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3440 u64 aligned_cluster; 3441 u64 offset; 3442 int ret; 3443 3444 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); 3445 if (!cluster_bg) 3446 goto refill_cluster; 3447 if (cluster_bg != bg && (cluster_bg->ro || 3448 !block_group_bits(cluster_bg, ffe_ctl->flags))) 3449 goto release_cluster; 3450 3451 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, 3452 ffe_ctl->num_bytes, cluster_bg->start, 3453 &ffe_ctl->max_extent_size); 3454 if (offset) { 3455 /* We have a block, we're done */ 3456 spin_unlock(&last_ptr->refill_lock); 3457 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); 3458 *cluster_bg_ret = cluster_bg; 3459 ffe_ctl->found_offset = offset; 3460 return 0; 3461 } 3462 WARN_ON(last_ptr->block_group != cluster_bg); 3463 3464 release_cluster: 3465 /* 3466 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so 3467 * lets just skip it and let the allocator find whatever block it can 3468 * find. If we reach this point, we will have tried the cluster 3469 * allocator plenty of times and not have found anything, so we are 3470 * likely way too fragmented for the clustering stuff to find anything. 3471 * 3472 * However, if the cluster is taken from the current block group, 3473 * release the cluster first, so that we stand a better chance of 3474 * succeeding in the unclustered allocation. 3475 */ 3476 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { 3477 spin_unlock(&last_ptr->refill_lock); 3478 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 3479 return -ENOENT; 3480 } 3481 3482 /* This cluster didn't work out, free it and start over */ 3483 btrfs_return_cluster_to_free_space(NULL, last_ptr); 3484 3485 if (cluster_bg != bg) 3486 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 3487 3488 refill_cluster: 3489 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { 3490 spin_unlock(&last_ptr->refill_lock); 3491 return -ENOENT; 3492 } 3493 3494 aligned_cluster = max_t(u64, 3495 ffe_ctl->empty_cluster + ffe_ctl->empty_size, 3496 bg->full_stripe_len); 3497 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, 3498 ffe_ctl->num_bytes, aligned_cluster); 3499 if (ret == 0) { 3500 /* Now pull our allocation out of this cluster */ 3501 offset = btrfs_alloc_from_cluster(bg, last_ptr, 3502 ffe_ctl->num_bytes, ffe_ctl->search_start, 3503 &ffe_ctl->max_extent_size); 3504 if (offset) { 3505 /* We found one, proceed */ 3506 spin_unlock(&last_ptr->refill_lock); 3507 ffe_ctl->found_offset = offset; 3508 trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); 3509 return 0; 3510 } 3511 } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && 3512 !ffe_ctl->retry_clustered) { 3513 spin_unlock(&last_ptr->refill_lock); 3514 3515 ffe_ctl->retry_clustered = true; 3516 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 3517 ffe_ctl->empty_cluster + ffe_ctl->empty_size); 3518 return -EAGAIN; 3519 } 3520 /* 3521 * At this point we either didn't find a cluster or we weren't able to 3522 * allocate a block from our cluster. Free the cluster we've been 3523 * trying to use, and go to the next block group. 3524 */ 3525 btrfs_return_cluster_to_free_space(NULL, last_ptr); 3526 spin_unlock(&last_ptr->refill_lock); 3527 return 1; 3528 } 3529 3530 /* 3531 * Return >0 to inform caller that we find nothing 3532 * Return 0 when we found an free extent and set ffe_ctrl->found_offset 3533 * Return -EAGAIN to inform caller that we need to re-search this block group 3534 */ 3535 static int find_free_extent_unclustered(struct btrfs_block_group *bg, 3536 struct find_free_extent_ctl *ffe_ctl) 3537 { 3538 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3539 u64 offset; 3540 3541 /* 3542 * We are doing an unclustered allocation, set the fragmented flag so 3543 * we don't bother trying to setup a cluster again until we get more 3544 * space. 3545 */ 3546 if (unlikely(last_ptr)) { 3547 spin_lock(&last_ptr->lock); 3548 last_ptr->fragmented = 1; 3549 spin_unlock(&last_ptr->lock); 3550 } 3551 if (ffe_ctl->cached) { 3552 struct btrfs_free_space_ctl *free_space_ctl; 3553 3554 free_space_ctl = bg->free_space_ctl; 3555 spin_lock(&free_space_ctl->tree_lock); 3556 if (free_space_ctl->free_space < 3557 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 3558 ffe_ctl->empty_size) { 3559 ffe_ctl->total_free_space = max_t(u64, 3560 ffe_ctl->total_free_space, 3561 free_space_ctl->free_space); 3562 spin_unlock(&free_space_ctl->tree_lock); 3563 return 1; 3564 } 3565 spin_unlock(&free_space_ctl->tree_lock); 3566 } 3567 3568 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 3569 ffe_ctl->num_bytes, ffe_ctl->empty_size, 3570 &ffe_ctl->max_extent_size); 3571 3572 /* 3573 * If we didn't find a chunk, and we haven't failed on this block group 3574 * before, and this block group is in the middle of caching and we are 3575 * ok with waiting, then go ahead and wait for progress to be made, and 3576 * set @retry_unclustered to true. 3577 * 3578 * If @retry_unclustered is true then we've already waited on this 3579 * block group once and should move on to the next block group. 3580 */ 3581 if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && 3582 ffe_ctl->loop > LOOP_CACHING_NOWAIT) { 3583 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + 3584 ffe_ctl->empty_size); 3585 ffe_ctl->retry_unclustered = true; 3586 return -EAGAIN; 3587 } else if (!offset) { 3588 return 1; 3589 } 3590 ffe_ctl->found_offset = offset; 3591 return 0; 3592 } 3593 3594 static int do_allocation_clustered(struct btrfs_block_group *block_group, 3595 struct find_free_extent_ctl *ffe_ctl, 3596 struct btrfs_block_group **bg_ret) 3597 { 3598 int ret; 3599 3600 /* We want to try and use the cluster allocator, so lets look there */ 3601 if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { 3602 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); 3603 if (ret >= 0 || ret == -EAGAIN) 3604 return ret; 3605 /* ret == -ENOENT case falls through */ 3606 } 3607 3608 return find_free_extent_unclustered(block_group, ffe_ctl); 3609 } 3610 3611 /* 3612 * Tree-log block group locking 3613 * ============================ 3614 * 3615 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which 3616 * indicates the starting address of a block group, which is reserved only 3617 * for tree-log metadata. 3618 * 3619 * Lock nesting 3620 * ============ 3621 * 3622 * space_info::lock 3623 * block_group::lock 3624 * fs_info::treelog_bg_lock 3625 */ 3626 3627 /* 3628 * Simple allocator for sequential-only block group. It only allows sequential 3629 * allocation. No need to play with trees. This function also reserves the 3630 * bytes as in btrfs_add_reserved_bytes. 3631 */ 3632 static int do_allocation_zoned(struct btrfs_block_group *block_group, 3633 struct find_free_extent_ctl *ffe_ctl, 3634 struct btrfs_block_group **bg_ret) 3635 { 3636 struct btrfs_fs_info *fs_info = block_group->fs_info; 3637 struct btrfs_space_info *space_info = block_group->space_info; 3638 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3639 u64 start = block_group->start; 3640 u64 num_bytes = ffe_ctl->num_bytes; 3641 u64 avail; 3642 u64 bytenr = block_group->start; 3643 u64 log_bytenr; 3644 u64 data_reloc_bytenr; 3645 int ret = 0; 3646 bool skip = false; 3647 3648 ASSERT(btrfs_is_zoned(block_group->fs_info)); 3649 3650 /* 3651 * Do not allow non-tree-log blocks in the dedicated tree-log block 3652 * group, and vice versa. 3653 */ 3654 spin_lock(&fs_info->treelog_bg_lock); 3655 log_bytenr = fs_info->treelog_bg; 3656 if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || 3657 (!ffe_ctl->for_treelog && bytenr == log_bytenr))) 3658 skip = true; 3659 spin_unlock(&fs_info->treelog_bg_lock); 3660 if (skip) 3661 return 1; 3662 3663 /* 3664 * Do not allow non-relocation blocks in the dedicated relocation block 3665 * group, and vice versa. 3666 */ 3667 spin_lock(&fs_info->relocation_bg_lock); 3668 data_reloc_bytenr = fs_info->data_reloc_bg; 3669 if (data_reloc_bytenr && 3670 ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || 3671 (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) 3672 skip = true; 3673 spin_unlock(&fs_info->relocation_bg_lock); 3674 if (skip) 3675 return 1; 3676 3677 /* Check RO and no space case before trying to activate it */ 3678 spin_lock(&block_group->lock); 3679 if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { 3680 ret = 1; 3681 /* 3682 * May need to clear fs_info->{treelog,data_reloc}_bg. 3683 * Return the error after taking the locks. 3684 */ 3685 } 3686 spin_unlock(&block_group->lock); 3687 3688 if (!ret && !btrfs_zone_activate(block_group)) { 3689 ret = 1; 3690 /* 3691 * May need to clear fs_info->{treelog,data_reloc}_bg. 3692 * Return the error after taking the locks. 3693 */ 3694 } 3695 3696 spin_lock(&space_info->lock); 3697 spin_lock(&block_group->lock); 3698 spin_lock(&fs_info->treelog_bg_lock); 3699 spin_lock(&fs_info->relocation_bg_lock); 3700 3701 if (ret) 3702 goto out; 3703 3704 ASSERT(!ffe_ctl->for_treelog || 3705 block_group->start == fs_info->treelog_bg || 3706 fs_info->treelog_bg == 0); 3707 ASSERT(!ffe_ctl->for_data_reloc || 3708 block_group->start == fs_info->data_reloc_bg || 3709 fs_info->data_reloc_bg == 0); 3710 3711 if (block_group->ro || 3712 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { 3713 ret = 1; 3714 goto out; 3715 } 3716 3717 /* 3718 * Do not allow currently using block group to be tree-log dedicated 3719 * block group. 3720 */ 3721 if (ffe_ctl->for_treelog && !fs_info->treelog_bg && 3722 (block_group->used || block_group->reserved)) { 3723 ret = 1; 3724 goto out; 3725 } 3726 3727 /* 3728 * Do not allow currently used block group to be the data relocation 3729 * dedicated block group. 3730 */ 3731 if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && 3732 (block_group->used || block_group->reserved)) { 3733 ret = 1; 3734 goto out; 3735 } 3736 3737 WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); 3738 avail = block_group->zone_capacity - block_group->alloc_offset; 3739 if (avail < num_bytes) { 3740 if (ffe_ctl->max_extent_size < avail) { 3741 /* 3742 * With sequential allocator, free space is always 3743 * contiguous 3744 */ 3745 ffe_ctl->max_extent_size = avail; 3746 ffe_ctl->total_free_space = avail; 3747 } 3748 ret = 1; 3749 goto out; 3750 } 3751 3752 if (ffe_ctl->for_treelog && !fs_info->treelog_bg) 3753 fs_info->treelog_bg = block_group->start; 3754 3755 if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) 3756 fs_info->data_reloc_bg = block_group->start; 3757 3758 ffe_ctl->found_offset = start + block_group->alloc_offset; 3759 block_group->alloc_offset += num_bytes; 3760 spin_lock(&ctl->tree_lock); 3761 ctl->free_space -= num_bytes; 3762 spin_unlock(&ctl->tree_lock); 3763 3764 /* 3765 * We do not check if found_offset is aligned to stripesize. The 3766 * address is anyway rewritten when using zone append writing. 3767 */ 3768 3769 ffe_ctl->search_start = ffe_ctl->found_offset; 3770 3771 out: 3772 if (ret && ffe_ctl->for_treelog) 3773 fs_info->treelog_bg = 0; 3774 if (ret && ffe_ctl->for_data_reloc && 3775 fs_info->data_reloc_bg == block_group->start) { 3776 /* 3777 * Do not allow further allocations from this block group. 3778 * Compared to increasing the ->ro, setting the 3779 * ->zoned_data_reloc_ongoing flag still allows nocow 3780 * writers to come in. See btrfs_inc_nocow_writers(). 3781 * 3782 * We need to disable an allocation to avoid an allocation of 3783 * regular (non-relocation data) extent. With mix of relocation 3784 * extents and regular extents, we can dispatch WRITE commands 3785 * (for relocation extents) and ZONE APPEND commands (for 3786 * regular extents) at the same time to the same zone, which 3787 * easily break the write pointer. 3788 */ 3789 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); 3790 fs_info->data_reloc_bg = 0; 3791 } 3792 spin_unlock(&fs_info->relocation_bg_lock); 3793 spin_unlock(&fs_info->treelog_bg_lock); 3794 spin_unlock(&block_group->lock); 3795 spin_unlock(&space_info->lock); 3796 return ret; 3797 } 3798 3799 static int do_allocation(struct btrfs_block_group *block_group, 3800 struct find_free_extent_ctl *ffe_ctl, 3801 struct btrfs_block_group **bg_ret) 3802 { 3803 switch (ffe_ctl->policy) { 3804 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3805 return do_allocation_clustered(block_group, ffe_ctl, bg_ret); 3806 case BTRFS_EXTENT_ALLOC_ZONED: 3807 return do_allocation_zoned(block_group, ffe_ctl, bg_ret); 3808 default: 3809 BUG(); 3810 } 3811 } 3812 3813 static void release_block_group(struct btrfs_block_group *block_group, 3814 struct find_free_extent_ctl *ffe_ctl, 3815 int delalloc) 3816 { 3817 switch (ffe_ctl->policy) { 3818 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3819 ffe_ctl->retry_clustered = false; 3820 ffe_ctl->retry_unclustered = false; 3821 break; 3822 case BTRFS_EXTENT_ALLOC_ZONED: 3823 /* Nothing to do */ 3824 break; 3825 default: 3826 BUG(); 3827 } 3828 3829 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 3830 ffe_ctl->index); 3831 btrfs_release_block_group(block_group, delalloc); 3832 } 3833 3834 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, 3835 struct btrfs_key *ins) 3836 { 3837 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3838 3839 if (!ffe_ctl->use_cluster && last_ptr) { 3840 spin_lock(&last_ptr->lock); 3841 last_ptr->window_start = ins->objectid; 3842 spin_unlock(&last_ptr->lock); 3843 } 3844 } 3845 3846 static void found_extent(struct find_free_extent_ctl *ffe_ctl, 3847 struct btrfs_key *ins) 3848 { 3849 switch (ffe_ctl->policy) { 3850 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3851 found_extent_clustered(ffe_ctl, ins); 3852 break; 3853 case BTRFS_EXTENT_ALLOC_ZONED: 3854 /* Nothing to do */ 3855 break; 3856 default: 3857 BUG(); 3858 } 3859 } 3860 3861 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, 3862 struct find_free_extent_ctl *ffe_ctl) 3863 { 3864 /* If we can activate new zone, just allocate a chunk and use it */ 3865 if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) 3866 return 0; 3867 3868 /* 3869 * We already reached the max active zones. Try to finish one block 3870 * group to make a room for a new block group. This is only possible 3871 * for a data block group because btrfs_zone_finish() may need to wait 3872 * for a running transaction which can cause a deadlock for metadata 3873 * allocation. 3874 */ 3875 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { 3876 int ret = btrfs_zone_finish_one_bg(fs_info); 3877 3878 if (ret == 1) 3879 return 0; 3880 else if (ret < 0) 3881 return ret; 3882 } 3883 3884 /* 3885 * If we have enough free space left in an already active block group 3886 * and we can't activate any other zone now, do not allow allocating a 3887 * new chunk and let find_free_extent() retry with a smaller size. 3888 */ 3889 if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) 3890 return -ENOSPC; 3891 3892 /* 3893 * Even min_alloc_size is not left in any block groups. Since we cannot 3894 * activate a new block group, allocating it may not help. Let's tell a 3895 * caller to try again and hope it progress something by writing some 3896 * parts of the region. That is only possible for data block groups, 3897 * where a part of the region can be written. 3898 */ 3899 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) 3900 return -EAGAIN; 3901 3902 /* 3903 * We cannot activate a new block group and no enough space left in any 3904 * block groups. So, allocating a new block group may not help. But, 3905 * there is nothing to do anyway, so let's go with it. 3906 */ 3907 return 0; 3908 } 3909 3910 static int can_allocate_chunk(struct btrfs_fs_info *fs_info, 3911 struct find_free_extent_ctl *ffe_ctl) 3912 { 3913 switch (ffe_ctl->policy) { 3914 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3915 return 0; 3916 case BTRFS_EXTENT_ALLOC_ZONED: 3917 return can_allocate_chunk_zoned(fs_info, ffe_ctl); 3918 default: 3919 BUG(); 3920 } 3921 } 3922 3923 /* 3924 * Return >0 means caller needs to re-search for free extent 3925 * Return 0 means we have the needed free extent. 3926 * Return <0 means we failed to locate any free extent. 3927 */ 3928 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, 3929 struct btrfs_key *ins, 3930 struct find_free_extent_ctl *ffe_ctl, 3931 bool full_search) 3932 { 3933 struct btrfs_root *root = fs_info->chunk_root; 3934 int ret; 3935 3936 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && 3937 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) 3938 ffe_ctl->orig_have_caching_bg = true; 3939 3940 if (ins->objectid) { 3941 found_extent(ffe_ctl, ins); 3942 return 0; 3943 } 3944 3945 if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) 3946 return 1; 3947 3948 ffe_ctl->index++; 3949 if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) 3950 return 1; 3951 3952 /* 3953 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking 3954 * caching kthreads as we move along 3955 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 3956 * LOOP_UNSET_SIZE_CLASS, allow unset size class 3957 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 3958 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 3959 * again 3960 */ 3961 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { 3962 ffe_ctl->index = 0; 3963 /* 3964 * We want to skip the LOOP_CACHING_WAIT step if we don't have 3965 * any uncached bgs and we've already done a full search 3966 * through. 3967 */ 3968 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && 3969 (!ffe_ctl->orig_have_caching_bg && full_search)) 3970 ffe_ctl->loop++; 3971 ffe_ctl->loop++; 3972 3973 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 3974 struct btrfs_trans_handle *trans; 3975 int exist = 0; 3976 3977 /* Check if allocation policy allows to create a new chunk */ 3978 ret = can_allocate_chunk(fs_info, ffe_ctl); 3979 if (ret) 3980 return ret; 3981 3982 trans = current->journal_info; 3983 if (trans) 3984 exist = 1; 3985 else 3986 trans = btrfs_join_transaction(root); 3987 3988 if (IS_ERR(trans)) { 3989 ret = PTR_ERR(trans); 3990 return ret; 3991 } 3992 3993 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, 3994 CHUNK_ALLOC_FORCE_FOR_EXTENT); 3995 3996 /* Do not bail out on ENOSPC since we can do more. */ 3997 if (ret == -ENOSPC) { 3998 ret = 0; 3999 ffe_ctl->loop++; 4000 } 4001 else if (ret < 0) 4002 btrfs_abort_transaction(trans, ret); 4003 else 4004 ret = 0; 4005 if (!exist) 4006 btrfs_end_transaction(trans); 4007 if (ret) 4008 return ret; 4009 } 4010 4011 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 4012 if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) 4013 return -ENOSPC; 4014 4015 /* 4016 * Don't loop again if we already have no empty_size and 4017 * no empty_cluster. 4018 */ 4019 if (ffe_ctl->empty_size == 0 && 4020 ffe_ctl->empty_cluster == 0) 4021 return -ENOSPC; 4022 ffe_ctl->empty_size = 0; 4023 ffe_ctl->empty_cluster = 0; 4024 } 4025 return 1; 4026 } 4027 return -ENOSPC; 4028 } 4029 4030 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, 4031 struct btrfs_block_group *bg) 4032 { 4033 if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) 4034 return true; 4035 if (!btrfs_block_group_should_use_size_class(bg)) 4036 return true; 4037 if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) 4038 return true; 4039 if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && 4040 bg->size_class == BTRFS_BG_SZ_NONE) 4041 return true; 4042 return ffe_ctl->size_class == bg->size_class; 4043 } 4044 4045 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, 4046 struct find_free_extent_ctl *ffe_ctl, 4047 struct btrfs_space_info *space_info, 4048 struct btrfs_key *ins) 4049 { 4050 /* 4051 * If our free space is heavily fragmented we may not be able to make 4052 * big contiguous allocations, so instead of doing the expensive search 4053 * for free space, simply return ENOSPC with our max_extent_size so we 4054 * can go ahead and search for a more manageable chunk. 4055 * 4056 * If our max_extent_size is large enough for our allocation simply 4057 * disable clustering since we will likely not be able to find enough 4058 * space to create a cluster and induce latency trying. 4059 */ 4060 if (space_info->max_extent_size) { 4061 spin_lock(&space_info->lock); 4062 if (space_info->max_extent_size && 4063 ffe_ctl->num_bytes > space_info->max_extent_size) { 4064 ins->offset = space_info->max_extent_size; 4065 spin_unlock(&space_info->lock); 4066 return -ENOSPC; 4067 } else if (space_info->max_extent_size) { 4068 ffe_ctl->use_cluster = false; 4069 } 4070 spin_unlock(&space_info->lock); 4071 } 4072 4073 ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, 4074 &ffe_ctl->empty_cluster); 4075 if (ffe_ctl->last_ptr) { 4076 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 4077 4078 spin_lock(&last_ptr->lock); 4079 if (last_ptr->block_group) 4080 ffe_ctl->hint_byte = last_ptr->window_start; 4081 if (last_ptr->fragmented) { 4082 /* 4083 * We still set window_start so we can keep track of the 4084 * last place we found an allocation to try and save 4085 * some time. 4086 */ 4087 ffe_ctl->hint_byte = last_ptr->window_start; 4088 ffe_ctl->use_cluster = false; 4089 } 4090 spin_unlock(&last_ptr->lock); 4091 } 4092 4093 return 0; 4094 } 4095 4096 static int prepare_allocation(struct btrfs_fs_info *fs_info, 4097 struct find_free_extent_ctl *ffe_ctl, 4098 struct btrfs_space_info *space_info, 4099 struct btrfs_key *ins) 4100 { 4101 switch (ffe_ctl->policy) { 4102 case BTRFS_EXTENT_ALLOC_CLUSTERED: 4103 return prepare_allocation_clustered(fs_info, ffe_ctl, 4104 space_info, ins); 4105 case BTRFS_EXTENT_ALLOC_ZONED: 4106 if (ffe_ctl->for_treelog) { 4107 spin_lock(&fs_info->treelog_bg_lock); 4108 if (fs_info->treelog_bg) 4109 ffe_ctl->hint_byte = fs_info->treelog_bg; 4110 spin_unlock(&fs_info->treelog_bg_lock); 4111 } 4112 if (ffe_ctl->for_data_reloc) { 4113 spin_lock(&fs_info->relocation_bg_lock); 4114 if (fs_info->data_reloc_bg) 4115 ffe_ctl->hint_byte = fs_info->data_reloc_bg; 4116 spin_unlock(&fs_info->relocation_bg_lock); 4117 } 4118 return 0; 4119 default: 4120 BUG(); 4121 } 4122 } 4123 4124 /* 4125 * walks the btree of allocated extents and find a hole of a given size. 4126 * The key ins is changed to record the hole: 4127 * ins->objectid == start position 4128 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4129 * ins->offset == the size of the hole. 4130 * Any available blocks before search_start are skipped. 4131 * 4132 * If there is no suitable free space, we will record the max size of 4133 * the free space extent currently. 4134 * 4135 * The overall logic and call chain: 4136 * 4137 * find_free_extent() 4138 * |- Iterate through all block groups 4139 * | |- Get a valid block group 4140 * | |- Try to do clustered allocation in that block group 4141 * | |- Try to do unclustered allocation in that block group 4142 * | |- Check if the result is valid 4143 * | | |- If valid, then exit 4144 * | |- Jump to next block group 4145 * | 4146 * |- Push harder to find free extents 4147 * |- If not found, re-iterate all block groups 4148 */ 4149 static noinline int find_free_extent(struct btrfs_root *root, 4150 struct btrfs_key *ins, 4151 struct find_free_extent_ctl *ffe_ctl) 4152 { 4153 struct btrfs_fs_info *fs_info = root->fs_info; 4154 int ret = 0; 4155 int cache_block_group_error = 0; 4156 struct btrfs_block_group *block_group = NULL; 4157 struct btrfs_space_info *space_info; 4158 bool full_search = false; 4159 4160 WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); 4161 4162 ffe_ctl->search_start = 0; 4163 /* For clustered allocation */ 4164 ffe_ctl->empty_cluster = 0; 4165 ffe_ctl->last_ptr = NULL; 4166 ffe_ctl->use_cluster = true; 4167 ffe_ctl->have_caching_bg = false; 4168 ffe_ctl->orig_have_caching_bg = false; 4169 ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); 4170 ffe_ctl->loop = 0; 4171 /* For clustered allocation */ 4172 ffe_ctl->retry_clustered = false; 4173 ffe_ctl->retry_unclustered = false; 4174 ffe_ctl->cached = 0; 4175 ffe_ctl->max_extent_size = 0; 4176 ffe_ctl->total_free_space = 0; 4177 ffe_ctl->found_offset = 0; 4178 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; 4179 ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); 4180 4181 if (btrfs_is_zoned(fs_info)) 4182 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; 4183 4184 ins->type = BTRFS_EXTENT_ITEM_KEY; 4185 ins->objectid = 0; 4186 ins->offset = 0; 4187 4188 trace_find_free_extent(root, ffe_ctl); 4189 4190 space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); 4191 if (!space_info) { 4192 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); 4193 return -ENOSPC; 4194 } 4195 4196 ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); 4197 if (ret < 0) 4198 return ret; 4199 4200 ffe_ctl->search_start = max(ffe_ctl->search_start, 4201 first_logical_byte(fs_info)); 4202 ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); 4203 if (ffe_ctl->search_start == ffe_ctl->hint_byte) { 4204 block_group = btrfs_lookup_block_group(fs_info, 4205 ffe_ctl->search_start); 4206 /* 4207 * we don't want to use the block group if it doesn't match our 4208 * allocation bits, or if its not cached. 4209 * 4210 * However if we are re-searching with an ideal block group 4211 * picked out then we don't care that the block group is cached. 4212 */ 4213 if (block_group && block_group_bits(block_group, ffe_ctl->flags) && 4214 block_group->cached != BTRFS_CACHE_NO) { 4215 down_read(&space_info->groups_sem); 4216 if (list_empty(&block_group->list) || 4217 block_group->ro) { 4218 /* 4219 * someone is removing this block group, 4220 * we can't jump into the have_block_group 4221 * target because our list pointers are not 4222 * valid 4223 */ 4224 btrfs_put_block_group(block_group); 4225 up_read(&space_info->groups_sem); 4226 } else { 4227 ffe_ctl->index = btrfs_bg_flags_to_raid_index( 4228 block_group->flags); 4229 btrfs_lock_block_group(block_group, 4230 ffe_ctl->delalloc); 4231 ffe_ctl->hinted = true; 4232 goto have_block_group; 4233 } 4234 } else if (block_group) { 4235 btrfs_put_block_group(block_group); 4236 } 4237 } 4238 search: 4239 trace_find_free_extent_search_loop(root, ffe_ctl); 4240 ffe_ctl->have_caching_bg = false; 4241 if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || 4242 ffe_ctl->index == 0) 4243 full_search = true; 4244 down_read(&space_info->groups_sem); 4245 list_for_each_entry(block_group, 4246 &space_info->block_groups[ffe_ctl->index], list) { 4247 struct btrfs_block_group *bg_ret; 4248 4249 ffe_ctl->hinted = false; 4250 /* If the block group is read-only, we can skip it entirely. */ 4251 if (unlikely(block_group->ro)) { 4252 if (ffe_ctl->for_treelog) 4253 btrfs_clear_treelog_bg(block_group); 4254 if (ffe_ctl->for_data_reloc) 4255 btrfs_clear_data_reloc_bg(block_group); 4256 continue; 4257 } 4258 4259 btrfs_grab_block_group(block_group, ffe_ctl->delalloc); 4260 ffe_ctl->search_start = block_group->start; 4261 4262 /* 4263 * this can happen if we end up cycling through all the 4264 * raid types, but we want to make sure we only allocate 4265 * for the proper type. 4266 */ 4267 if (!block_group_bits(block_group, ffe_ctl->flags)) { 4268 u64 extra = BTRFS_BLOCK_GROUP_DUP | 4269 BTRFS_BLOCK_GROUP_RAID1_MASK | 4270 BTRFS_BLOCK_GROUP_RAID56_MASK | 4271 BTRFS_BLOCK_GROUP_RAID10; 4272 4273 /* 4274 * if they asked for extra copies and this block group 4275 * doesn't provide them, bail. This does allow us to 4276 * fill raid0 from raid1. 4277 */ 4278 if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) 4279 goto loop; 4280 4281 /* 4282 * This block group has different flags than we want. 4283 * It's possible that we have MIXED_GROUP flag but no 4284 * block group is mixed. Just skip such block group. 4285 */ 4286 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4287 continue; 4288 } 4289 4290 have_block_group: 4291 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); 4292 ffe_ctl->cached = btrfs_block_group_done(block_group); 4293 if (unlikely(!ffe_ctl->cached)) { 4294 ffe_ctl->have_caching_bg = true; 4295 ret = btrfs_cache_block_group(block_group, false); 4296 4297 /* 4298 * If we get ENOMEM here or something else we want to 4299 * try other block groups, because it may not be fatal. 4300 * However if we can't find anything else we need to 4301 * save our return here so that we return the actual 4302 * error that caused problems, not ENOSPC. 4303 */ 4304 if (ret < 0) { 4305 if (!cache_block_group_error) 4306 cache_block_group_error = ret; 4307 ret = 0; 4308 goto loop; 4309 } 4310 ret = 0; 4311 } 4312 4313 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { 4314 if (!cache_block_group_error) 4315 cache_block_group_error = -EIO; 4316 goto loop; 4317 } 4318 4319 if (!find_free_extent_check_size_class(ffe_ctl, block_group)) 4320 goto loop; 4321 4322 bg_ret = NULL; 4323 ret = do_allocation(block_group, ffe_ctl, &bg_ret); 4324 if (ret == 0) { 4325 if (bg_ret && bg_ret != block_group) { 4326 btrfs_release_block_group(block_group, 4327 ffe_ctl->delalloc); 4328 block_group = bg_ret; 4329 } 4330 } else if (ret == -EAGAIN) { 4331 goto have_block_group; 4332 } else if (ret > 0) { 4333 goto loop; 4334 } 4335 4336 /* Checks */ 4337 ffe_ctl->search_start = round_up(ffe_ctl->found_offset, 4338 fs_info->stripesize); 4339 4340 /* move on to the next group */ 4341 if (ffe_ctl->search_start + ffe_ctl->num_bytes > 4342 block_group->start + block_group->length) { 4343 btrfs_add_free_space_unused(block_group, 4344 ffe_ctl->found_offset, 4345 ffe_ctl->num_bytes); 4346 goto loop; 4347 } 4348 4349 if (ffe_ctl->found_offset < ffe_ctl->search_start) 4350 btrfs_add_free_space_unused(block_group, 4351 ffe_ctl->found_offset, 4352 ffe_ctl->search_start - ffe_ctl->found_offset); 4353 4354 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, 4355 ffe_ctl->num_bytes, 4356 ffe_ctl->delalloc, 4357 ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); 4358 if (ret == -EAGAIN) { 4359 btrfs_add_free_space_unused(block_group, 4360 ffe_ctl->found_offset, 4361 ffe_ctl->num_bytes); 4362 goto loop; 4363 } 4364 btrfs_inc_block_group_reservations(block_group); 4365 4366 /* we are all good, lets return */ 4367 ins->objectid = ffe_ctl->search_start; 4368 ins->offset = ffe_ctl->num_bytes; 4369 4370 trace_btrfs_reserve_extent(block_group, ffe_ctl); 4371 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4372 break; 4373 loop: 4374 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); 4375 cond_resched(); 4376 } 4377 up_read(&space_info->groups_sem); 4378 4379 ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); 4380 if (ret > 0) 4381 goto search; 4382 4383 if (ret == -ENOSPC && !cache_block_group_error) { 4384 /* 4385 * Use ffe_ctl->total_free_space as fallback if we can't find 4386 * any contiguous hole. 4387 */ 4388 if (!ffe_ctl->max_extent_size) 4389 ffe_ctl->max_extent_size = ffe_ctl->total_free_space; 4390 spin_lock(&space_info->lock); 4391 space_info->max_extent_size = ffe_ctl->max_extent_size; 4392 spin_unlock(&space_info->lock); 4393 ins->offset = ffe_ctl->max_extent_size; 4394 } else if (ret == -ENOSPC) { 4395 ret = cache_block_group_error; 4396 } 4397 return ret; 4398 } 4399 4400 /* 4401 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 4402 * hole that is at least as big as @num_bytes. 4403 * 4404 * @root - The root that will contain this extent 4405 * 4406 * @ram_bytes - The amount of space in ram that @num_bytes take. This 4407 * is used for accounting purposes. This value differs 4408 * from @num_bytes only in the case of compressed extents. 4409 * 4410 * @num_bytes - Number of bytes to allocate on-disk. 4411 * 4412 * @min_alloc_size - Indicates the minimum amount of space that the 4413 * allocator should try to satisfy. In some cases 4414 * @num_bytes may be larger than what is required and if 4415 * the filesystem is fragmented then allocation fails. 4416 * However, the presence of @min_alloc_size gives a 4417 * chance to try and satisfy the smaller allocation. 4418 * 4419 * @empty_size - A hint that you plan on doing more COW. This is the 4420 * size in bytes the allocator should try to find free 4421 * next to the block it returns. This is just a hint and 4422 * may be ignored by the allocator. 4423 * 4424 * @hint_byte - Hint to the allocator to start searching above the byte 4425 * address passed. It might be ignored. 4426 * 4427 * @ins - This key is modified to record the found hole. It will 4428 * have the following values: 4429 * ins->objectid == start position 4430 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4431 * ins->offset == the size of the hole. 4432 * 4433 * @is_data - Boolean flag indicating whether an extent is 4434 * allocated for data (true) or metadata (false) 4435 * 4436 * @delalloc - Boolean flag indicating whether this allocation is for 4437 * delalloc or not. If 'true' data_rwsem of block groups 4438 * is going to be acquired. 4439 * 4440 * 4441 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In 4442 * case -ENOSPC is returned then @ins->offset will contain the size of the 4443 * largest available hole the allocator managed to find. 4444 */ 4445 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 4446 u64 num_bytes, u64 min_alloc_size, 4447 u64 empty_size, u64 hint_byte, 4448 struct btrfs_key *ins, int is_data, int delalloc) 4449 { 4450 struct btrfs_fs_info *fs_info = root->fs_info; 4451 struct find_free_extent_ctl ffe_ctl = {}; 4452 bool final_tried = num_bytes == min_alloc_size; 4453 u64 flags; 4454 int ret; 4455 bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 4456 bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); 4457 4458 flags = get_alloc_profile_by_root(root, is_data); 4459 again: 4460 WARN_ON(num_bytes < fs_info->sectorsize); 4461 4462 ffe_ctl.ram_bytes = ram_bytes; 4463 ffe_ctl.num_bytes = num_bytes; 4464 ffe_ctl.min_alloc_size = min_alloc_size; 4465 ffe_ctl.empty_size = empty_size; 4466 ffe_ctl.flags = flags; 4467 ffe_ctl.delalloc = delalloc; 4468 ffe_ctl.hint_byte = hint_byte; 4469 ffe_ctl.for_treelog = for_treelog; 4470 ffe_ctl.for_data_reloc = for_data_reloc; 4471 4472 ret = find_free_extent(root, ins, &ffe_ctl); 4473 if (!ret && !is_data) { 4474 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 4475 } else if (ret == -ENOSPC) { 4476 if (!final_tried && ins->offset) { 4477 num_bytes = min(num_bytes >> 1, ins->offset); 4478 num_bytes = round_down(num_bytes, 4479 fs_info->sectorsize); 4480 num_bytes = max(num_bytes, min_alloc_size); 4481 ram_bytes = num_bytes; 4482 if (num_bytes == min_alloc_size) 4483 final_tried = true; 4484 goto again; 4485 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4486 struct btrfs_space_info *sinfo; 4487 4488 sinfo = btrfs_find_space_info(fs_info, flags); 4489 btrfs_err(fs_info, 4490 "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", 4491 flags, num_bytes, for_treelog, for_data_reloc); 4492 if (sinfo) 4493 btrfs_dump_space_info(fs_info, sinfo, 4494 num_bytes, 1); 4495 } 4496 } 4497 4498 return ret; 4499 } 4500 4501 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 4502 u64 start, u64 len, int delalloc) 4503 { 4504 struct btrfs_block_group *cache; 4505 4506 cache = btrfs_lookup_block_group(fs_info, start); 4507 if (!cache) { 4508 btrfs_err(fs_info, "Unable to find block group for %llu", 4509 start); 4510 return -ENOSPC; 4511 } 4512 4513 btrfs_add_free_space(cache, start, len); 4514 btrfs_free_reserved_bytes(cache, len, delalloc); 4515 trace_btrfs_reserved_extent_free(fs_info, start, len); 4516 4517 btrfs_put_block_group(cache); 4518 return 0; 4519 } 4520 4521 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, 4522 u64 len) 4523 { 4524 struct btrfs_block_group *cache; 4525 int ret = 0; 4526 4527 cache = btrfs_lookup_block_group(trans->fs_info, start); 4528 if (!cache) { 4529 btrfs_err(trans->fs_info, "unable to find block group for %llu", 4530 start); 4531 return -ENOSPC; 4532 } 4533 4534 ret = pin_down_extent(trans, cache, start, len, 1); 4535 btrfs_put_block_group(cache); 4536 return ret; 4537 } 4538 4539 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, 4540 u64 num_bytes) 4541 { 4542 struct btrfs_fs_info *fs_info = trans->fs_info; 4543 int ret; 4544 4545 ret = remove_from_free_space_tree(trans, bytenr, num_bytes); 4546 if (ret) 4547 return ret; 4548 4549 ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); 4550 if (ret) { 4551 ASSERT(!ret); 4552 btrfs_err(fs_info, "update block group failed for %llu %llu", 4553 bytenr, num_bytes); 4554 return ret; 4555 } 4556 4557 trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); 4558 return 0; 4559 } 4560 4561 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4562 u64 parent, u64 root_objectid, 4563 u64 flags, u64 owner, u64 offset, 4564 struct btrfs_key *ins, int ref_mod) 4565 { 4566 struct btrfs_fs_info *fs_info = trans->fs_info; 4567 struct btrfs_root *extent_root; 4568 int ret; 4569 struct btrfs_extent_item *extent_item; 4570 struct btrfs_extent_inline_ref *iref; 4571 struct btrfs_path *path; 4572 struct extent_buffer *leaf; 4573 int type; 4574 u32 size; 4575 4576 if (parent > 0) 4577 type = BTRFS_SHARED_DATA_REF_KEY; 4578 else 4579 type = BTRFS_EXTENT_DATA_REF_KEY; 4580 4581 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 4582 4583 path = btrfs_alloc_path(); 4584 if (!path) 4585 return -ENOMEM; 4586 4587 extent_root = btrfs_extent_root(fs_info, ins->objectid); 4588 ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); 4589 if (ret) { 4590 btrfs_free_path(path); 4591 return ret; 4592 } 4593 4594 leaf = path->nodes[0]; 4595 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4596 struct btrfs_extent_item); 4597 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 4598 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4599 btrfs_set_extent_flags(leaf, extent_item, 4600 flags | BTRFS_EXTENT_FLAG_DATA); 4601 4602 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4603 btrfs_set_extent_inline_ref_type(leaf, iref, type); 4604 if (parent > 0) { 4605 struct btrfs_shared_data_ref *ref; 4606 ref = (struct btrfs_shared_data_ref *)(iref + 1); 4607 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4608 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 4609 } else { 4610 struct btrfs_extent_data_ref *ref; 4611 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 4612 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 4613 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 4614 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 4615 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 4616 } 4617 4618 btrfs_mark_buffer_dirty(path->nodes[0]); 4619 btrfs_free_path(path); 4620 4621 return alloc_reserved_extent(trans, ins->objectid, ins->offset); 4622 } 4623 4624 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 4625 struct btrfs_delayed_ref_node *node, 4626 struct btrfs_delayed_extent_op *extent_op) 4627 { 4628 struct btrfs_fs_info *fs_info = trans->fs_info; 4629 struct btrfs_root *extent_root; 4630 int ret; 4631 struct btrfs_extent_item *extent_item; 4632 struct btrfs_key extent_key; 4633 struct btrfs_tree_block_info *block_info; 4634 struct btrfs_extent_inline_ref *iref; 4635 struct btrfs_path *path; 4636 struct extent_buffer *leaf; 4637 struct btrfs_delayed_tree_ref *ref; 4638 u32 size = sizeof(*extent_item) + sizeof(*iref); 4639 u64 flags = extent_op->flags_to_set; 4640 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 4641 4642 ref = btrfs_delayed_node_to_tree_ref(node); 4643 4644 extent_key.objectid = node->bytenr; 4645 if (skinny_metadata) { 4646 extent_key.offset = ref->level; 4647 extent_key.type = BTRFS_METADATA_ITEM_KEY; 4648 } else { 4649 extent_key.offset = node->num_bytes; 4650 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 4651 size += sizeof(*block_info); 4652 } 4653 4654 path = btrfs_alloc_path(); 4655 if (!path) 4656 return -ENOMEM; 4657 4658 extent_root = btrfs_extent_root(fs_info, extent_key.objectid); 4659 ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, 4660 size); 4661 if (ret) { 4662 btrfs_free_path(path); 4663 return ret; 4664 } 4665 4666 leaf = path->nodes[0]; 4667 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4668 struct btrfs_extent_item); 4669 btrfs_set_extent_refs(leaf, extent_item, 1); 4670 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4671 btrfs_set_extent_flags(leaf, extent_item, 4672 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 4673 4674 if (skinny_metadata) { 4675 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4676 } else { 4677 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 4678 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 4679 btrfs_set_tree_block_level(leaf, block_info, ref->level); 4680 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 4681 } 4682 4683 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 4684 btrfs_set_extent_inline_ref_type(leaf, iref, 4685 BTRFS_SHARED_BLOCK_REF_KEY); 4686 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); 4687 } else { 4688 btrfs_set_extent_inline_ref_type(leaf, iref, 4689 BTRFS_TREE_BLOCK_REF_KEY); 4690 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); 4691 } 4692 4693 btrfs_mark_buffer_dirty(leaf); 4694 btrfs_free_path(path); 4695 4696 return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); 4697 } 4698 4699 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4700 struct btrfs_root *root, u64 owner, 4701 u64 offset, u64 ram_bytes, 4702 struct btrfs_key *ins) 4703 { 4704 struct btrfs_ref generic_ref = { 0 }; 4705 4706 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 4707 4708 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 4709 ins->objectid, ins->offset, 0); 4710 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, 4711 offset, 0, false); 4712 btrfs_ref_tree_mod(root->fs_info, &generic_ref); 4713 4714 return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); 4715 } 4716 4717 /* 4718 * this is used by the tree logging recovery code. It records that 4719 * an extent has been allocated and makes sure to clear the free 4720 * space cache bits as well 4721 */ 4722 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 4723 u64 root_objectid, u64 owner, u64 offset, 4724 struct btrfs_key *ins) 4725 { 4726 struct btrfs_fs_info *fs_info = trans->fs_info; 4727 int ret; 4728 struct btrfs_block_group *block_group; 4729 struct btrfs_space_info *space_info; 4730 4731 /* 4732 * Mixed block groups will exclude before processing the log so we only 4733 * need to do the exclude dance if this fs isn't mixed. 4734 */ 4735 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 4736 ret = __exclude_logged_extent(fs_info, ins->objectid, 4737 ins->offset); 4738 if (ret) 4739 return ret; 4740 } 4741 4742 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 4743 if (!block_group) 4744 return -EINVAL; 4745 4746 space_info = block_group->space_info; 4747 spin_lock(&space_info->lock); 4748 spin_lock(&block_group->lock); 4749 space_info->bytes_reserved += ins->offset; 4750 block_group->reserved += ins->offset; 4751 spin_unlock(&block_group->lock); 4752 spin_unlock(&space_info->lock); 4753 4754 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 4755 offset, ins, 1); 4756 if (ret) 4757 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); 4758 btrfs_put_block_group(block_group); 4759 return ret; 4760 } 4761 4762 static struct extent_buffer * 4763 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4764 u64 bytenr, int level, u64 owner, 4765 enum btrfs_lock_nesting nest) 4766 { 4767 struct btrfs_fs_info *fs_info = root->fs_info; 4768 struct extent_buffer *buf; 4769 u64 lockdep_owner = owner; 4770 4771 buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); 4772 if (IS_ERR(buf)) 4773 return buf; 4774 4775 /* 4776 * Extra safety check in case the extent tree is corrupted and extent 4777 * allocator chooses to use a tree block which is already used and 4778 * locked. 4779 */ 4780 if (buf->lock_owner == current->pid) { 4781 btrfs_err_rl(fs_info, 4782 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 4783 buf->start, btrfs_header_owner(buf), current->pid); 4784 free_extent_buffer(buf); 4785 return ERR_PTR(-EUCLEAN); 4786 } 4787 4788 /* 4789 * The reloc trees are just snapshots, so we need them to appear to be 4790 * just like any other fs tree WRT lockdep. 4791 * 4792 * The exception however is in replace_path() in relocation, where we 4793 * hold the lock on the original fs root and then search for the reloc 4794 * root. At that point we need to make sure any reloc root buffers are 4795 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make 4796 * lockdep happy. 4797 */ 4798 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && 4799 !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) 4800 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 4801 4802 /* btrfs_clear_buffer_dirty() accesses generation field. */ 4803 btrfs_set_header_generation(buf, trans->transid); 4804 4805 /* 4806 * This needs to stay, because we could allocate a freed block from an 4807 * old tree into a new tree, so we need to make sure this new block is 4808 * set to the appropriate level and owner. 4809 */ 4810 btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); 4811 4812 __btrfs_tree_lock(buf, nest); 4813 btrfs_clear_buffer_dirty(trans, buf); 4814 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 4815 clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); 4816 4817 set_extent_buffer_uptodate(buf); 4818 4819 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 4820 btrfs_set_header_level(buf, level); 4821 btrfs_set_header_bytenr(buf, buf->start); 4822 btrfs_set_header_generation(buf, trans->transid); 4823 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 4824 btrfs_set_header_owner(buf, owner); 4825 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 4826 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 4827 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 4828 buf->log_index = root->log_transid % 2; 4829 /* 4830 * we allow two log transactions at a time, use different 4831 * EXTENT bit to differentiate dirty pages. 4832 */ 4833 if (buf->log_index == 0) 4834 set_extent_bit(&root->dirty_log_pages, buf->start, 4835 buf->start + buf->len - 1, 4836 EXTENT_DIRTY, NULL); 4837 else 4838 set_extent_bit(&root->dirty_log_pages, buf->start, 4839 buf->start + buf->len - 1, 4840 EXTENT_NEW, NULL); 4841 } else { 4842 buf->log_index = -1; 4843 set_extent_bit(&trans->transaction->dirty_pages, buf->start, 4844 buf->start + buf->len - 1, EXTENT_DIRTY, NULL); 4845 } 4846 /* this returns a buffer locked for blocking */ 4847 return buf; 4848 } 4849 4850 /* 4851 * finds a free extent and does all the dirty work required for allocation 4852 * returns the tree buffer or an ERR_PTR on error. 4853 */ 4854 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 4855 struct btrfs_root *root, 4856 u64 parent, u64 root_objectid, 4857 const struct btrfs_disk_key *key, 4858 int level, u64 hint, 4859 u64 empty_size, 4860 enum btrfs_lock_nesting nest) 4861 { 4862 struct btrfs_fs_info *fs_info = root->fs_info; 4863 struct btrfs_key ins; 4864 struct btrfs_block_rsv *block_rsv; 4865 struct extent_buffer *buf; 4866 struct btrfs_delayed_extent_op *extent_op; 4867 struct btrfs_ref generic_ref = { 0 }; 4868 u64 flags = 0; 4869 int ret; 4870 u32 blocksize = fs_info->nodesize; 4871 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 4872 4873 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4874 if (btrfs_is_testing(fs_info)) { 4875 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 4876 level, root_objectid, nest); 4877 if (!IS_ERR(buf)) 4878 root->alloc_bytenr += blocksize; 4879 return buf; 4880 } 4881 #endif 4882 4883 block_rsv = btrfs_use_block_rsv(trans, root, blocksize); 4884 if (IS_ERR(block_rsv)) 4885 return ERR_CAST(block_rsv); 4886 4887 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 4888 empty_size, hint, &ins, 0, 0); 4889 if (ret) 4890 goto out_unuse; 4891 4892 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 4893 root_objectid, nest); 4894 if (IS_ERR(buf)) { 4895 ret = PTR_ERR(buf); 4896 goto out_free_reserved; 4897 } 4898 4899 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4900 if (parent == 0) 4901 parent = ins.objectid; 4902 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 4903 } else 4904 BUG_ON(parent > 0); 4905 4906 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4907 extent_op = btrfs_alloc_delayed_extent_op(); 4908 if (!extent_op) { 4909 ret = -ENOMEM; 4910 goto out_free_buf; 4911 } 4912 if (key) 4913 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 4914 else 4915 memset(&extent_op->key, 0, sizeof(extent_op->key)); 4916 extent_op->flags_to_set = flags; 4917 extent_op->update_key = skinny_metadata ? false : true; 4918 extent_op->update_flags = true; 4919 extent_op->level = level; 4920 4921 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 4922 ins.objectid, ins.offset, parent); 4923 btrfs_init_tree_ref(&generic_ref, level, root_objectid, 4924 root->root_key.objectid, false); 4925 btrfs_ref_tree_mod(fs_info, &generic_ref); 4926 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); 4927 if (ret) 4928 goto out_free_delayed; 4929 } 4930 return buf; 4931 4932 out_free_delayed: 4933 btrfs_free_delayed_extent_op(extent_op); 4934 out_free_buf: 4935 btrfs_tree_unlock(buf); 4936 free_extent_buffer(buf); 4937 out_free_reserved: 4938 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 4939 out_unuse: 4940 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); 4941 return ERR_PTR(ret); 4942 } 4943 4944 struct walk_control { 4945 u64 refs[BTRFS_MAX_LEVEL]; 4946 u64 flags[BTRFS_MAX_LEVEL]; 4947 struct btrfs_key update_progress; 4948 struct btrfs_key drop_progress; 4949 int drop_level; 4950 int stage; 4951 int level; 4952 int shared_level; 4953 int update_ref; 4954 int keep_locks; 4955 int reada_slot; 4956 int reada_count; 4957 int restarted; 4958 }; 4959 4960 #define DROP_REFERENCE 1 4961 #define UPDATE_BACKREF 2 4962 4963 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 4964 struct btrfs_root *root, 4965 struct walk_control *wc, 4966 struct btrfs_path *path) 4967 { 4968 struct btrfs_fs_info *fs_info = root->fs_info; 4969 u64 bytenr; 4970 u64 generation; 4971 u64 refs; 4972 u64 flags; 4973 u32 nritems; 4974 struct btrfs_key key; 4975 struct extent_buffer *eb; 4976 int ret; 4977 int slot; 4978 int nread = 0; 4979 4980 if (path->slots[wc->level] < wc->reada_slot) { 4981 wc->reada_count = wc->reada_count * 2 / 3; 4982 wc->reada_count = max(wc->reada_count, 2); 4983 } else { 4984 wc->reada_count = wc->reada_count * 3 / 2; 4985 wc->reada_count = min_t(int, wc->reada_count, 4986 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 4987 } 4988 4989 eb = path->nodes[wc->level]; 4990 nritems = btrfs_header_nritems(eb); 4991 4992 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 4993 if (nread >= wc->reada_count) 4994 break; 4995 4996 cond_resched(); 4997 bytenr = btrfs_node_blockptr(eb, slot); 4998 generation = btrfs_node_ptr_generation(eb, slot); 4999 5000 if (slot == path->slots[wc->level]) 5001 goto reada; 5002 5003 if (wc->stage == UPDATE_BACKREF && 5004 generation <= root->root_key.offset) 5005 continue; 5006 5007 /* We don't lock the tree block, it's OK to be racy here */ 5008 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 5009 wc->level - 1, 1, &refs, 5010 &flags); 5011 /* We don't care about errors in readahead. */ 5012 if (ret < 0) 5013 continue; 5014 BUG_ON(refs == 0); 5015 5016 if (wc->stage == DROP_REFERENCE) { 5017 if (refs == 1) 5018 goto reada; 5019 5020 if (wc->level == 1 && 5021 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5022 continue; 5023 if (!wc->update_ref || 5024 generation <= root->root_key.offset) 5025 continue; 5026 btrfs_node_key_to_cpu(eb, &key, slot); 5027 ret = btrfs_comp_cpu_keys(&key, 5028 &wc->update_progress); 5029 if (ret < 0) 5030 continue; 5031 } else { 5032 if (wc->level == 1 && 5033 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5034 continue; 5035 } 5036 reada: 5037 btrfs_readahead_node_child(eb, slot); 5038 nread++; 5039 } 5040 wc->reada_slot = slot; 5041 } 5042 5043 /* 5044 * helper to process tree block while walking down the tree. 5045 * 5046 * when wc->stage == UPDATE_BACKREF, this function updates 5047 * back refs for pointers in the block. 5048 * 5049 * NOTE: return value 1 means we should stop walking down. 5050 */ 5051 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5052 struct btrfs_root *root, 5053 struct btrfs_path *path, 5054 struct walk_control *wc, int lookup_info) 5055 { 5056 struct btrfs_fs_info *fs_info = root->fs_info; 5057 int level = wc->level; 5058 struct extent_buffer *eb = path->nodes[level]; 5059 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5060 int ret; 5061 5062 if (wc->stage == UPDATE_BACKREF && 5063 btrfs_header_owner(eb) != root->root_key.objectid) 5064 return 1; 5065 5066 /* 5067 * when reference count of tree block is 1, it won't increase 5068 * again. once full backref flag is set, we never clear it. 5069 */ 5070 if (lookup_info && 5071 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5072 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 5073 BUG_ON(!path->locks[level]); 5074 ret = btrfs_lookup_extent_info(trans, fs_info, 5075 eb->start, level, 1, 5076 &wc->refs[level], 5077 &wc->flags[level]); 5078 BUG_ON(ret == -ENOMEM); 5079 if (ret) 5080 return ret; 5081 BUG_ON(wc->refs[level] == 0); 5082 } 5083 5084 if (wc->stage == DROP_REFERENCE) { 5085 if (wc->refs[level] > 1) 5086 return 1; 5087 5088 if (path->locks[level] && !wc->keep_locks) { 5089 btrfs_tree_unlock_rw(eb, path->locks[level]); 5090 path->locks[level] = 0; 5091 } 5092 return 0; 5093 } 5094 5095 /* wc->stage == UPDATE_BACKREF */ 5096 if (!(wc->flags[level] & flag)) { 5097 BUG_ON(!path->locks[level]); 5098 ret = btrfs_inc_ref(trans, root, eb, 1); 5099 BUG_ON(ret); /* -ENOMEM */ 5100 ret = btrfs_dec_ref(trans, root, eb, 0); 5101 BUG_ON(ret); /* -ENOMEM */ 5102 ret = btrfs_set_disk_extent_flags(trans, eb, flag); 5103 BUG_ON(ret); /* -ENOMEM */ 5104 wc->flags[level] |= flag; 5105 } 5106 5107 /* 5108 * the block is shared by multiple trees, so it's not good to 5109 * keep the tree lock 5110 */ 5111 if (path->locks[level] && level > 0) { 5112 btrfs_tree_unlock_rw(eb, path->locks[level]); 5113 path->locks[level] = 0; 5114 } 5115 return 0; 5116 } 5117 5118 /* 5119 * This is used to verify a ref exists for this root to deal with a bug where we 5120 * would have a drop_progress key that hadn't been updated properly. 5121 */ 5122 static int check_ref_exists(struct btrfs_trans_handle *trans, 5123 struct btrfs_root *root, u64 bytenr, u64 parent, 5124 int level) 5125 { 5126 struct btrfs_path *path; 5127 struct btrfs_extent_inline_ref *iref; 5128 int ret; 5129 5130 path = btrfs_alloc_path(); 5131 if (!path) 5132 return -ENOMEM; 5133 5134 ret = lookup_extent_backref(trans, path, &iref, bytenr, 5135 root->fs_info->nodesize, parent, 5136 root->root_key.objectid, level, 0); 5137 btrfs_free_path(path); 5138 if (ret == -ENOENT) 5139 return 0; 5140 if (ret < 0) 5141 return ret; 5142 return 1; 5143 } 5144 5145 /* 5146 * helper to process tree block pointer. 5147 * 5148 * when wc->stage == DROP_REFERENCE, this function checks 5149 * reference count of the block pointed to. if the block 5150 * is shared and we need update back refs for the subtree 5151 * rooted at the block, this function changes wc->stage to 5152 * UPDATE_BACKREF. if the block is shared and there is no 5153 * need to update back, this function drops the reference 5154 * to the block. 5155 * 5156 * NOTE: return value 1 means we should stop walking down. 5157 */ 5158 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5159 struct btrfs_root *root, 5160 struct btrfs_path *path, 5161 struct walk_control *wc, int *lookup_info) 5162 { 5163 struct btrfs_fs_info *fs_info = root->fs_info; 5164 u64 bytenr; 5165 u64 generation; 5166 u64 parent; 5167 struct btrfs_tree_parent_check check = { 0 }; 5168 struct btrfs_key key; 5169 struct btrfs_ref ref = { 0 }; 5170 struct extent_buffer *next; 5171 int level = wc->level; 5172 int reada = 0; 5173 int ret = 0; 5174 bool need_account = false; 5175 5176 generation = btrfs_node_ptr_generation(path->nodes[level], 5177 path->slots[level]); 5178 /* 5179 * if the lower level block was created before the snapshot 5180 * was created, we know there is no need to update back refs 5181 * for the subtree 5182 */ 5183 if (wc->stage == UPDATE_BACKREF && 5184 generation <= root->root_key.offset) { 5185 *lookup_info = 1; 5186 return 1; 5187 } 5188 5189 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5190 5191 check.level = level - 1; 5192 check.transid = generation; 5193 check.owner_root = root->root_key.objectid; 5194 check.has_first_key = true; 5195 btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, 5196 path->slots[level]); 5197 5198 next = find_extent_buffer(fs_info, bytenr); 5199 if (!next) { 5200 next = btrfs_find_create_tree_block(fs_info, bytenr, 5201 root->root_key.objectid, level - 1); 5202 if (IS_ERR(next)) 5203 return PTR_ERR(next); 5204 reada = 1; 5205 } 5206 btrfs_tree_lock(next); 5207 5208 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 5209 &wc->refs[level - 1], 5210 &wc->flags[level - 1]); 5211 if (ret < 0) 5212 goto out_unlock; 5213 5214 if (unlikely(wc->refs[level - 1] == 0)) { 5215 btrfs_err(fs_info, "Missing references."); 5216 ret = -EIO; 5217 goto out_unlock; 5218 } 5219 *lookup_info = 0; 5220 5221 if (wc->stage == DROP_REFERENCE) { 5222 if (wc->refs[level - 1] > 1) { 5223 need_account = true; 5224 if (level == 1 && 5225 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5226 goto skip; 5227 5228 if (!wc->update_ref || 5229 generation <= root->root_key.offset) 5230 goto skip; 5231 5232 btrfs_node_key_to_cpu(path->nodes[level], &key, 5233 path->slots[level]); 5234 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 5235 if (ret < 0) 5236 goto skip; 5237 5238 wc->stage = UPDATE_BACKREF; 5239 wc->shared_level = level - 1; 5240 } 5241 } else { 5242 if (level == 1 && 5243 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5244 goto skip; 5245 } 5246 5247 if (!btrfs_buffer_uptodate(next, generation, 0)) { 5248 btrfs_tree_unlock(next); 5249 free_extent_buffer(next); 5250 next = NULL; 5251 *lookup_info = 1; 5252 } 5253 5254 if (!next) { 5255 if (reada && level == 1) 5256 reada_walk_down(trans, root, wc, path); 5257 next = read_tree_block(fs_info, bytenr, &check); 5258 if (IS_ERR(next)) { 5259 return PTR_ERR(next); 5260 } else if (!extent_buffer_uptodate(next)) { 5261 free_extent_buffer(next); 5262 return -EIO; 5263 } 5264 btrfs_tree_lock(next); 5265 } 5266 5267 level--; 5268 ASSERT(level == btrfs_header_level(next)); 5269 if (level != btrfs_header_level(next)) { 5270 btrfs_err(root->fs_info, "mismatched level"); 5271 ret = -EIO; 5272 goto out_unlock; 5273 } 5274 path->nodes[level] = next; 5275 path->slots[level] = 0; 5276 path->locks[level] = BTRFS_WRITE_LOCK; 5277 wc->level = level; 5278 if (wc->level == 1) 5279 wc->reada_slot = 0; 5280 return 0; 5281 skip: 5282 wc->refs[level - 1] = 0; 5283 wc->flags[level - 1] = 0; 5284 if (wc->stage == DROP_REFERENCE) { 5285 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5286 parent = path->nodes[level]->start; 5287 } else { 5288 ASSERT(root->root_key.objectid == 5289 btrfs_header_owner(path->nodes[level])); 5290 if (root->root_key.objectid != 5291 btrfs_header_owner(path->nodes[level])) { 5292 btrfs_err(root->fs_info, 5293 "mismatched block owner"); 5294 ret = -EIO; 5295 goto out_unlock; 5296 } 5297 parent = 0; 5298 } 5299 5300 /* 5301 * If we had a drop_progress we need to verify the refs are set 5302 * as expected. If we find our ref then we know that from here 5303 * on out everything should be correct, and we can clear the 5304 * ->restarted flag. 5305 */ 5306 if (wc->restarted) { 5307 ret = check_ref_exists(trans, root, bytenr, parent, 5308 level - 1); 5309 if (ret < 0) 5310 goto out_unlock; 5311 if (ret == 0) 5312 goto no_delete; 5313 ret = 0; 5314 wc->restarted = 0; 5315 } 5316 5317 /* 5318 * Reloc tree doesn't contribute to qgroup numbers, and we have 5319 * already accounted them at merge time (replace_path), 5320 * thus we could skip expensive subtree trace here. 5321 */ 5322 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 5323 need_account) { 5324 ret = btrfs_qgroup_trace_subtree(trans, next, 5325 generation, level - 1); 5326 if (ret) { 5327 btrfs_err_rl(fs_info, 5328 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 5329 ret); 5330 } 5331 } 5332 5333 /* 5334 * We need to update the next key in our walk control so we can 5335 * update the drop_progress key accordingly. We don't care if 5336 * find_next_key doesn't find a key because that means we're at 5337 * the end and are going to clean up now. 5338 */ 5339 wc->drop_level = level; 5340 find_next_key(path, level, &wc->drop_progress); 5341 5342 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 5343 fs_info->nodesize, parent); 5344 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 5345 0, false); 5346 ret = btrfs_free_extent(trans, &ref); 5347 if (ret) 5348 goto out_unlock; 5349 } 5350 no_delete: 5351 *lookup_info = 1; 5352 ret = 1; 5353 5354 out_unlock: 5355 btrfs_tree_unlock(next); 5356 free_extent_buffer(next); 5357 5358 return ret; 5359 } 5360 5361 /* 5362 * helper to process tree block while walking up the tree. 5363 * 5364 * when wc->stage == DROP_REFERENCE, this function drops 5365 * reference count on the block. 5366 * 5367 * when wc->stage == UPDATE_BACKREF, this function changes 5368 * wc->stage back to DROP_REFERENCE if we changed wc->stage 5369 * to UPDATE_BACKREF previously while processing the block. 5370 * 5371 * NOTE: return value 1 means we should stop walking up. 5372 */ 5373 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 5374 struct btrfs_root *root, 5375 struct btrfs_path *path, 5376 struct walk_control *wc) 5377 { 5378 struct btrfs_fs_info *fs_info = root->fs_info; 5379 int ret; 5380 int level = wc->level; 5381 struct extent_buffer *eb = path->nodes[level]; 5382 u64 parent = 0; 5383 5384 if (wc->stage == UPDATE_BACKREF) { 5385 BUG_ON(wc->shared_level < level); 5386 if (level < wc->shared_level) 5387 goto out; 5388 5389 ret = find_next_key(path, level + 1, &wc->update_progress); 5390 if (ret > 0) 5391 wc->update_ref = 0; 5392 5393 wc->stage = DROP_REFERENCE; 5394 wc->shared_level = -1; 5395 path->slots[level] = 0; 5396 5397 /* 5398 * check reference count again if the block isn't locked. 5399 * we should start walking down the tree again if reference 5400 * count is one. 5401 */ 5402 if (!path->locks[level]) { 5403 BUG_ON(level == 0); 5404 btrfs_tree_lock(eb); 5405 path->locks[level] = BTRFS_WRITE_LOCK; 5406 5407 ret = btrfs_lookup_extent_info(trans, fs_info, 5408 eb->start, level, 1, 5409 &wc->refs[level], 5410 &wc->flags[level]); 5411 if (ret < 0) { 5412 btrfs_tree_unlock_rw(eb, path->locks[level]); 5413 path->locks[level] = 0; 5414 return ret; 5415 } 5416 BUG_ON(wc->refs[level] == 0); 5417 if (wc->refs[level] == 1) { 5418 btrfs_tree_unlock_rw(eb, path->locks[level]); 5419 path->locks[level] = 0; 5420 return 1; 5421 } 5422 } 5423 } 5424 5425 /* wc->stage == DROP_REFERENCE */ 5426 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 5427 5428 if (wc->refs[level] == 1) { 5429 if (level == 0) { 5430 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5431 ret = btrfs_dec_ref(trans, root, eb, 1); 5432 else 5433 ret = btrfs_dec_ref(trans, root, eb, 0); 5434 BUG_ON(ret); /* -ENOMEM */ 5435 if (is_fstree(root->root_key.objectid)) { 5436 ret = btrfs_qgroup_trace_leaf_items(trans, eb); 5437 if (ret) { 5438 btrfs_err_rl(fs_info, 5439 "error %d accounting leaf items, quota is out of sync, rescan required", 5440 ret); 5441 } 5442 } 5443 } 5444 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ 5445 if (!path->locks[level]) { 5446 btrfs_tree_lock(eb); 5447 path->locks[level] = BTRFS_WRITE_LOCK; 5448 } 5449 btrfs_clear_buffer_dirty(trans, eb); 5450 } 5451 5452 if (eb == root->node) { 5453 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5454 parent = eb->start; 5455 else if (root->root_key.objectid != btrfs_header_owner(eb)) 5456 goto owner_mismatch; 5457 } else { 5458 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5459 parent = path->nodes[level + 1]->start; 5460 else if (root->root_key.objectid != 5461 btrfs_header_owner(path->nodes[level + 1])) 5462 goto owner_mismatch; 5463 } 5464 5465 btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, 5466 wc->refs[level] == 1); 5467 out: 5468 wc->refs[level] = 0; 5469 wc->flags[level] = 0; 5470 return 0; 5471 5472 owner_mismatch: 5473 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", 5474 btrfs_header_owner(eb), root->root_key.objectid); 5475 return -EUCLEAN; 5476 } 5477 5478 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5479 struct btrfs_root *root, 5480 struct btrfs_path *path, 5481 struct walk_control *wc) 5482 { 5483 int level = wc->level; 5484 int lookup_info = 1; 5485 int ret = 0; 5486 5487 while (level >= 0) { 5488 ret = walk_down_proc(trans, root, path, wc, lookup_info); 5489 if (ret) 5490 break; 5491 5492 if (level == 0) 5493 break; 5494 5495 if (path->slots[level] >= 5496 btrfs_header_nritems(path->nodes[level])) 5497 break; 5498 5499 ret = do_walk_down(trans, root, path, wc, &lookup_info); 5500 if (ret > 0) { 5501 path->slots[level]++; 5502 continue; 5503 } else if (ret < 0) 5504 break; 5505 level = wc->level; 5506 } 5507 return (ret == 1) ? 0 : ret; 5508 } 5509 5510 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 5511 struct btrfs_root *root, 5512 struct btrfs_path *path, 5513 struct walk_control *wc, int max_level) 5514 { 5515 int level = wc->level; 5516 int ret; 5517 5518 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 5519 while (level < max_level && path->nodes[level]) { 5520 wc->level = level; 5521 if (path->slots[level] + 1 < 5522 btrfs_header_nritems(path->nodes[level])) { 5523 path->slots[level]++; 5524 return 0; 5525 } else { 5526 ret = walk_up_proc(trans, root, path, wc); 5527 if (ret > 0) 5528 return 0; 5529 if (ret < 0) 5530 return ret; 5531 5532 if (path->locks[level]) { 5533 btrfs_tree_unlock_rw(path->nodes[level], 5534 path->locks[level]); 5535 path->locks[level] = 0; 5536 } 5537 free_extent_buffer(path->nodes[level]); 5538 path->nodes[level] = NULL; 5539 level++; 5540 } 5541 } 5542 return 1; 5543 } 5544 5545 /* 5546 * drop a subvolume tree. 5547 * 5548 * this function traverses the tree freeing any blocks that only 5549 * referenced by the tree. 5550 * 5551 * when a shared tree block is found. this function decreases its 5552 * reference count by one. if update_ref is true, this function 5553 * also make sure backrefs for the shared block and all lower level 5554 * blocks are properly updated. 5555 * 5556 * If called with for_reloc == 0, may exit early with -EAGAIN 5557 */ 5558 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) 5559 { 5560 const bool is_reloc_root = (root->root_key.objectid == 5561 BTRFS_TREE_RELOC_OBJECTID); 5562 struct btrfs_fs_info *fs_info = root->fs_info; 5563 struct btrfs_path *path; 5564 struct btrfs_trans_handle *trans; 5565 struct btrfs_root *tree_root = fs_info->tree_root; 5566 struct btrfs_root_item *root_item = &root->root_item; 5567 struct walk_control *wc; 5568 struct btrfs_key key; 5569 int err = 0; 5570 int ret; 5571 int level; 5572 bool root_dropped = false; 5573 bool unfinished_drop = false; 5574 5575 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); 5576 5577 path = btrfs_alloc_path(); 5578 if (!path) { 5579 err = -ENOMEM; 5580 goto out; 5581 } 5582 5583 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5584 if (!wc) { 5585 btrfs_free_path(path); 5586 err = -ENOMEM; 5587 goto out; 5588 } 5589 5590 /* 5591 * Use join to avoid potential EINTR from transaction start. See 5592 * wait_reserve_ticket and the whole reservation callchain. 5593 */ 5594 if (for_reloc) 5595 trans = btrfs_join_transaction(tree_root); 5596 else 5597 trans = btrfs_start_transaction(tree_root, 0); 5598 if (IS_ERR(trans)) { 5599 err = PTR_ERR(trans); 5600 goto out_free; 5601 } 5602 5603 err = btrfs_run_delayed_items(trans); 5604 if (err) 5605 goto out_end_trans; 5606 5607 /* 5608 * This will help us catch people modifying the fs tree while we're 5609 * dropping it. It is unsafe to mess with the fs tree while it's being 5610 * dropped as we unlock the root node and parent nodes as we walk down 5611 * the tree, assuming nothing will change. If something does change 5612 * then we'll have stale information and drop references to blocks we've 5613 * already dropped. 5614 */ 5615 set_bit(BTRFS_ROOT_DELETING, &root->state); 5616 unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); 5617 5618 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5619 level = btrfs_header_level(root->node); 5620 path->nodes[level] = btrfs_lock_root_node(root); 5621 path->slots[level] = 0; 5622 path->locks[level] = BTRFS_WRITE_LOCK; 5623 memset(&wc->update_progress, 0, 5624 sizeof(wc->update_progress)); 5625 } else { 5626 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 5627 memcpy(&wc->update_progress, &key, 5628 sizeof(wc->update_progress)); 5629 5630 level = btrfs_root_drop_level(root_item); 5631 BUG_ON(level == 0); 5632 path->lowest_level = level; 5633 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5634 path->lowest_level = 0; 5635 if (ret < 0) { 5636 err = ret; 5637 goto out_end_trans; 5638 } 5639 WARN_ON(ret > 0); 5640 5641 /* 5642 * unlock our path, this is safe because only this 5643 * function is allowed to delete this snapshot 5644 */ 5645 btrfs_unlock_up_safe(path, 0); 5646 5647 level = btrfs_header_level(root->node); 5648 while (1) { 5649 btrfs_tree_lock(path->nodes[level]); 5650 path->locks[level] = BTRFS_WRITE_LOCK; 5651 5652 ret = btrfs_lookup_extent_info(trans, fs_info, 5653 path->nodes[level]->start, 5654 level, 1, &wc->refs[level], 5655 &wc->flags[level]); 5656 if (ret < 0) { 5657 err = ret; 5658 goto out_end_trans; 5659 } 5660 BUG_ON(wc->refs[level] == 0); 5661 5662 if (level == btrfs_root_drop_level(root_item)) 5663 break; 5664 5665 btrfs_tree_unlock(path->nodes[level]); 5666 path->locks[level] = 0; 5667 WARN_ON(wc->refs[level] != 1); 5668 level--; 5669 } 5670 } 5671 5672 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); 5673 wc->level = level; 5674 wc->shared_level = -1; 5675 wc->stage = DROP_REFERENCE; 5676 wc->update_ref = update_ref; 5677 wc->keep_locks = 0; 5678 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 5679 5680 while (1) { 5681 5682 ret = walk_down_tree(trans, root, path, wc); 5683 if (ret < 0) { 5684 btrfs_abort_transaction(trans, ret); 5685 err = ret; 5686 break; 5687 } 5688 5689 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 5690 if (ret < 0) { 5691 btrfs_abort_transaction(trans, ret); 5692 err = ret; 5693 break; 5694 } 5695 5696 if (ret > 0) { 5697 BUG_ON(wc->stage != DROP_REFERENCE); 5698 break; 5699 } 5700 5701 if (wc->stage == DROP_REFERENCE) { 5702 wc->drop_level = wc->level; 5703 btrfs_node_key_to_cpu(path->nodes[wc->drop_level], 5704 &wc->drop_progress, 5705 path->slots[wc->drop_level]); 5706 } 5707 btrfs_cpu_key_to_disk(&root_item->drop_progress, 5708 &wc->drop_progress); 5709 btrfs_set_root_drop_level(root_item, wc->drop_level); 5710 5711 BUG_ON(wc->level == 0); 5712 if (btrfs_should_end_transaction(trans) || 5713 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 5714 ret = btrfs_update_root(trans, tree_root, 5715 &root->root_key, 5716 root_item); 5717 if (ret) { 5718 btrfs_abort_transaction(trans, ret); 5719 err = ret; 5720 goto out_end_trans; 5721 } 5722 5723 if (!is_reloc_root) 5724 btrfs_set_last_root_drop_gen(fs_info, trans->transid); 5725 5726 btrfs_end_transaction_throttle(trans); 5727 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 5728 btrfs_debug(fs_info, 5729 "drop snapshot early exit"); 5730 err = -EAGAIN; 5731 goto out_free; 5732 } 5733 5734 /* 5735 * Use join to avoid potential EINTR from transaction 5736 * start. See wait_reserve_ticket and the whole 5737 * reservation callchain. 5738 */ 5739 if (for_reloc) 5740 trans = btrfs_join_transaction(tree_root); 5741 else 5742 trans = btrfs_start_transaction(tree_root, 0); 5743 if (IS_ERR(trans)) { 5744 err = PTR_ERR(trans); 5745 goto out_free; 5746 } 5747 } 5748 } 5749 btrfs_release_path(path); 5750 if (err) 5751 goto out_end_trans; 5752 5753 ret = btrfs_del_root(trans, &root->root_key); 5754 if (ret) { 5755 btrfs_abort_transaction(trans, ret); 5756 err = ret; 5757 goto out_end_trans; 5758 } 5759 5760 if (!is_reloc_root) { 5761 ret = btrfs_find_root(tree_root, &root->root_key, path, 5762 NULL, NULL); 5763 if (ret < 0) { 5764 btrfs_abort_transaction(trans, ret); 5765 err = ret; 5766 goto out_end_trans; 5767 } else if (ret > 0) { 5768 /* if we fail to delete the orphan item this time 5769 * around, it'll get picked up the next time. 5770 * 5771 * The most common failure here is just -ENOENT. 5772 */ 5773 btrfs_del_orphan_item(trans, tree_root, 5774 root->root_key.objectid); 5775 } 5776 } 5777 5778 /* 5779 * This subvolume is going to be completely dropped, and won't be 5780 * recorded as dirty roots, thus pertrans meta rsv will not be freed at 5781 * commit transaction time. So free it here manually. 5782 */ 5783 btrfs_qgroup_convert_reserved_meta(root, INT_MAX); 5784 btrfs_qgroup_free_meta_all_pertrans(root); 5785 5786 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) 5787 btrfs_add_dropped_root(trans, root); 5788 else 5789 btrfs_put_root(root); 5790 root_dropped = true; 5791 out_end_trans: 5792 if (!is_reloc_root) 5793 btrfs_set_last_root_drop_gen(fs_info, trans->transid); 5794 5795 btrfs_end_transaction_throttle(trans); 5796 out_free: 5797 kfree(wc); 5798 btrfs_free_path(path); 5799 out: 5800 /* 5801 * We were an unfinished drop root, check to see if there are any 5802 * pending, and if not clear and wake up any waiters. 5803 */ 5804 if (!err && unfinished_drop) 5805 btrfs_maybe_wake_unfinished_drop(fs_info); 5806 5807 /* 5808 * So if we need to stop dropping the snapshot for whatever reason we 5809 * need to make sure to add it back to the dead root list so that we 5810 * keep trying to do the work later. This also cleans up roots if we 5811 * don't have it in the radix (like when we recover after a power fail 5812 * or unmount) so we don't leak memory. 5813 */ 5814 if (!for_reloc && !root_dropped) 5815 btrfs_add_dead_root(root); 5816 return err; 5817 } 5818 5819 /* 5820 * drop subtree rooted at tree block 'node'. 5821 * 5822 * NOTE: this function will unlock and release tree block 'node' 5823 * only used by relocation code 5824 */ 5825 int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 5826 struct btrfs_root *root, 5827 struct extent_buffer *node, 5828 struct extent_buffer *parent) 5829 { 5830 struct btrfs_fs_info *fs_info = root->fs_info; 5831 struct btrfs_path *path; 5832 struct walk_control *wc; 5833 int level; 5834 int parent_level; 5835 int ret = 0; 5836 int wret; 5837 5838 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 5839 5840 path = btrfs_alloc_path(); 5841 if (!path) 5842 return -ENOMEM; 5843 5844 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5845 if (!wc) { 5846 btrfs_free_path(path); 5847 return -ENOMEM; 5848 } 5849 5850 btrfs_assert_tree_write_locked(parent); 5851 parent_level = btrfs_header_level(parent); 5852 atomic_inc(&parent->refs); 5853 path->nodes[parent_level] = parent; 5854 path->slots[parent_level] = btrfs_header_nritems(parent); 5855 5856 btrfs_assert_tree_write_locked(node); 5857 level = btrfs_header_level(node); 5858 path->nodes[level] = node; 5859 path->slots[level] = 0; 5860 path->locks[level] = BTRFS_WRITE_LOCK; 5861 5862 wc->refs[parent_level] = 1; 5863 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5864 wc->level = level; 5865 wc->shared_level = -1; 5866 wc->stage = DROP_REFERENCE; 5867 wc->update_ref = 0; 5868 wc->keep_locks = 1; 5869 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 5870 5871 while (1) { 5872 wret = walk_down_tree(trans, root, path, wc); 5873 if (wret < 0) { 5874 ret = wret; 5875 break; 5876 } 5877 5878 wret = walk_up_tree(trans, root, path, wc, parent_level); 5879 if (wret < 0) 5880 ret = wret; 5881 if (wret != 0) 5882 break; 5883 } 5884 5885 kfree(wc); 5886 btrfs_free_path(path); 5887 return ret; 5888 } 5889 5890 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, 5891 u64 start, u64 end) 5892 { 5893 return unpin_extent_range(fs_info, start, end, false); 5894 } 5895 5896 /* 5897 * It used to be that old block groups would be left around forever. 5898 * Iterating over them would be enough to trim unused space. Since we 5899 * now automatically remove them, we also need to iterate over unallocated 5900 * space. 5901 * 5902 * We don't want a transaction for this since the discard may take a 5903 * substantial amount of time. We don't require that a transaction be 5904 * running, but we do need to take a running transaction into account 5905 * to ensure that we're not discarding chunks that were released or 5906 * allocated in the current transaction. 5907 * 5908 * Holding the chunks lock will prevent other threads from allocating 5909 * or releasing chunks, but it won't prevent a running transaction 5910 * from committing and releasing the memory that the pending chunks 5911 * list head uses. For that, we need to take a reference to the 5912 * transaction and hold the commit root sem. We only need to hold 5913 * it while performing the free space search since we have already 5914 * held back allocations. 5915 */ 5916 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) 5917 { 5918 u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; 5919 int ret; 5920 5921 *trimmed = 0; 5922 5923 /* Discard not supported = nothing to do. */ 5924 if (!bdev_max_discard_sectors(device->bdev)) 5925 return 0; 5926 5927 /* Not writable = nothing to do. */ 5928 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 5929 return 0; 5930 5931 /* No free space = nothing to do. */ 5932 if (device->total_bytes <= device->bytes_used) 5933 return 0; 5934 5935 ret = 0; 5936 5937 while (1) { 5938 struct btrfs_fs_info *fs_info = device->fs_info; 5939 u64 bytes; 5940 5941 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 5942 if (ret) 5943 break; 5944 5945 find_first_clear_extent_bit(&device->alloc_state, start, 5946 &start, &end, 5947 CHUNK_TRIMMED | CHUNK_ALLOCATED); 5948 5949 /* Check if there are any CHUNK_* bits left */ 5950 if (start > device->total_bytes) { 5951 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 5952 btrfs_warn_in_rcu(fs_info, 5953 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", 5954 start, end - start + 1, 5955 btrfs_dev_name(device), 5956 device->total_bytes); 5957 mutex_unlock(&fs_info->chunk_mutex); 5958 ret = 0; 5959 break; 5960 } 5961 5962 /* Ensure we skip the reserved space on each device. */ 5963 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); 5964 5965 /* 5966 * If find_first_clear_extent_bit find a range that spans the 5967 * end of the device it will set end to -1, in this case it's up 5968 * to the caller to trim the value to the size of the device. 5969 */ 5970 end = min(end, device->total_bytes - 1); 5971 5972 len = end - start + 1; 5973 5974 /* We didn't find any extents */ 5975 if (!len) { 5976 mutex_unlock(&fs_info->chunk_mutex); 5977 ret = 0; 5978 break; 5979 } 5980 5981 ret = btrfs_issue_discard(device->bdev, start, len, 5982 &bytes); 5983 if (!ret) 5984 set_extent_bit(&device->alloc_state, start, 5985 start + bytes - 1, CHUNK_TRIMMED, NULL); 5986 mutex_unlock(&fs_info->chunk_mutex); 5987 5988 if (ret) 5989 break; 5990 5991 start += len; 5992 *trimmed += bytes; 5993 5994 if (fatal_signal_pending(current)) { 5995 ret = -ERESTARTSYS; 5996 break; 5997 } 5998 5999 cond_resched(); 6000 } 6001 6002 return ret; 6003 } 6004 6005 /* 6006 * Trim the whole filesystem by: 6007 * 1) trimming the free space in each block group 6008 * 2) trimming the unallocated space on each device 6009 * 6010 * This will also continue trimming even if a block group or device encounters 6011 * an error. The return value will be the last error, or 0 if nothing bad 6012 * happens. 6013 */ 6014 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 6015 { 6016 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6017 struct btrfs_block_group *cache = NULL; 6018 struct btrfs_device *device; 6019 u64 group_trimmed; 6020 u64 range_end = U64_MAX; 6021 u64 start; 6022 u64 end; 6023 u64 trimmed = 0; 6024 u64 bg_failed = 0; 6025 u64 dev_failed = 0; 6026 int bg_ret = 0; 6027 int dev_ret = 0; 6028 int ret = 0; 6029 6030 if (range->start == U64_MAX) 6031 return -EINVAL; 6032 6033 /* 6034 * Check range overflow if range->len is set. 6035 * The default range->len is U64_MAX. 6036 */ 6037 if (range->len != U64_MAX && 6038 check_add_overflow(range->start, range->len, &range_end)) 6039 return -EINVAL; 6040 6041 cache = btrfs_lookup_first_block_group(fs_info, range->start); 6042 for (; cache; cache = btrfs_next_block_group(cache)) { 6043 if (cache->start >= range_end) { 6044 btrfs_put_block_group(cache); 6045 break; 6046 } 6047 6048 start = max(range->start, cache->start); 6049 end = min(range_end, cache->start + cache->length); 6050 6051 if (end - start >= range->minlen) { 6052 if (!btrfs_block_group_done(cache)) { 6053 ret = btrfs_cache_block_group(cache, true); 6054 if (ret) { 6055 bg_failed++; 6056 bg_ret = ret; 6057 continue; 6058 } 6059 } 6060 ret = btrfs_trim_block_group(cache, 6061 &group_trimmed, 6062 start, 6063 end, 6064 range->minlen); 6065 6066 trimmed += group_trimmed; 6067 if (ret) { 6068 bg_failed++; 6069 bg_ret = ret; 6070 continue; 6071 } 6072 } 6073 } 6074 6075 if (bg_failed) 6076 btrfs_warn(fs_info, 6077 "failed to trim %llu block group(s), last error %d", 6078 bg_failed, bg_ret); 6079 6080 mutex_lock(&fs_devices->device_list_mutex); 6081 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6082 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 6083 continue; 6084 6085 ret = btrfs_trim_free_extents(device, &group_trimmed); 6086 if (ret) { 6087 dev_failed++; 6088 dev_ret = ret; 6089 break; 6090 } 6091 6092 trimmed += group_trimmed; 6093 } 6094 mutex_unlock(&fs_devices->device_list_mutex); 6095 6096 if (dev_failed) 6097 btrfs_warn(fs_info, 6098 "failed to trim %llu device(s), last error %d", 6099 dev_failed, dev_ret); 6100 range->len = trimmed; 6101 if (bg_ret) 6102 return bg_ret; 6103 return dev_ret; 6104 } 6105