// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "ctree.h"
#include "extent-tree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "discard.h"
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
#include "accessors.h"
#include "root-tree.h"
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

/* Simple helper to search for an existing data extent at a given offset. */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper function to look up the reference count and flags of a tree block.
 *
 * The head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree.  The head node
 * may also store the extent flags to set.  This way you can check what
 * the reference count and extent flags would be once all of the delayed
 * refs are processed, without actually running them.
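 *
 * For example, if the on-disk extent item currently records 2 references
 * and the delayed ref head carries a pending ref_mod of -1, the value
 * reported through @refs is 1, and any flags queued in the head's
 * extent_op are OR'ed into @flags.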
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_root *extent_root;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different.
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	extent_root = btrfs_extent_root(fs_info, bytenr);
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EUCLEAN;
			btrfs_err(fs_info,
			"unexpected extent item size, has %u expect >= %zu",
				  item_size, sizeof(*ei));
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and
			 * try again.
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) Differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs.  The implicit back refs are optimized
 * for pointers in non-shared tree blocks.  For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key.  This information allows us to find the block by
 * b-tree searching.  The full back refs are for pointers in tree blocks not
 * referenced by their owner trees.  The location of the tree block is
 * recorded in the back refs.  Actually the full back refs are generic and
 * can be used in all cases where the implicit back refs are used.  The
 * major shortcoming of the full back refs is their overhead: every time a
 * tree block gets COWed, we have to update the back refs entry for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it.  This means most tree related operations only involve
 * implicit back refs.  For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it.  So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree.  Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree.  In this case, full back refs are used for pointers
 * in the block.  Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree.  In this case, implicit back refs are used for
 * pointers in the block.  Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts.  The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree.  Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent.
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
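 *
 * For example, an implicit back ref for a data extent starting at bytenr
 * X is keyed (X, BTRFS_EXTENT_DATA_REF_KEY, hash(root, inode, offset)),
 * while a full back ref for the same extent is keyed
 * (X, BTRFS_SHARED_DATA_REF_KEY, bytenr of the referencing leaf).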
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key.  The key offset for the implicit back refs is
 * the objectid of the block's owner tree.  The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared ref has a parent tree block,
				 * which must be aligned to the sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared ref has a parent tree block,
				 * which must be aligned to the sector size.
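				 * An unaligned or zero offset means the ref
				 * is not valid and it is reported as
				 * BTRFS_REF_TYPE_INVALID below.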
375 */ 376 if (offset && 377 IS_ALIGNED(offset, eb->fs_info->sectorsize)) 378 return type; 379 } 380 } else { 381 ASSERT(is_data == BTRFS_REF_TYPE_ANY); 382 return type; 383 } 384 } 385 386 WARN_ON(1); 387 btrfs_print_leaf(eb); 388 btrfs_err(eb->fs_info, 389 "eb %llu iref 0x%lx invalid extent inline ref type %d", 390 eb->start, (unsigned long)iref, type); 391 392 return BTRFS_REF_TYPE_INVALID; 393 } 394 395 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) 396 { 397 u32 high_crc = ~(u32)0; 398 u32 low_crc = ~(u32)0; 399 __le64 lenum; 400 401 lenum = cpu_to_le64(root_objectid); 402 high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); 403 lenum = cpu_to_le64(owner); 404 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 405 lenum = cpu_to_le64(offset); 406 low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); 407 408 return ((u64)high_crc << 31) ^ (u64)low_crc; 409 } 410 411 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, 412 struct btrfs_extent_data_ref *ref) 413 { 414 return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), 415 btrfs_extent_data_ref_objectid(leaf, ref), 416 btrfs_extent_data_ref_offset(leaf, ref)); 417 } 418 419 static int match_extent_data_ref(struct extent_buffer *leaf, 420 struct btrfs_extent_data_ref *ref, 421 u64 root_objectid, u64 owner, u64 offset) 422 { 423 if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || 424 btrfs_extent_data_ref_objectid(leaf, ref) != owner || 425 btrfs_extent_data_ref_offset(leaf, ref) != offset) 426 return 0; 427 return 1; 428 } 429 430 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, 431 struct btrfs_path *path, 432 u64 bytenr, u64 parent, 433 u64 root_objectid, 434 u64 owner, u64 offset) 435 { 436 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 437 struct btrfs_key key; 438 struct btrfs_extent_data_ref *ref; 439 struct extent_buffer *leaf; 440 u32 nritems; 441 int ret; 442 int recow; 443 int err = -ENOENT; 444 445 key.objectid = bytenr; 446 if (parent) { 447 key.type = BTRFS_SHARED_DATA_REF_KEY; 448 key.offset = parent; 449 } else { 450 key.type = BTRFS_EXTENT_DATA_REF_KEY; 451 key.offset = hash_extent_data_ref(root_objectid, 452 owner, offset); 453 } 454 again: 455 recow = 0; 456 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 457 if (ret < 0) { 458 err = ret; 459 goto fail; 460 } 461 462 if (parent) { 463 if (!ret) 464 return 0; 465 goto fail; 466 } 467 468 leaf = path->nodes[0]; 469 nritems = btrfs_header_nritems(leaf); 470 while (1) { 471 if (path->slots[0] >= nritems) { 472 ret = btrfs_next_leaf(root, path); 473 if (ret < 0) 474 err = ret; 475 if (ret) 476 goto fail; 477 478 leaf = path->nodes[0]; 479 nritems = btrfs_header_nritems(leaf); 480 recow = 1; 481 } 482 483 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 484 if (key.objectid != bytenr || 485 key.type != BTRFS_EXTENT_DATA_REF_KEY) 486 goto fail; 487 488 ref = btrfs_item_ptr(leaf, path->slots[0], 489 struct btrfs_extent_data_ref); 490 491 if (match_extent_data_ref(leaf, ref, root_objectid, 492 owner, offset)) { 493 if (recow) { 494 btrfs_release_path(path); 495 goto again; 496 } 497 err = 0; 498 break; 499 } 500 path->slots[0]++; 501 } 502 fail: 503 return err; 504 } 505 506 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, 507 struct btrfs_path *path, 508 u64 bytenr, u64 parent, 509 u64 root_objectid, u64 owner, 510 u64 offset, int refs_to_add) 511 { 512 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 
513 struct btrfs_key key; 514 struct extent_buffer *leaf; 515 u32 size; 516 u32 num_refs; 517 int ret; 518 519 key.objectid = bytenr; 520 if (parent) { 521 key.type = BTRFS_SHARED_DATA_REF_KEY; 522 key.offset = parent; 523 size = sizeof(struct btrfs_shared_data_ref); 524 } else { 525 key.type = BTRFS_EXTENT_DATA_REF_KEY; 526 key.offset = hash_extent_data_ref(root_objectid, 527 owner, offset); 528 size = sizeof(struct btrfs_extent_data_ref); 529 } 530 531 ret = btrfs_insert_empty_item(trans, root, path, &key, size); 532 if (ret && ret != -EEXIST) 533 goto fail; 534 535 leaf = path->nodes[0]; 536 if (parent) { 537 struct btrfs_shared_data_ref *ref; 538 ref = btrfs_item_ptr(leaf, path->slots[0], 539 struct btrfs_shared_data_ref); 540 if (ret == 0) { 541 btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); 542 } else { 543 num_refs = btrfs_shared_data_ref_count(leaf, ref); 544 num_refs += refs_to_add; 545 btrfs_set_shared_data_ref_count(leaf, ref, num_refs); 546 } 547 } else { 548 struct btrfs_extent_data_ref *ref; 549 while (ret == -EEXIST) { 550 ref = btrfs_item_ptr(leaf, path->slots[0], 551 struct btrfs_extent_data_ref); 552 if (match_extent_data_ref(leaf, ref, root_objectid, 553 owner, offset)) 554 break; 555 btrfs_release_path(path); 556 key.offset++; 557 ret = btrfs_insert_empty_item(trans, root, path, &key, 558 size); 559 if (ret && ret != -EEXIST) 560 goto fail; 561 562 leaf = path->nodes[0]; 563 } 564 ref = btrfs_item_ptr(leaf, path->slots[0], 565 struct btrfs_extent_data_ref); 566 if (ret == 0) { 567 btrfs_set_extent_data_ref_root(leaf, ref, 568 root_objectid); 569 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 570 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 571 btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); 572 } else { 573 num_refs = btrfs_extent_data_ref_count(leaf, ref); 574 num_refs += refs_to_add; 575 btrfs_set_extent_data_ref_count(leaf, ref, num_refs); 576 } 577 } 578 btrfs_mark_buffer_dirty(leaf); 579 ret = 0; 580 fail: 581 btrfs_release_path(path); 582 return ret; 583 } 584 585 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, 586 struct btrfs_root *root, 587 struct btrfs_path *path, 588 int refs_to_drop) 589 { 590 struct btrfs_key key; 591 struct btrfs_extent_data_ref *ref1 = NULL; 592 struct btrfs_shared_data_ref *ref2 = NULL; 593 struct extent_buffer *leaf; 594 u32 num_refs = 0; 595 int ret = 0; 596 597 leaf = path->nodes[0]; 598 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 599 600 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 601 ref1 = btrfs_item_ptr(leaf, path->slots[0], 602 struct btrfs_extent_data_ref); 603 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 604 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 605 ref2 = btrfs_item_ptr(leaf, path->slots[0], 606 struct btrfs_shared_data_ref); 607 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 608 } else { 609 btrfs_err(trans->fs_info, 610 "unrecognized backref key (%llu %u %llu)", 611 key.objectid, key.type, key.offset); 612 btrfs_abort_transaction(trans, -EUCLEAN); 613 return -EUCLEAN; 614 } 615 616 BUG_ON(num_refs < refs_to_drop); 617 num_refs -= refs_to_drop; 618 619 if (num_refs == 0) { 620 ret = btrfs_del_item(trans, root, path); 621 } else { 622 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) 623 btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); 624 else if (key.type == BTRFS_SHARED_DATA_REF_KEY) 625 btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); 626 btrfs_mark_buffer_dirty(leaf); 627 } 628 return ret; 629 } 630 631 static noinline 
u32 extent_data_ref_count(struct btrfs_path *path, 632 struct btrfs_extent_inline_ref *iref) 633 { 634 struct btrfs_key key; 635 struct extent_buffer *leaf; 636 struct btrfs_extent_data_ref *ref1; 637 struct btrfs_shared_data_ref *ref2; 638 u32 num_refs = 0; 639 int type; 640 641 leaf = path->nodes[0]; 642 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 643 644 if (iref) { 645 /* 646 * If type is invalid, we should have bailed out earlier than 647 * this call. 648 */ 649 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 650 ASSERT(type != BTRFS_REF_TYPE_INVALID); 651 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 652 ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); 653 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 654 } else { 655 ref2 = (struct btrfs_shared_data_ref *)(iref + 1); 656 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 657 } 658 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 659 ref1 = btrfs_item_ptr(leaf, path->slots[0], 660 struct btrfs_extent_data_ref); 661 num_refs = btrfs_extent_data_ref_count(leaf, ref1); 662 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 663 ref2 = btrfs_item_ptr(leaf, path->slots[0], 664 struct btrfs_shared_data_ref); 665 num_refs = btrfs_shared_data_ref_count(leaf, ref2); 666 } else { 667 WARN_ON(1); 668 } 669 return num_refs; 670 } 671 672 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, 673 struct btrfs_path *path, 674 u64 bytenr, u64 parent, 675 u64 root_objectid) 676 { 677 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 678 struct btrfs_key key; 679 int ret; 680 681 key.objectid = bytenr; 682 if (parent) { 683 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 684 key.offset = parent; 685 } else { 686 key.type = BTRFS_TREE_BLOCK_REF_KEY; 687 key.offset = root_objectid; 688 } 689 690 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 691 if (ret > 0) 692 ret = -ENOENT; 693 return ret; 694 } 695 696 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, 697 struct btrfs_path *path, 698 u64 bytenr, u64 parent, 699 u64 root_objectid) 700 { 701 struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); 702 struct btrfs_key key; 703 int ret; 704 705 key.objectid = bytenr; 706 if (parent) { 707 key.type = BTRFS_SHARED_BLOCK_REF_KEY; 708 key.offset = parent; 709 } else { 710 key.type = BTRFS_TREE_BLOCK_REF_KEY; 711 key.offset = root_objectid; 712 } 713 714 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 715 btrfs_release_path(path); 716 return ret; 717 } 718 719 static inline int extent_ref_type(u64 parent, u64 owner) 720 { 721 int type; 722 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 723 if (parent > 0) 724 type = BTRFS_SHARED_BLOCK_REF_KEY; 725 else 726 type = BTRFS_TREE_BLOCK_REF_KEY; 727 } else { 728 if (parent > 0) 729 type = BTRFS_SHARED_DATA_REF_KEY; 730 else 731 type = BTRFS_EXTENT_DATA_REF_KEY; 732 } 733 return type; 734 } 735 736 static int find_next_key(struct btrfs_path *path, int level, 737 struct btrfs_key *key) 738 739 { 740 for (; level < BTRFS_MAX_LEVEL; level++) { 741 if (!path->nodes[level]) 742 break; 743 if (path->slots[level] + 1 >= 744 btrfs_header_nritems(path->nodes[level])) 745 continue; 746 if (level == 0) 747 btrfs_item_key_to_cpu(path->nodes[level], key, 748 path->slots[level] + 1); 749 else 750 btrfs_node_key_to_cpu(path->nodes[level], key, 751 path->slots[level] + 1); 752 return 0; 753 } 754 return 1; 755 } 756 757 /* 758 * look for inline back ref. 
if back ref is found, *ref_ret is set 759 * to the address of inline back ref, and 0 is returned. 760 * 761 * if back ref isn't found, *ref_ret is set to the address where it 762 * should be inserted, and -ENOENT is returned. 763 * 764 * if insert is true and there are too many inline back refs, the path 765 * points to the extent item, and -EAGAIN is returned. 766 * 767 * NOTE: inline back refs are ordered in the same way that back ref 768 * items in the tree are ordered. 769 */ 770 static noinline_for_stack 771 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, 772 struct btrfs_path *path, 773 struct btrfs_extent_inline_ref **ref_ret, 774 u64 bytenr, u64 num_bytes, 775 u64 parent, u64 root_objectid, 776 u64 owner, u64 offset, int insert) 777 { 778 struct btrfs_fs_info *fs_info = trans->fs_info; 779 struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); 780 struct btrfs_key key; 781 struct extent_buffer *leaf; 782 struct btrfs_extent_item *ei; 783 struct btrfs_extent_inline_ref *iref; 784 u64 flags; 785 u64 item_size; 786 unsigned long ptr; 787 unsigned long end; 788 int extra_size; 789 int type; 790 int want; 791 int ret; 792 int err = 0; 793 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 794 int needed; 795 796 key.objectid = bytenr; 797 key.type = BTRFS_EXTENT_ITEM_KEY; 798 key.offset = num_bytes; 799 800 want = extent_ref_type(parent, owner); 801 if (insert) { 802 extra_size = btrfs_extent_inline_ref_size(want); 803 path->search_for_extension = 1; 804 path->keep_locks = 1; 805 } else 806 extra_size = -1; 807 808 /* 809 * Owner is our level, so we can just add one to get the level for the 810 * block we are interested in. 811 */ 812 if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { 813 key.type = BTRFS_METADATA_ITEM_KEY; 814 key.offset = owner; 815 } 816 817 again: 818 ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); 819 if (ret < 0) { 820 err = ret; 821 goto out; 822 } 823 824 /* 825 * We may be a newly converted file system which still has the old fat 826 * extent entries for metadata, so try and see if we have one of those. 
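	 * (A "fat" entry is a BTRFS_EXTENT_ITEM_KEY item keyed by byte
	 * length, as opposed to the skinny BTRFS_METADATA_ITEM_KEY item
	 * keyed by level.)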
827 */ 828 if (ret > 0 && skinny_metadata) { 829 skinny_metadata = false; 830 if (path->slots[0]) { 831 path->slots[0]--; 832 btrfs_item_key_to_cpu(path->nodes[0], &key, 833 path->slots[0]); 834 if (key.objectid == bytenr && 835 key.type == BTRFS_EXTENT_ITEM_KEY && 836 key.offset == num_bytes) 837 ret = 0; 838 } 839 if (ret) { 840 key.objectid = bytenr; 841 key.type = BTRFS_EXTENT_ITEM_KEY; 842 key.offset = num_bytes; 843 btrfs_release_path(path); 844 goto again; 845 } 846 } 847 848 if (ret && !insert) { 849 err = -ENOENT; 850 goto out; 851 } else if (WARN_ON(ret)) { 852 btrfs_print_leaf(path->nodes[0]); 853 btrfs_err(fs_info, 854 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", 855 bytenr, num_bytes, parent, root_objectid, owner, 856 offset); 857 err = -EIO; 858 goto out; 859 } 860 861 leaf = path->nodes[0]; 862 item_size = btrfs_item_size(leaf, path->slots[0]); 863 if (unlikely(item_size < sizeof(*ei))) { 864 err = -EUCLEAN; 865 btrfs_err(fs_info, 866 "unexpected extent item size, has %llu expect >= %zu", 867 item_size, sizeof(*ei)); 868 btrfs_abort_transaction(trans, err); 869 goto out; 870 } 871 872 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 873 flags = btrfs_extent_flags(leaf, ei); 874 875 ptr = (unsigned long)(ei + 1); 876 end = (unsigned long)ei + item_size; 877 878 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { 879 ptr += sizeof(struct btrfs_tree_block_info); 880 BUG_ON(ptr > end); 881 } 882 883 if (owner >= BTRFS_FIRST_FREE_OBJECTID) 884 needed = BTRFS_REF_TYPE_DATA; 885 else 886 needed = BTRFS_REF_TYPE_BLOCK; 887 888 err = -ENOENT; 889 while (1) { 890 if (ptr >= end) { 891 if (ptr > end) { 892 err = -EUCLEAN; 893 btrfs_print_leaf(path->nodes[0]); 894 btrfs_crit(fs_info, 895 "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", 896 path->slots[0], root_objectid, owner, offset, parent); 897 } 898 break; 899 } 900 iref = (struct btrfs_extent_inline_ref *)ptr; 901 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); 902 if (type == BTRFS_REF_TYPE_INVALID) { 903 err = -EUCLEAN; 904 goto out; 905 } 906 907 if (want < type) 908 break; 909 if (want > type) { 910 ptr += btrfs_extent_inline_ref_size(type); 911 continue; 912 } 913 914 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 915 struct btrfs_extent_data_ref *dref; 916 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 917 if (match_extent_data_ref(leaf, dref, root_objectid, 918 owner, offset)) { 919 err = 0; 920 break; 921 } 922 if (hash_extent_data_ref_item(leaf, dref) < 923 hash_extent_data_ref(root_objectid, owner, offset)) 924 break; 925 } else { 926 u64 ref_offset; 927 ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); 928 if (parent > 0) { 929 if (parent == ref_offset) { 930 err = 0; 931 break; 932 } 933 if (ref_offset < parent) 934 break; 935 } else { 936 if (root_objectid == ref_offset) { 937 err = 0; 938 break; 939 } 940 if (ref_offset < root_objectid) 941 break; 942 } 943 } 944 ptr += btrfs_extent_inline_ref_size(type); 945 } 946 if (err == -ENOENT && insert) { 947 if (item_size + extra_size >= 948 BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { 949 err = -EAGAIN; 950 goto out; 951 } 952 /* 953 * To add new inline back ref, we have to make sure 954 * there is no corresponding back ref item. 
955 * For simplicity, we just do not add new inline back 956 * ref if there is any kind of item for this block 957 */ 958 if (find_next_key(path, 0, &key) == 0 && 959 key.objectid == bytenr && 960 key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { 961 err = -EAGAIN; 962 goto out; 963 } 964 } 965 *ref_ret = (struct btrfs_extent_inline_ref *)ptr; 966 out: 967 if (insert) { 968 path->keep_locks = 0; 969 path->search_for_extension = 0; 970 btrfs_unlock_up_safe(path, 1); 971 } 972 return err; 973 } 974 975 /* 976 * helper to add new inline back ref 977 */ 978 static noinline_for_stack 979 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, 980 struct btrfs_path *path, 981 struct btrfs_extent_inline_ref *iref, 982 u64 parent, u64 root_objectid, 983 u64 owner, u64 offset, int refs_to_add, 984 struct btrfs_delayed_extent_op *extent_op) 985 { 986 struct extent_buffer *leaf; 987 struct btrfs_extent_item *ei; 988 unsigned long ptr; 989 unsigned long end; 990 unsigned long item_offset; 991 u64 refs; 992 int size; 993 int type; 994 995 leaf = path->nodes[0]; 996 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 997 item_offset = (unsigned long)iref - (unsigned long)ei; 998 999 type = extent_ref_type(parent, owner); 1000 size = btrfs_extent_inline_ref_size(type); 1001 1002 btrfs_extend_item(path, size); 1003 1004 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1005 refs = btrfs_extent_refs(leaf, ei); 1006 refs += refs_to_add; 1007 btrfs_set_extent_refs(leaf, ei, refs); 1008 if (extent_op) 1009 __run_delayed_extent_op(extent_op, leaf, ei); 1010 1011 ptr = (unsigned long)ei + item_offset; 1012 end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); 1013 if (ptr < end - size) 1014 memmove_extent_buffer(leaf, ptr + size, ptr, 1015 end - size - ptr); 1016 1017 iref = (struct btrfs_extent_inline_ref *)ptr; 1018 btrfs_set_extent_inline_ref_type(leaf, iref, type); 1019 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1020 struct btrfs_extent_data_ref *dref; 1021 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1022 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); 1023 btrfs_set_extent_data_ref_objectid(leaf, dref, owner); 1024 btrfs_set_extent_data_ref_offset(leaf, dref, offset); 1025 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); 1026 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1027 struct btrfs_shared_data_ref *sref; 1028 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1029 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); 1030 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1031 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { 1032 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 1033 } else { 1034 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); 1035 } 1036 btrfs_mark_buffer_dirty(leaf); 1037 } 1038 1039 static int lookup_extent_backref(struct btrfs_trans_handle *trans, 1040 struct btrfs_path *path, 1041 struct btrfs_extent_inline_ref **ref_ret, 1042 u64 bytenr, u64 num_bytes, u64 parent, 1043 u64 root_objectid, u64 owner, u64 offset) 1044 { 1045 int ret; 1046 1047 ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, 1048 num_bytes, parent, root_objectid, 1049 owner, offset, 0); 1050 if (ret != -ENOENT) 1051 return ret; 1052 1053 btrfs_release_path(path); 1054 *ref_ret = NULL; 1055 1056 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1057 ret = lookup_tree_block_ref(trans, path, bytenr, parent, 1058 root_objectid); 1059 } else { 1060 ret = lookup_extent_data_ref(trans, path, bytenr, 
parent, 1061 root_objectid, owner, offset); 1062 } 1063 return ret; 1064 } 1065 1066 /* 1067 * helper to update/remove inline back ref 1068 */ 1069 static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path, 1070 struct btrfs_extent_inline_ref *iref, 1071 int refs_to_mod, 1072 struct btrfs_delayed_extent_op *extent_op) 1073 { 1074 struct extent_buffer *leaf = path->nodes[0]; 1075 struct btrfs_fs_info *fs_info = leaf->fs_info; 1076 struct btrfs_extent_item *ei; 1077 struct btrfs_extent_data_ref *dref = NULL; 1078 struct btrfs_shared_data_ref *sref = NULL; 1079 unsigned long ptr; 1080 unsigned long end; 1081 u32 item_size; 1082 int size; 1083 int type; 1084 u64 refs; 1085 1086 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1087 refs = btrfs_extent_refs(leaf, ei); 1088 if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { 1089 struct btrfs_key key; 1090 u32 extent_size; 1091 1092 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1093 if (key.type == BTRFS_METADATA_ITEM_KEY) 1094 extent_size = fs_info->nodesize; 1095 else 1096 extent_size = key.offset; 1097 btrfs_print_leaf(leaf); 1098 btrfs_err(fs_info, 1099 "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", 1100 key.objectid, extent_size, refs_to_mod, refs); 1101 return -EUCLEAN; 1102 } 1103 refs += refs_to_mod; 1104 btrfs_set_extent_refs(leaf, ei, refs); 1105 if (extent_op) 1106 __run_delayed_extent_op(extent_op, leaf, ei); 1107 1108 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); 1109 /* 1110 * Function btrfs_get_extent_inline_ref_type() has already printed 1111 * error messages. 1112 */ 1113 if (unlikely(type == BTRFS_REF_TYPE_INVALID)) 1114 return -EUCLEAN; 1115 1116 if (type == BTRFS_EXTENT_DATA_REF_KEY) { 1117 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 1118 refs = btrfs_extent_data_ref_count(leaf, dref); 1119 } else if (type == BTRFS_SHARED_DATA_REF_KEY) { 1120 sref = (struct btrfs_shared_data_ref *)(iref + 1); 1121 refs = btrfs_shared_data_ref_count(leaf, sref); 1122 } else { 1123 refs = 1; 1124 /* 1125 * For tree blocks we can only drop one ref for it, and tree 1126 * blocks should not have refs > 1. 1127 * 1128 * Furthermore if we're inserting a new inline backref, we 1129 * won't reach this path either. That would be 1130 * setup_inline_extent_backref(). 
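		 * So in the tree block case refs stays at 1 and the only
		 * modification accepted below is refs_to_mod == -1.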
1131 */ 1132 if (unlikely(refs_to_mod != -1)) { 1133 struct btrfs_key key; 1134 1135 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1136 1137 btrfs_print_leaf(leaf); 1138 btrfs_err(fs_info, 1139 "invalid refs_to_mod for tree block %llu, has %d expect -1", 1140 key.objectid, refs_to_mod); 1141 return -EUCLEAN; 1142 } 1143 } 1144 1145 if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { 1146 struct btrfs_key key; 1147 u32 extent_size; 1148 1149 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1150 if (key.type == BTRFS_METADATA_ITEM_KEY) 1151 extent_size = fs_info->nodesize; 1152 else 1153 extent_size = key.offset; 1154 btrfs_print_leaf(leaf); 1155 btrfs_err(fs_info, 1156 "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", 1157 (unsigned long)iref, key.objectid, extent_size, 1158 refs_to_mod, refs); 1159 return -EUCLEAN; 1160 } 1161 refs += refs_to_mod; 1162 1163 if (refs > 0) { 1164 if (type == BTRFS_EXTENT_DATA_REF_KEY) 1165 btrfs_set_extent_data_ref_count(leaf, dref, refs); 1166 else 1167 btrfs_set_shared_data_ref_count(leaf, sref, refs); 1168 } else { 1169 size = btrfs_extent_inline_ref_size(type); 1170 item_size = btrfs_item_size(leaf, path->slots[0]); 1171 ptr = (unsigned long)iref; 1172 end = (unsigned long)ei + item_size; 1173 if (ptr + size < end) 1174 memmove_extent_buffer(leaf, ptr, ptr + size, 1175 end - ptr - size); 1176 item_size -= size; 1177 btrfs_truncate_item(path, item_size, 1); 1178 } 1179 btrfs_mark_buffer_dirty(leaf); 1180 return 0; 1181 } 1182 1183 static noinline_for_stack 1184 int insert_inline_extent_backref(struct btrfs_trans_handle *trans, 1185 struct btrfs_path *path, 1186 u64 bytenr, u64 num_bytes, u64 parent, 1187 u64 root_objectid, u64 owner, 1188 u64 offset, int refs_to_add, 1189 struct btrfs_delayed_extent_op *extent_op) 1190 { 1191 struct btrfs_extent_inline_ref *iref; 1192 int ret; 1193 1194 ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, 1195 num_bytes, parent, root_objectid, 1196 owner, offset, 1); 1197 if (ret == 0) { 1198 /* 1199 * We're adding refs to a tree block we already own, this 1200 * should not happen at all. 
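		 * A tree block carries at most one ref per root, so finding
		 * an existing inline ref while adding one indicates
		 * corruption and is reported as -EUCLEAN.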
1201 */ 1202 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1203 btrfs_print_leaf(path->nodes[0]); 1204 btrfs_crit(trans->fs_info, 1205 "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", 1206 bytenr, num_bytes, root_objectid, path->slots[0]); 1207 return -EUCLEAN; 1208 } 1209 ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op); 1210 } else if (ret == -ENOENT) { 1211 setup_inline_extent_backref(trans->fs_info, path, iref, parent, 1212 root_objectid, owner, offset, 1213 refs_to_add, extent_op); 1214 ret = 0; 1215 } 1216 return ret; 1217 } 1218 1219 static int remove_extent_backref(struct btrfs_trans_handle *trans, 1220 struct btrfs_root *root, 1221 struct btrfs_path *path, 1222 struct btrfs_extent_inline_ref *iref, 1223 int refs_to_drop, int is_data) 1224 { 1225 int ret = 0; 1226 1227 BUG_ON(!is_data && refs_to_drop != 1); 1228 if (iref) 1229 ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL); 1230 else if (is_data) 1231 ret = remove_extent_data_ref(trans, root, path, refs_to_drop); 1232 else 1233 ret = btrfs_del_item(trans, root, path); 1234 return ret; 1235 } 1236 1237 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, 1238 u64 *discarded_bytes) 1239 { 1240 int j, ret = 0; 1241 u64 bytes_left, end; 1242 u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); 1243 1244 if (WARN_ON(start != aligned_start)) { 1245 len -= aligned_start - start; 1246 len = round_down(len, 1 << SECTOR_SHIFT); 1247 start = aligned_start; 1248 } 1249 1250 *discarded_bytes = 0; 1251 1252 if (!len) 1253 return 0; 1254 1255 end = start + len; 1256 bytes_left = len; 1257 1258 /* Skip any superblocks on this device. */ 1259 for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { 1260 u64 sb_start = btrfs_sb_offset(j); 1261 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; 1262 u64 size = sb_start - start; 1263 1264 if (!in_range(sb_start, start, bytes_left) && 1265 !in_range(sb_end, start, bytes_left) && 1266 !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) 1267 continue; 1268 1269 /* 1270 * Superblock spans beginning of range. Adjust start and 1271 * try again. 
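		 * (The discard resumes at sb_end, so the superblock mirror
		 * itself is never discarded.)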
1272 */ 1273 if (sb_start <= start) { 1274 start += sb_end - start; 1275 if (start > end) { 1276 bytes_left = 0; 1277 break; 1278 } 1279 bytes_left = end - start; 1280 continue; 1281 } 1282 1283 if (size) { 1284 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1285 size >> SECTOR_SHIFT, 1286 GFP_NOFS); 1287 if (!ret) 1288 *discarded_bytes += size; 1289 else if (ret != -EOPNOTSUPP) 1290 return ret; 1291 } 1292 1293 start = sb_end; 1294 if (start > end) { 1295 bytes_left = 0; 1296 break; 1297 } 1298 bytes_left = end - start; 1299 } 1300 1301 if (bytes_left) { 1302 ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, 1303 bytes_left >> SECTOR_SHIFT, 1304 GFP_NOFS); 1305 if (!ret) 1306 *discarded_bytes += bytes_left; 1307 } 1308 return ret; 1309 } 1310 1311 static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) 1312 { 1313 struct btrfs_device *dev = stripe->dev; 1314 struct btrfs_fs_info *fs_info = dev->fs_info; 1315 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1316 u64 phys = stripe->physical; 1317 u64 len = stripe->length; 1318 u64 discarded = 0; 1319 int ret = 0; 1320 1321 /* Zone reset on a zoned filesystem */ 1322 if (btrfs_can_zone_reset(dev, phys, len)) { 1323 u64 src_disc; 1324 1325 ret = btrfs_reset_device_zone(dev, phys, len, &discarded); 1326 if (ret) 1327 goto out; 1328 1329 if (!btrfs_dev_replace_is_ongoing(dev_replace) || 1330 dev != dev_replace->srcdev) 1331 goto out; 1332 1333 src_disc = discarded; 1334 1335 /* Send to replace target as well */ 1336 ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, 1337 &discarded); 1338 discarded += src_disc; 1339 } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { 1340 ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); 1341 } else { 1342 ret = 0; 1343 *bytes = 0; 1344 } 1345 1346 out: 1347 *bytes = discarded; 1348 return ret; 1349 } 1350 1351 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1352 u64 num_bytes, u64 *actual_bytes) 1353 { 1354 int ret = 0; 1355 u64 discarded_bytes = 0; 1356 u64 end = bytenr + num_bytes; 1357 u64 cur = bytenr; 1358 1359 /* 1360 * Avoid races with device replace and make sure the devices in the 1361 * stripes don't go away while we are discarding. 1362 */ 1363 btrfs_bio_counter_inc_blocked(fs_info); 1364 while (cur < end) { 1365 struct btrfs_discard_stripe *stripes; 1366 unsigned int num_stripes; 1367 int i; 1368 1369 num_bytes = end - cur; 1370 stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); 1371 if (IS_ERR(stripes)) { 1372 ret = PTR_ERR(stripes); 1373 if (ret == -EOPNOTSUPP) 1374 ret = 0; 1375 break; 1376 } 1377 1378 for (i = 0; i < num_stripes; i++) { 1379 struct btrfs_discard_stripe *stripe = stripes + i; 1380 u64 bytes; 1381 1382 if (!stripe->dev->bdev) { 1383 ASSERT(btrfs_test_opt(fs_info, DEGRADED)); 1384 continue; 1385 } 1386 1387 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 1388 &stripe->dev->dev_state)) 1389 continue; 1390 1391 ret = do_discard_extent(stripe, &bytes); 1392 if (ret) { 1393 /* 1394 * Keep going if discard is not supported by the 1395 * device. 
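				 * -EOPNOTSUPP is swallowed here and the
				 * stripe simply contributes no discarded
				 * bytes.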
				 */
				if (ret != -EOPNOTSUPP)
					break;
				ret = 0;
			} else {
				discarded_bytes += bytes;
			}
		}
		kfree(stripes);
		if (ret)
			break;
		cur += num_bytes;
	}
	btrfs_bio_counter_dec(fs_info);
	if (actual_bytes)
		*actual_bytes = discarded_bytes;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * The counterpart is in __btrfs_free_extent(), with examples and more details
 * on how it works.
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for the
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the
 *		    case when relocating a shared extent. In that case,
 *		    root_objectid will be BTRFS_TREE_RELOC_OBJECTID. Otherwise,
 *		    parent must be 0.
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the file offset
 *		    this extent belongs to.
 *
 * @refs_to_add:    Number of references to add
 *
 * @extent_op:	    Pointer to a structure, holding information necessary when
 *		    updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* This will setup the path even if it fails to insert the back ref. */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/* Now insert the actual backref. */
	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	else
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);

	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				bool insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct
extent_buffer *leaf, 1579 struct btrfs_extent_item *ei) 1580 { 1581 u64 flags = btrfs_extent_flags(leaf, ei); 1582 if (extent_op->update_flags) { 1583 flags |= extent_op->flags_to_set; 1584 btrfs_set_extent_flags(leaf, ei, flags); 1585 } 1586 1587 if (extent_op->update_key) { 1588 struct btrfs_tree_block_info *bi; 1589 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); 1590 bi = (struct btrfs_tree_block_info *)(ei + 1); 1591 btrfs_set_tree_block_key(leaf, bi, &extent_op->key); 1592 } 1593 } 1594 1595 static int run_delayed_extent_op(struct btrfs_trans_handle *trans, 1596 struct btrfs_delayed_ref_head *head, 1597 struct btrfs_delayed_extent_op *extent_op) 1598 { 1599 struct btrfs_fs_info *fs_info = trans->fs_info; 1600 struct btrfs_root *root; 1601 struct btrfs_key key; 1602 struct btrfs_path *path; 1603 struct btrfs_extent_item *ei; 1604 struct extent_buffer *leaf; 1605 u32 item_size; 1606 int ret; 1607 int err = 0; 1608 int metadata = 1; 1609 1610 if (TRANS_ABORTED(trans)) 1611 return 0; 1612 1613 if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 1614 metadata = 0; 1615 1616 path = btrfs_alloc_path(); 1617 if (!path) 1618 return -ENOMEM; 1619 1620 key.objectid = head->bytenr; 1621 1622 if (metadata) { 1623 key.type = BTRFS_METADATA_ITEM_KEY; 1624 key.offset = extent_op->level; 1625 } else { 1626 key.type = BTRFS_EXTENT_ITEM_KEY; 1627 key.offset = head->num_bytes; 1628 } 1629 1630 root = btrfs_extent_root(fs_info, key.objectid); 1631 again: 1632 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1633 if (ret < 0) { 1634 err = ret; 1635 goto out; 1636 } 1637 if (ret > 0) { 1638 if (metadata) { 1639 if (path->slots[0] > 0) { 1640 path->slots[0]--; 1641 btrfs_item_key_to_cpu(path->nodes[0], &key, 1642 path->slots[0]); 1643 if (key.objectid == head->bytenr && 1644 key.type == BTRFS_EXTENT_ITEM_KEY && 1645 key.offset == head->num_bytes) 1646 ret = 0; 1647 } 1648 if (ret > 0) { 1649 btrfs_release_path(path); 1650 metadata = 0; 1651 1652 key.objectid = head->bytenr; 1653 key.offset = head->num_bytes; 1654 key.type = BTRFS_EXTENT_ITEM_KEY; 1655 goto again; 1656 } 1657 } else { 1658 err = -EUCLEAN; 1659 btrfs_err(fs_info, 1660 "missing extent item for extent %llu num_bytes %llu level %d", 1661 head->bytenr, head->num_bytes, extent_op->level); 1662 goto out; 1663 } 1664 } 1665 1666 leaf = path->nodes[0]; 1667 item_size = btrfs_item_size(leaf, path->slots[0]); 1668 1669 if (unlikely(item_size < sizeof(*ei))) { 1670 err = -EUCLEAN; 1671 btrfs_err(fs_info, 1672 "unexpected extent item size, has %u expect >= %zu", 1673 item_size, sizeof(*ei)); 1674 btrfs_abort_transaction(trans, err); 1675 goto out; 1676 } 1677 1678 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1679 __run_delayed_extent_op(extent_op, leaf, ei); 1680 1681 btrfs_mark_buffer_dirty(leaf); 1682 out: 1683 btrfs_free_path(path); 1684 return err; 1685 } 1686 1687 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, 1688 struct btrfs_delayed_ref_node *node, 1689 struct btrfs_delayed_extent_op *extent_op, 1690 bool insert_reserved) 1691 { 1692 int ret = 0; 1693 struct btrfs_delayed_tree_ref *ref; 1694 u64 parent = 0; 1695 u64 ref_root = 0; 1696 1697 ref = btrfs_delayed_node_to_tree_ref(node); 1698 trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); 1699 1700 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1701 parent = ref->parent; 1702 ref_root = ref->root; 1703 1704 if (unlikely(node->ref_mod != 1)) { 1705 btrfs_err(trans->fs_info, 1706 "btree block %llu has %d references rather than 1: 
action %d ref_root %llu parent %llu", 1707 node->bytenr, node->ref_mod, node->action, ref_root, 1708 parent); 1709 return -EUCLEAN; 1710 } 1711 if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { 1712 BUG_ON(!extent_op || !extent_op->update_flags); 1713 ret = alloc_reserved_tree_block(trans, node, extent_op); 1714 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1715 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, 1716 ref->level, 0, 1, extent_op); 1717 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1718 ret = __btrfs_free_extent(trans, node, parent, ref_root, 1719 ref->level, 0, 1, extent_op); 1720 } else { 1721 BUG(); 1722 } 1723 return ret; 1724 } 1725 1726 /* helper function to actually process a single delayed ref entry */ 1727 static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1728 struct btrfs_delayed_ref_node *node, 1729 struct btrfs_delayed_extent_op *extent_op, 1730 bool insert_reserved) 1731 { 1732 int ret = 0; 1733 1734 if (TRANS_ABORTED(trans)) { 1735 if (insert_reserved) 1736 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1737 return 0; 1738 } 1739 1740 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1741 node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1742 ret = run_delayed_tree_ref(trans, node, extent_op, 1743 insert_reserved); 1744 else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1745 node->type == BTRFS_SHARED_DATA_REF_KEY) 1746 ret = run_delayed_data_ref(trans, node, extent_op, 1747 insert_reserved); 1748 else 1749 BUG(); 1750 if (ret && insert_reserved) 1751 btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); 1752 if (ret < 0) 1753 btrfs_err(trans->fs_info, 1754 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", 1755 node->bytenr, node->num_bytes, node->type, 1756 node->action, node->ref_mod, ret); 1757 return ret; 1758 } 1759 1760 static inline struct btrfs_delayed_ref_node * 1761 select_delayed_ref(struct btrfs_delayed_ref_head *head) 1762 { 1763 struct btrfs_delayed_ref_node *ref; 1764 1765 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 1766 return NULL; 1767 1768 /* 1769 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. 1770 * This is to prevent a ref count from going down to zero, which deletes 1771 * the extent item from the extent tree, when there still are references 1772 * to add, which would fail because they would not find the extent item. 
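	 * Refs queued for addition sit on ref_add_list, so picking from that
	 * list first is enough; otherwise fall back to the smallest entry in
	 * the rbtree.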
1773 */ 1774 if (!list_empty(&head->ref_add_list)) 1775 return list_first_entry(&head->ref_add_list, 1776 struct btrfs_delayed_ref_node, add_list); 1777 1778 ref = rb_entry(rb_first_cached(&head->ref_tree), 1779 struct btrfs_delayed_ref_node, ref_node); 1780 ASSERT(list_empty(&ref->add_list)); 1781 return ref; 1782 } 1783 1784 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 1785 struct btrfs_delayed_ref_head *head) 1786 { 1787 spin_lock(&delayed_refs->lock); 1788 head->processing = false; 1789 delayed_refs->num_heads_ready++; 1790 spin_unlock(&delayed_refs->lock); 1791 btrfs_delayed_ref_unlock(head); 1792 } 1793 1794 static struct btrfs_delayed_extent_op *cleanup_extent_op( 1795 struct btrfs_delayed_ref_head *head) 1796 { 1797 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 1798 1799 if (!extent_op) 1800 return NULL; 1801 1802 if (head->must_insert_reserved) { 1803 head->extent_op = NULL; 1804 btrfs_free_delayed_extent_op(extent_op); 1805 return NULL; 1806 } 1807 return extent_op; 1808 } 1809 1810 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, 1811 struct btrfs_delayed_ref_head *head) 1812 { 1813 struct btrfs_delayed_extent_op *extent_op; 1814 int ret; 1815 1816 extent_op = cleanup_extent_op(head); 1817 if (!extent_op) 1818 return 0; 1819 head->extent_op = NULL; 1820 spin_unlock(&head->lock); 1821 ret = run_delayed_extent_op(trans, head, extent_op); 1822 btrfs_free_delayed_extent_op(extent_op); 1823 return ret ? ret : 1; 1824 } 1825 1826 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, 1827 struct btrfs_delayed_ref_root *delayed_refs, 1828 struct btrfs_delayed_ref_head *head) 1829 { 1830 int nr_items = 1; /* Dropping this ref head update. */ 1831 1832 /* 1833 * We had csum deletions accounted for in our delayed refs rsv, we need 1834 * to drop the csum leaves for this update from our delayed_refs_rsv. 1835 */ 1836 if (head->total_ref_mod < 0 && head->is_data) { 1837 spin_lock(&delayed_refs->lock); 1838 delayed_refs->pending_csums -= head->num_bytes; 1839 spin_unlock(&delayed_refs->lock); 1840 nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); 1841 } 1842 1843 btrfs_delayed_refs_rsv_release(fs_info, nr_items); 1844 } 1845 1846 static int cleanup_ref_head(struct btrfs_trans_handle *trans, 1847 struct btrfs_delayed_ref_head *head) 1848 { 1849 1850 struct btrfs_fs_info *fs_info = trans->fs_info; 1851 struct btrfs_delayed_ref_root *delayed_refs; 1852 int ret; 1853 1854 delayed_refs = &trans->transaction->delayed_refs; 1855 1856 ret = run_and_cleanup_extent_op(trans, head); 1857 if (ret < 0) { 1858 unselect_delayed_ref_head(delayed_refs, head); 1859 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 1860 return ret; 1861 } else if (ret) { 1862 return ret; 1863 } 1864 1865 /* 1866 * Need to drop our head ref lock and re-acquire the delayed ref lock 1867 * and then re-check to make sure nobody got added. 
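	 * The lock order is delayed_refs->lock before head->lock, which is
	 * why head->lock cannot simply be kept held across the re-lock.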
1868 */ 1869 spin_unlock(&head->lock); 1870 spin_lock(&delayed_refs->lock); 1871 spin_lock(&head->lock); 1872 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { 1873 spin_unlock(&head->lock); 1874 spin_unlock(&delayed_refs->lock); 1875 return 1; 1876 } 1877 btrfs_delete_ref_head(delayed_refs, head); 1878 spin_unlock(&head->lock); 1879 spin_unlock(&delayed_refs->lock); 1880 1881 if (head->must_insert_reserved) { 1882 btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); 1883 if (head->is_data) { 1884 struct btrfs_root *csum_root; 1885 1886 csum_root = btrfs_csum_root(fs_info, head->bytenr); 1887 ret = btrfs_del_csums(trans, csum_root, head->bytenr, 1888 head->num_bytes); 1889 } 1890 } 1891 1892 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 1893 1894 trace_run_delayed_ref_head(fs_info, head, 0); 1895 btrfs_delayed_ref_unlock(head); 1896 btrfs_put_delayed_ref_head(head); 1897 return ret; 1898 } 1899 1900 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( 1901 struct btrfs_trans_handle *trans) 1902 { 1903 struct btrfs_delayed_ref_root *delayed_refs = 1904 &trans->transaction->delayed_refs; 1905 struct btrfs_delayed_ref_head *head = NULL; 1906 int ret; 1907 1908 spin_lock(&delayed_refs->lock); 1909 head = btrfs_select_ref_head(delayed_refs); 1910 if (!head) { 1911 spin_unlock(&delayed_refs->lock); 1912 return head; 1913 } 1914 1915 /* 1916 * Grab the lock that says we are going to process all the refs for 1917 * this head 1918 */ 1919 ret = btrfs_delayed_ref_lock(delayed_refs, head); 1920 spin_unlock(&delayed_refs->lock); 1921 1922 /* 1923 * We may have dropped the spin lock to get the head mutex lock, and 1924 * that might have given someone else time to free the head. If that's 1925 * true, it has been removed from our list and we can move on. 1926 */ 1927 if (ret == -EAGAIN) 1928 head = ERR_PTR(-EAGAIN); 1929 1930 return head; 1931 } 1932 1933 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, 1934 struct btrfs_delayed_ref_head *locked_ref) 1935 { 1936 struct btrfs_fs_info *fs_info = trans->fs_info; 1937 struct btrfs_delayed_ref_root *delayed_refs; 1938 struct btrfs_delayed_extent_op *extent_op; 1939 struct btrfs_delayed_ref_node *ref; 1940 bool must_insert_reserved; 1941 int ret; 1942 1943 delayed_refs = &trans->transaction->delayed_refs; 1944 1945 lockdep_assert_held(&locked_ref->mutex); 1946 lockdep_assert_held(&locked_ref->lock); 1947 1948 while ((ref = select_delayed_ref(locked_ref))) { 1949 if (ref->seq && 1950 btrfs_check_delayed_seq(fs_info, ref->seq)) { 1951 spin_unlock(&locked_ref->lock); 1952 unselect_delayed_ref_head(delayed_refs, locked_ref); 1953 return -EAGAIN; 1954 } 1955 1956 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); 1957 RB_CLEAR_NODE(&ref->ref_node); 1958 if (!list_empty(&ref->add_list)) 1959 list_del(&ref->add_list); 1960 /* 1961 * When we play the delayed ref, also correct the ref_mod on 1962 * head 1963 */ 1964 switch (ref->action) { 1965 case BTRFS_ADD_DELAYED_REF: 1966 case BTRFS_ADD_DELAYED_EXTENT: 1967 locked_ref->ref_mod -= ref->ref_mod; 1968 break; 1969 case BTRFS_DROP_DELAYED_REF: 1970 locked_ref->ref_mod += ref->ref_mod; 1971 break; 1972 default: 1973 WARN_ON(1); 1974 } 1975 atomic_dec(&delayed_refs->num_entries); 1976 1977 /* 1978 * Record the must_insert_reserved flag before we drop the 1979 * spin lock. 
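 * We take a local copy and clear the flag while still holding
 * locked_ref->lock, since run_one_delayed_ref() below runs without that
 * lock held.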
1980 */ 1981 must_insert_reserved = locked_ref->must_insert_reserved; 1982 locked_ref->must_insert_reserved = false; 1983 1984 extent_op = locked_ref->extent_op; 1985 locked_ref->extent_op = NULL; 1986 spin_unlock(&locked_ref->lock); 1987 1988 ret = run_one_delayed_ref(trans, ref, extent_op, 1989 must_insert_reserved); 1990 1991 btrfs_free_delayed_extent_op(extent_op); 1992 if (ret) { 1993 unselect_delayed_ref_head(delayed_refs, locked_ref); 1994 btrfs_put_delayed_ref(ref); 1995 return ret; 1996 } 1997 1998 btrfs_put_delayed_ref(ref); 1999 cond_resched(); 2000 2001 spin_lock(&locked_ref->lock); 2002 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 2003 } 2004 2005 return 0; 2006 } 2007 2008 /* 2009 * Returns 0 on success or if called with an already aborted transaction. 2010 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2011 */ 2012 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2013 unsigned long nr) 2014 { 2015 struct btrfs_fs_info *fs_info = trans->fs_info; 2016 struct btrfs_delayed_ref_root *delayed_refs; 2017 struct btrfs_delayed_ref_head *locked_ref = NULL; 2018 int ret; 2019 unsigned long count = 0; 2020 2021 delayed_refs = &trans->transaction->delayed_refs; 2022 do { 2023 if (!locked_ref) { 2024 locked_ref = btrfs_obtain_ref_head(trans); 2025 if (IS_ERR_OR_NULL(locked_ref)) { 2026 if (PTR_ERR(locked_ref) == -EAGAIN) { 2027 continue; 2028 } else { 2029 break; 2030 } 2031 } 2032 count++; 2033 } 2034 /* 2035 * We need to try and merge add/drops of the same ref since we 2036 * can run into issues with relocate dropping the implicit ref 2037 * and then it being added back again before the drop can 2038 * finish. If we merged anything we need to re-loop so we can 2039 * get a good ref. 2040 * Or we can get node references of the same type that weren't 2041 * merged when created due to bumps in the tree mod seq, and 2042 * we need to merge them to prevent adding an inline extent 2043 * backref before dropping it (triggering a BUG_ON at 2044 * insert_inline_extent_backref()). 2045 */ 2046 spin_lock(&locked_ref->lock); 2047 btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); 2048 2049 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); 2050 if (ret < 0 && ret != -EAGAIN) { 2051 /* 2052 * Error, btrfs_run_delayed_refs_for_head already 2053 * unlocked everything so just bail out 2054 */ 2055 return ret; 2056 } else if (!ret) { 2057 /* 2058 * Success, perform the usual cleanup of a processed 2059 * head 2060 */ 2061 ret = cleanup_ref_head(trans, locked_ref); 2062 if (ret > 0 ) { 2063 /* We dropped our lock, we need to loop. */ 2064 ret = 0; 2065 continue; 2066 } else if (ret) { 2067 return ret; 2068 } 2069 } 2070 2071 /* 2072 * Either success case or btrfs_run_delayed_refs_for_head 2073 * returned -EAGAIN, meaning we need to select another head 2074 */ 2075 2076 locked_ref = NULL; 2077 cond_resched(); 2078 } while ((nr != -1 && count < nr) || locked_ref); 2079 2080 return 0; 2081 } 2082 2083 #ifdef SCRAMBLE_DELAYED_REFS 2084 /* 2085 * Normally delayed refs get processed in ascending bytenr order. This 2086 * correlates in most cases to the order added. 
To expose dependencies on this 2087 * order, we start to process the tree in the middle instead of the beginning 2088 */ 2089 static u64 find_middle(struct rb_root *root) 2090 { 2091 struct rb_node *n = root->rb_node; 2092 struct btrfs_delayed_ref_node *entry; 2093 int alt = 1; 2094 u64 middle; 2095 u64 first = 0, last = 0; 2096 2097 n = rb_first(root); 2098 if (n) { 2099 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2100 first = entry->bytenr; 2101 } 2102 n = rb_last(root); 2103 if (n) { 2104 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2105 last = entry->bytenr; 2106 } 2107 n = root->rb_node; 2108 2109 while (n) { 2110 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 2111 WARN_ON(!entry->in_tree); 2112 2113 middle = entry->bytenr; 2114 2115 if (alt) 2116 n = n->rb_left; 2117 else 2118 n = n->rb_right; 2119 2120 alt = 1 - alt; 2121 } 2122 return middle; 2123 } 2124 #endif 2125 2126 /* 2127 * this starts processing the delayed reference count updates and 2128 * extent insertions we have queued up so far. count can be 2129 * 0, which means to process everything in the tree at the start 2130 * of the run (but not newly added entries), or it can be some target 2131 * number you'd like to process. 2132 * 2133 * Returns 0 on success or if called with an aborted transaction 2134 * Returns <0 on error and aborts the transaction 2135 */ 2136 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2137 unsigned long count) 2138 { 2139 struct btrfs_fs_info *fs_info = trans->fs_info; 2140 struct rb_node *node; 2141 struct btrfs_delayed_ref_root *delayed_refs; 2142 struct btrfs_delayed_ref_head *head; 2143 int ret; 2144 int run_all = count == (unsigned long)-1; 2145 2146 /* We'll clean this up in btrfs_cleanup_transaction */ 2147 if (TRANS_ABORTED(trans)) 2148 return 0; 2149 2150 if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) 2151 return 0; 2152 2153 delayed_refs = &trans->transaction->delayed_refs; 2154 if (count == 0) 2155 count = delayed_refs->num_heads_ready; 2156 2157 again: 2158 #ifdef SCRAMBLE_DELAYED_REFS 2159 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2160 #endif 2161 ret = __btrfs_run_delayed_refs(trans, count); 2162 if (ret < 0) { 2163 btrfs_abort_transaction(trans, ret); 2164 return ret; 2165 } 2166 2167 if (run_all) { 2168 btrfs_create_pending_block_groups(trans); 2169 2170 spin_lock(&delayed_refs->lock); 2171 node = rb_first_cached(&delayed_refs->href_root); 2172 if (!node) { 2173 spin_unlock(&delayed_refs->lock); 2174 goto out; 2175 } 2176 head = rb_entry(node, struct btrfs_delayed_ref_head, 2177 href_node); 2178 refcount_inc(&head->refs); 2179 spin_unlock(&delayed_refs->lock); 2180 2181 /* Mutex was contended, block until it's released and retry. 
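 * We only take the mutex to wait for the current holder to finish; the
 * refcount_inc() above keeps the head alive while we block here.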
*/ 2182 mutex_lock(&head->mutex); 2183 mutex_unlock(&head->mutex); 2184 2185 btrfs_put_delayed_ref_head(head); 2186 cond_resched(); 2187 goto again; 2188 } 2189 out: 2190 return 0; 2191 } 2192 2193 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2194 struct extent_buffer *eb, u64 flags) 2195 { 2196 struct btrfs_delayed_extent_op *extent_op; 2197 int level = btrfs_header_level(eb); 2198 int ret; 2199 2200 extent_op = btrfs_alloc_delayed_extent_op(); 2201 if (!extent_op) 2202 return -ENOMEM; 2203 2204 extent_op->flags_to_set = flags; 2205 extent_op->update_flags = true; 2206 extent_op->update_key = false; 2207 extent_op->level = level; 2208 2209 ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); 2210 if (ret) 2211 btrfs_free_delayed_extent_op(extent_op); 2212 return ret; 2213 } 2214 2215 static noinline int check_delayed_ref(struct btrfs_root *root, 2216 struct btrfs_path *path, 2217 u64 objectid, u64 offset, u64 bytenr) 2218 { 2219 struct btrfs_delayed_ref_head *head; 2220 struct btrfs_delayed_ref_node *ref; 2221 struct btrfs_delayed_data_ref *data_ref; 2222 struct btrfs_delayed_ref_root *delayed_refs; 2223 struct btrfs_transaction *cur_trans; 2224 struct rb_node *node; 2225 int ret = 0; 2226 2227 spin_lock(&root->fs_info->trans_lock); 2228 cur_trans = root->fs_info->running_transaction; 2229 if (cur_trans) 2230 refcount_inc(&cur_trans->use_count); 2231 spin_unlock(&root->fs_info->trans_lock); 2232 if (!cur_trans) 2233 return 0; 2234 2235 delayed_refs = &cur_trans->delayed_refs; 2236 spin_lock(&delayed_refs->lock); 2237 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 2238 if (!head) { 2239 spin_unlock(&delayed_refs->lock); 2240 btrfs_put_transaction(cur_trans); 2241 return 0; 2242 } 2243 2244 if (!mutex_trylock(&head->mutex)) { 2245 if (path->nowait) { 2246 spin_unlock(&delayed_refs->lock); 2247 btrfs_put_transaction(cur_trans); 2248 return -EAGAIN; 2249 } 2250 2251 refcount_inc(&head->refs); 2252 spin_unlock(&delayed_refs->lock); 2253 2254 btrfs_release_path(path); 2255 2256 /* 2257 * Mutex was contended, block until it's released and let 2258 * caller try again 2259 */ 2260 mutex_lock(&head->mutex); 2261 mutex_unlock(&head->mutex); 2262 btrfs_put_delayed_ref_head(head); 2263 btrfs_put_transaction(cur_trans); 2264 return -EAGAIN; 2265 } 2266 spin_unlock(&delayed_refs->lock); 2267 2268 spin_lock(&head->lock); 2269 /* 2270 * XXX: We should replace this with a proper search function in the 2271 * future. 2272 */ 2273 for (node = rb_first_cached(&head->ref_tree); node; 2274 node = rb_next(node)) { 2275 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 2276 /* If it's a shared ref we know a cross reference exists */ 2277 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { 2278 ret = 1; 2279 break; 2280 } 2281 2282 data_ref = btrfs_delayed_node_to_data_ref(ref); 2283 2284 /* 2285 * If our ref doesn't match the one we're currently looking at 2286 * then we have a cross reference. 
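 * (i.e. a delayed data ref belonging to a different root, inode objectid or
 * file offset means another owner is involved with this extent, so treat it
 * as shared).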
2287 */ 2288 if (data_ref->root != root->root_key.objectid || 2289 data_ref->objectid != objectid || 2290 data_ref->offset != offset) { 2291 ret = 1; 2292 break; 2293 } 2294 } 2295 spin_unlock(&head->lock); 2296 mutex_unlock(&head->mutex); 2297 btrfs_put_transaction(cur_trans); 2298 return ret; 2299 } 2300 2301 static noinline int check_committed_ref(struct btrfs_root *root, 2302 struct btrfs_path *path, 2303 u64 objectid, u64 offset, u64 bytenr, 2304 bool strict) 2305 { 2306 struct btrfs_fs_info *fs_info = root->fs_info; 2307 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); 2308 struct extent_buffer *leaf; 2309 struct btrfs_extent_data_ref *ref; 2310 struct btrfs_extent_inline_ref *iref; 2311 struct btrfs_extent_item *ei; 2312 struct btrfs_key key; 2313 u32 item_size; 2314 int type; 2315 int ret; 2316 2317 key.objectid = bytenr; 2318 key.offset = (u64)-1; 2319 key.type = BTRFS_EXTENT_ITEM_KEY; 2320 2321 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2322 if (ret < 0) 2323 goto out; 2324 BUG_ON(ret == 0); /* Corruption */ 2325 2326 ret = -ENOENT; 2327 if (path->slots[0] == 0) 2328 goto out; 2329 2330 path->slots[0]--; 2331 leaf = path->nodes[0]; 2332 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2333 2334 if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) 2335 goto out; 2336 2337 ret = 1; 2338 item_size = btrfs_item_size(leaf, path->slots[0]); 2339 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 2340 2341 /* If extent item has more than 1 inline ref then it's shared */ 2342 if (item_size != sizeof(*ei) + 2343 btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) 2344 goto out; 2345 2346 /* 2347 * If extent created before last snapshot => it's shared unless the 2348 * snapshot has been deleted. Use the heuristic if strict is false. 
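 * Callers that need an exact answer pass strict == true and skip this
 * generation based shortcut.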
2349 */ 2350 if (!strict && 2351 (btrfs_extent_generation(leaf, ei) <= 2352 btrfs_root_last_snapshot(&root->root_item))) 2353 goto out; 2354 2355 iref = (struct btrfs_extent_inline_ref *)(ei + 1); 2356 2357 /* If this extent has SHARED_DATA_REF then it's shared */ 2358 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); 2359 if (type != BTRFS_EXTENT_DATA_REF_KEY) 2360 goto out; 2361 2362 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 2363 if (btrfs_extent_refs(leaf, ei) != 2364 btrfs_extent_data_ref_count(leaf, ref) || 2365 btrfs_extent_data_ref_root(leaf, ref) != 2366 root->root_key.objectid || 2367 btrfs_extent_data_ref_objectid(leaf, ref) != objectid || 2368 btrfs_extent_data_ref_offset(leaf, ref) != offset) 2369 goto out; 2370 2371 ret = 0; 2372 out: 2373 return ret; 2374 } 2375 2376 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, 2377 u64 bytenr, bool strict, struct btrfs_path *path) 2378 { 2379 int ret; 2380 2381 do { 2382 ret = check_committed_ref(root, path, objectid, 2383 offset, bytenr, strict); 2384 if (ret && ret != -ENOENT) 2385 goto out; 2386 2387 ret = check_delayed_ref(root, path, objectid, offset, bytenr); 2388 } while (ret == -EAGAIN); 2389 2390 out: 2391 btrfs_release_path(path); 2392 if (btrfs_is_data_reloc_root(root)) 2393 WARN_ON(ret > 0); 2394 return ret; 2395 } 2396 2397 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2398 struct btrfs_root *root, 2399 struct extent_buffer *buf, 2400 int full_backref, int inc) 2401 { 2402 struct btrfs_fs_info *fs_info = root->fs_info; 2403 u64 bytenr; 2404 u64 num_bytes; 2405 u64 parent; 2406 u64 ref_root; 2407 u32 nritems; 2408 struct btrfs_key key; 2409 struct btrfs_file_extent_item *fi; 2410 struct btrfs_ref generic_ref = { 0 }; 2411 bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); 2412 int i; 2413 int action; 2414 int level; 2415 int ret = 0; 2416 2417 if (btrfs_is_testing(fs_info)) 2418 return 0; 2419 2420 ref_root = btrfs_header_owner(buf); 2421 nritems = btrfs_header_nritems(buf); 2422 level = btrfs_header_level(buf); 2423 2424 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) 2425 return 0; 2426 2427 if (full_backref) 2428 parent = buf->start; 2429 else 2430 parent = 0; 2431 if (inc) 2432 action = BTRFS_ADD_DELAYED_REF; 2433 else 2434 action = BTRFS_DROP_DELAYED_REF; 2435 2436 for (i = 0; i < nritems; i++) { 2437 if (level == 0) { 2438 btrfs_item_key_to_cpu(buf, &key, i); 2439 if (key.type != BTRFS_EXTENT_DATA_KEY) 2440 continue; 2441 fi = btrfs_item_ptr(buf, i, 2442 struct btrfs_file_extent_item); 2443 if (btrfs_file_extent_type(buf, fi) == 2444 BTRFS_FILE_EXTENT_INLINE) 2445 continue; 2446 bytenr = btrfs_file_extent_disk_bytenr(buf, fi); 2447 if (bytenr == 0) 2448 continue; 2449 2450 num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); 2451 key.offset -= btrfs_file_extent_offset(buf, fi); 2452 btrfs_init_generic_ref(&generic_ref, action, bytenr, 2453 num_bytes, parent); 2454 btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, 2455 key.offset, root->root_key.objectid, 2456 for_reloc); 2457 if (inc) 2458 ret = btrfs_inc_extent_ref(trans, &generic_ref); 2459 else 2460 ret = btrfs_free_extent(trans, &generic_ref); 2461 if (ret) 2462 goto fail; 2463 } else { 2464 bytenr = btrfs_node_blockptr(buf, i); 2465 num_bytes = fs_info->nodesize; 2466 btrfs_init_generic_ref(&generic_ref, action, bytenr, 2467 num_bytes, parent); 2468 btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, 2469 root->root_key.objectid, for_reloc); 
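			/* Queue the ref we just initialized: add on inc, drop otherwise. */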
2470 if (inc) 2471 ret = btrfs_inc_extent_ref(trans, &generic_ref); 2472 else 2473 ret = btrfs_free_extent(trans, &generic_ref); 2474 if (ret) 2475 goto fail; 2476 } 2477 } 2478 return 0; 2479 fail: 2480 return ret; 2481 } 2482 2483 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2484 struct extent_buffer *buf, int full_backref) 2485 { 2486 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2487 } 2488 2489 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2490 struct extent_buffer *buf, int full_backref) 2491 { 2492 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2493 } 2494 2495 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) 2496 { 2497 struct btrfs_fs_info *fs_info = root->fs_info; 2498 u64 flags; 2499 u64 ret; 2500 2501 if (data) 2502 flags = BTRFS_BLOCK_GROUP_DATA; 2503 else if (root == fs_info->chunk_root) 2504 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2505 else 2506 flags = BTRFS_BLOCK_GROUP_METADATA; 2507 2508 ret = btrfs_get_alloc_profile(fs_info, flags); 2509 return ret; 2510 } 2511 2512 static u64 first_logical_byte(struct btrfs_fs_info *fs_info) 2513 { 2514 struct rb_node *leftmost; 2515 u64 bytenr = 0; 2516 2517 read_lock(&fs_info->block_group_cache_lock); 2518 /* Get the block group with the lowest logical start address. */ 2519 leftmost = rb_first_cached(&fs_info->block_group_cache_tree); 2520 if (leftmost) { 2521 struct btrfs_block_group *bg; 2522 2523 bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); 2524 bytenr = bg->start; 2525 } 2526 read_unlock(&fs_info->block_group_cache_lock); 2527 2528 return bytenr; 2529 } 2530 2531 static int pin_down_extent(struct btrfs_trans_handle *trans, 2532 struct btrfs_block_group *cache, 2533 u64 bytenr, u64 num_bytes, int reserved) 2534 { 2535 struct btrfs_fs_info *fs_info = cache->fs_info; 2536 2537 spin_lock(&cache->space_info->lock); 2538 spin_lock(&cache->lock); 2539 cache->pinned += num_bytes; 2540 btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, 2541 num_bytes); 2542 if (reserved) { 2543 cache->reserved -= num_bytes; 2544 cache->space_info->bytes_reserved -= num_bytes; 2545 } 2546 spin_unlock(&cache->lock); 2547 spin_unlock(&cache->space_info->lock); 2548 2549 set_extent_bit(&trans->transaction->pinned_extents, bytenr, 2550 bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); 2551 return 0; 2552 } 2553 2554 int btrfs_pin_extent(struct btrfs_trans_handle *trans, 2555 u64 bytenr, u64 num_bytes, int reserved) 2556 { 2557 struct btrfs_block_group *cache; 2558 2559 cache = btrfs_lookup_block_group(trans->fs_info, bytenr); 2560 BUG_ON(!cache); /* Logic error */ 2561 2562 pin_down_extent(trans, cache, bytenr, num_bytes, reserved); 2563 2564 btrfs_put_block_group(cache); 2565 return 0; 2566 } 2567 2568 /* 2569 * this function must be called within transaction 2570 */ 2571 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 2572 u64 bytenr, u64 num_bytes) 2573 { 2574 struct btrfs_block_group *cache; 2575 int ret; 2576 2577 cache = btrfs_lookup_block_group(trans->fs_info, bytenr); 2578 if (!cache) 2579 return -EINVAL; 2580 2581 /* 2582 * Fully cache the free space first so that our pin removes the free space 2583 * from the cache. 
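 * (btrfs_cache_block_group() is called below with its wait argument set to
 * true, so caching has finished before we try to remove our range).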
2584 */ 2585 ret = btrfs_cache_block_group(cache, true); 2586 if (ret) 2587 goto out; 2588 2589 pin_down_extent(trans, cache, bytenr, num_bytes, 0); 2590 2591 /* remove us from the free space cache (if we're there at all) */ 2592 ret = btrfs_remove_free_space(cache, bytenr, num_bytes); 2593 out: 2594 btrfs_put_block_group(cache); 2595 return ret; 2596 } 2597 2598 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, 2599 u64 start, u64 num_bytes) 2600 { 2601 int ret; 2602 struct btrfs_block_group *block_group; 2603 2604 block_group = btrfs_lookup_block_group(fs_info, start); 2605 if (!block_group) 2606 return -EINVAL; 2607 2608 ret = btrfs_cache_block_group(block_group, true); 2609 if (ret) 2610 goto out; 2611 2612 ret = btrfs_remove_free_space(block_group, start, num_bytes); 2613 out: 2614 btrfs_put_block_group(block_group); 2615 return ret; 2616 } 2617 2618 int btrfs_exclude_logged_extents(struct extent_buffer *eb) 2619 { 2620 struct btrfs_fs_info *fs_info = eb->fs_info; 2621 struct btrfs_file_extent_item *item; 2622 struct btrfs_key key; 2623 int found_type; 2624 int i; 2625 int ret = 0; 2626 2627 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) 2628 return 0; 2629 2630 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2631 btrfs_item_key_to_cpu(eb, &key, i); 2632 if (key.type != BTRFS_EXTENT_DATA_KEY) 2633 continue; 2634 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 2635 found_type = btrfs_file_extent_type(eb, item); 2636 if (found_type == BTRFS_FILE_EXTENT_INLINE) 2637 continue; 2638 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 2639 continue; 2640 key.objectid = btrfs_file_extent_disk_bytenr(eb, item); 2641 key.offset = btrfs_file_extent_disk_num_bytes(eb, item); 2642 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); 2643 if (ret) 2644 break; 2645 } 2646 2647 return ret; 2648 } 2649 2650 static void 2651 btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) 2652 { 2653 atomic_inc(&bg->reservations); 2654 } 2655 2656 /* 2657 * Returns the free cluster for the given space info and sets empty_cluster to 2658 * what it should be based on the mount options. 
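 * Returns NULL (and leaves *empty_cluster at 0) for mixed space infos,
 * which do not use clusters.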
2659 */ 2660 static struct btrfs_free_cluster * 2661 fetch_cluster_info(struct btrfs_fs_info *fs_info, 2662 struct btrfs_space_info *space_info, u64 *empty_cluster) 2663 { 2664 struct btrfs_free_cluster *ret = NULL; 2665 2666 *empty_cluster = 0; 2667 if (btrfs_mixed_space_info(space_info)) 2668 return ret; 2669 2670 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { 2671 ret = &fs_info->meta_alloc_cluster; 2672 if (btrfs_test_opt(fs_info, SSD)) 2673 *empty_cluster = SZ_2M; 2674 else 2675 *empty_cluster = SZ_64K; 2676 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && 2677 btrfs_test_opt(fs_info, SSD_SPREAD)) { 2678 *empty_cluster = SZ_2M; 2679 ret = &fs_info->data_alloc_cluster; 2680 } 2681 2682 return ret; 2683 } 2684 2685 static int unpin_extent_range(struct btrfs_fs_info *fs_info, 2686 u64 start, u64 end, 2687 const bool return_free_space) 2688 { 2689 struct btrfs_block_group *cache = NULL; 2690 struct btrfs_space_info *space_info; 2691 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 2692 struct btrfs_free_cluster *cluster = NULL; 2693 u64 len; 2694 u64 total_unpinned = 0; 2695 u64 empty_cluster = 0; 2696 bool readonly; 2697 2698 while (start <= end) { 2699 readonly = false; 2700 if (!cache || 2701 start >= cache->start + cache->length) { 2702 if (cache) 2703 btrfs_put_block_group(cache); 2704 total_unpinned = 0; 2705 cache = btrfs_lookup_block_group(fs_info, start); 2706 BUG_ON(!cache); /* Logic error */ 2707 2708 cluster = fetch_cluster_info(fs_info, 2709 cache->space_info, 2710 &empty_cluster); 2711 empty_cluster <<= 1; 2712 } 2713 2714 len = cache->start + cache->length - start; 2715 len = min(len, end + 1 - start); 2716 2717 if (return_free_space) 2718 btrfs_add_free_space(cache, start, len); 2719 2720 start += len; 2721 total_unpinned += len; 2722 space_info = cache->space_info; 2723 2724 /* 2725 * If this space cluster has been marked as fragmented and we've 2726 * unpinned enough in this block group to potentially allow a 2727 * cluster to be created inside of it go ahead and clear the 2728 * fragmented check. 
2729 */ 2730 if (cluster && cluster->fragmented && 2731 total_unpinned > empty_cluster) { 2732 spin_lock(&cluster->lock); 2733 cluster->fragmented = 0; 2734 spin_unlock(&cluster->lock); 2735 } 2736 2737 spin_lock(&space_info->lock); 2738 spin_lock(&cache->lock); 2739 cache->pinned -= len; 2740 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); 2741 space_info->max_extent_size = 0; 2742 if (cache->ro) { 2743 space_info->bytes_readonly += len; 2744 readonly = true; 2745 } else if (btrfs_is_zoned(fs_info)) { 2746 /* Need reset before reusing in a zoned block group */ 2747 space_info->bytes_zone_unusable += len; 2748 readonly = true; 2749 } 2750 spin_unlock(&cache->lock); 2751 if (!readonly && return_free_space && 2752 global_rsv->space_info == space_info) { 2753 spin_lock(&global_rsv->lock); 2754 if (!global_rsv->full) { 2755 u64 to_add = min(len, global_rsv->size - 2756 global_rsv->reserved); 2757 2758 global_rsv->reserved += to_add; 2759 btrfs_space_info_update_bytes_may_use(fs_info, 2760 space_info, to_add); 2761 if (global_rsv->reserved >= global_rsv->size) 2762 global_rsv->full = 1; 2763 len -= to_add; 2764 } 2765 spin_unlock(&global_rsv->lock); 2766 } 2767 /* Add to any tickets we may have */ 2768 if (!readonly && return_free_space && len) 2769 btrfs_try_granting_tickets(fs_info, space_info); 2770 spin_unlock(&space_info->lock); 2771 } 2772 2773 if (cache) 2774 btrfs_put_block_group(cache); 2775 return 0; 2776 } 2777 2778 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 2779 { 2780 struct btrfs_fs_info *fs_info = trans->fs_info; 2781 struct btrfs_block_group *block_group, *tmp; 2782 struct list_head *deleted_bgs; 2783 struct extent_io_tree *unpin; 2784 u64 start; 2785 u64 end; 2786 int ret; 2787 2788 unpin = &trans->transaction->pinned_extents; 2789 2790 while (!TRANS_ABORTED(trans)) { 2791 struct extent_state *cached_state = NULL; 2792 2793 mutex_lock(&fs_info->unused_bg_unpin_mutex); 2794 if (!find_first_extent_bit(unpin, 0, &start, &end, 2795 EXTENT_DIRTY, &cached_state)) { 2796 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2797 break; 2798 } 2799 2800 if (btrfs_test_opt(fs_info, DISCARD_SYNC)) 2801 ret = btrfs_discard_extent(fs_info, start, 2802 end + 1 - start, NULL); 2803 2804 clear_extent_dirty(unpin, start, end, &cached_state); 2805 unpin_extent_range(fs_info, start, end, true); 2806 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 2807 free_extent_state(cached_state); 2808 cond_resched(); 2809 } 2810 2811 if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { 2812 btrfs_discard_calc_delay(&fs_info->discard_ctl); 2813 btrfs_discard_schedule_work(&fs_info->discard_ctl, true); 2814 } 2815 2816 /* 2817 * Transaction is finished. We don't need the lock anymore. We 2818 * do need to clean up the block groups in case of a transaction 2819 * abort. 
2820 */ 2821 deleted_bgs = &trans->transaction->deleted_bgs; 2822 list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { 2823 u64 trimmed = 0; 2824 2825 ret = -EROFS; 2826 if (!TRANS_ABORTED(trans)) 2827 ret = btrfs_discard_extent(fs_info, 2828 block_group->start, 2829 block_group->length, 2830 &trimmed); 2831 2832 list_del_init(&block_group->bg_list); 2833 btrfs_unfreeze_block_group(block_group); 2834 btrfs_put_block_group(block_group); 2835 2836 if (ret) { 2837 const char *errstr = btrfs_decode_error(ret); 2838 btrfs_warn(fs_info, 2839 "discard failed while removing blockgroup: errno=%d %s", 2840 ret, errstr); 2841 } 2842 } 2843 2844 return 0; 2845 } 2846 2847 static int do_free_extent_accounting(struct btrfs_trans_handle *trans, 2848 u64 bytenr, u64 num_bytes, bool is_data) 2849 { 2850 int ret; 2851 2852 if (is_data) { 2853 struct btrfs_root *csum_root; 2854 2855 csum_root = btrfs_csum_root(trans->fs_info, bytenr); 2856 ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); 2857 if (ret) { 2858 btrfs_abort_transaction(trans, ret); 2859 return ret; 2860 } 2861 } 2862 2863 ret = add_to_free_space_tree(trans, bytenr, num_bytes); 2864 if (ret) { 2865 btrfs_abort_transaction(trans, ret); 2866 return ret; 2867 } 2868 2869 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); 2870 if (ret) 2871 btrfs_abort_transaction(trans, ret); 2872 2873 return ret; 2874 } 2875 2876 #define abort_and_dump(trans, path, fmt, args...) \ 2877 ({ \ 2878 btrfs_abort_transaction(trans, -EUCLEAN); \ 2879 btrfs_print_leaf(path->nodes[0]); \ 2880 btrfs_crit(trans->fs_info, fmt, ##args); \ 2881 }) 2882 2883 /* 2884 * Drop one or more refs of @node. 2885 * 2886 * 1. Locate the extent refs. 2887 * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. 2888 * Locate it, then reduce the refs number or remove the ref line completely. 2889 * 2890 * 2. Update the refs count in EXTENT/METADATA_ITEM 2891 * 2892 * Inline backref case: 2893 * 2894 * in extent tree we have: 2895 * 2896 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 2897 * refs 2 gen 6 flags DATA 2898 * extent data backref root FS_TREE objectid 258 offset 0 count 1 2899 * extent data backref root FS_TREE objectid 257 offset 0 count 1 2900 * 2901 * This function gets called with: 2902 * 2903 * node->bytenr = 13631488 2904 * node->num_bytes = 1048576 2905 * root_objectid = FS_TREE 2906 * owner_objectid = 257 2907 * owner_offset = 0 2908 * refs_to_drop = 1 2909 * 2910 * Then we should get some like: 2911 * 2912 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 2913 * refs 1 gen 6 flags DATA 2914 * extent data backref root FS_TREE objectid 258 offset 0 count 1 2915 * 2916 * Keyed backref case: 2917 * 2918 * in extent tree we have: 2919 * 2920 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 2921 * refs 754 gen 6 flags DATA 2922 * [...] 2923 * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 2924 * extent data backref root FS_TREE objectid 866 offset 0 count 1 2925 * 2926 * This function get called with: 2927 * 2928 * node->bytenr = 13631488 2929 * node->num_bytes = 1048576 2930 * root_objectid = FS_TREE 2931 * owner_objectid = 866 2932 * owner_offset = 0 2933 * refs_to_drop = 1 2934 * 2935 * Then we should get some like: 2936 * 2937 * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 2938 * refs 753 gen 6 flags DATA 2939 * 2940 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. 
2941 */ 2942 static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 2943 struct btrfs_delayed_ref_node *node, u64 parent, 2944 u64 root_objectid, u64 owner_objectid, 2945 u64 owner_offset, int refs_to_drop, 2946 struct btrfs_delayed_extent_op *extent_op) 2947 { 2948 struct btrfs_fs_info *info = trans->fs_info; 2949 struct btrfs_key key; 2950 struct btrfs_path *path; 2951 struct btrfs_root *extent_root; 2952 struct extent_buffer *leaf; 2953 struct btrfs_extent_item *ei; 2954 struct btrfs_extent_inline_ref *iref; 2955 int ret; 2956 int is_data; 2957 int extent_slot = 0; 2958 int found_extent = 0; 2959 int num_to_del = 1; 2960 u32 item_size; 2961 u64 refs; 2962 u64 bytenr = node->bytenr; 2963 u64 num_bytes = node->num_bytes; 2964 bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); 2965 2966 extent_root = btrfs_extent_root(info, bytenr); 2967 ASSERT(extent_root); 2968 2969 path = btrfs_alloc_path(); 2970 if (!path) 2971 return -ENOMEM; 2972 2973 is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; 2974 2975 if (!is_data && refs_to_drop != 1) { 2976 btrfs_crit(info, 2977 "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", 2978 node->bytenr, refs_to_drop); 2979 ret = -EINVAL; 2980 btrfs_abort_transaction(trans, ret); 2981 goto out; 2982 } 2983 2984 if (is_data) 2985 skinny_metadata = false; 2986 2987 ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, 2988 parent, root_objectid, owner_objectid, 2989 owner_offset); 2990 if (ret == 0) { 2991 /* 2992 * Either the inline backref or the SHARED_DATA_REF/ 2993 * SHARED_BLOCK_REF is found 2994 * 2995 * Here is a quick path to locate EXTENT/METADATA_ITEM. 2996 * It's possible the EXTENT/METADATA_ITEM is near current slot. 2997 */ 2998 extent_slot = path->slots[0]; 2999 while (extent_slot >= 0) { 3000 btrfs_item_key_to_cpu(path->nodes[0], &key, 3001 extent_slot); 3002 if (key.objectid != bytenr) 3003 break; 3004 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3005 key.offset == num_bytes) { 3006 found_extent = 1; 3007 break; 3008 } 3009 if (key.type == BTRFS_METADATA_ITEM_KEY && 3010 key.offset == owner_objectid) { 3011 found_extent = 1; 3012 break; 3013 } 3014 3015 /* Quick path didn't find the EXTEMT/METADATA_ITEM */ 3016 if (path->slots[0] - extent_slot > 5) 3017 break; 3018 extent_slot--; 3019 } 3020 3021 if (!found_extent) { 3022 if (iref) { 3023 abort_and_dump(trans, path, 3024 "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", 3025 path->slots[0]); 3026 ret = -EUCLEAN; 3027 goto out; 3028 } 3029 /* Must be SHARED_* item, remove the backref first */ 3030 ret = remove_extent_backref(trans, extent_root, path, 3031 NULL, refs_to_drop, is_data); 3032 if (ret) { 3033 btrfs_abort_transaction(trans, ret); 3034 goto out; 3035 } 3036 btrfs_release_path(path); 3037 3038 /* Slow path to locate EXTENT/METADATA_ITEM */ 3039 key.objectid = bytenr; 3040 key.type = BTRFS_EXTENT_ITEM_KEY; 3041 key.offset = num_bytes; 3042 3043 if (!is_data && skinny_metadata) { 3044 key.type = BTRFS_METADATA_ITEM_KEY; 3045 key.offset = owner_objectid; 3046 } 3047 3048 ret = btrfs_search_slot(trans, extent_root, 3049 &key, path, -1, 1); 3050 if (ret > 0 && skinny_metadata && path->slots[0]) { 3051 /* 3052 * Couldn't find our skinny metadata item, 3053 * see if we have ye olde extent item. 
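 * (without the SKINNY_METADATA feature a tree block is keyed as
 * (bytenr, EXTENT_ITEM, nodesize) rather than (bytenr, METADATA_ITEM, level)).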
3054 */ 3055 path->slots[0]--; 3056 btrfs_item_key_to_cpu(path->nodes[0], &key, 3057 path->slots[0]); 3058 if (key.objectid == bytenr && 3059 key.type == BTRFS_EXTENT_ITEM_KEY && 3060 key.offset == num_bytes) 3061 ret = 0; 3062 } 3063 3064 if (ret > 0 && skinny_metadata) { 3065 skinny_metadata = false; 3066 key.objectid = bytenr; 3067 key.type = BTRFS_EXTENT_ITEM_KEY; 3068 key.offset = num_bytes; 3069 btrfs_release_path(path); 3070 ret = btrfs_search_slot(trans, extent_root, 3071 &key, path, -1, 1); 3072 } 3073 3074 if (ret) { 3075 if (ret > 0) 3076 btrfs_print_leaf(path->nodes[0]); 3077 btrfs_err(info, 3078 "umm, got %d back from search, was looking for %llu, slot %d", 3079 ret, bytenr, path->slots[0]); 3080 } 3081 if (ret < 0) { 3082 btrfs_abort_transaction(trans, ret); 3083 goto out; 3084 } 3085 extent_slot = path->slots[0]; 3086 } 3087 } else if (WARN_ON(ret == -ENOENT)) { 3088 abort_and_dump(trans, path, 3089 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", 3090 bytenr, parent, root_objectid, owner_objectid, 3091 owner_offset, path->slots[0]); 3092 goto out; 3093 } else { 3094 btrfs_abort_transaction(trans, ret); 3095 goto out; 3096 } 3097 3098 leaf = path->nodes[0]; 3099 item_size = btrfs_item_size(leaf, extent_slot); 3100 if (unlikely(item_size < sizeof(*ei))) { 3101 ret = -EUCLEAN; 3102 btrfs_err(trans->fs_info, 3103 "unexpected extent item size, has %u expect >= %zu", 3104 item_size, sizeof(*ei)); 3105 btrfs_abort_transaction(trans, ret); 3106 goto out; 3107 } 3108 ei = btrfs_item_ptr(leaf, extent_slot, 3109 struct btrfs_extent_item); 3110 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && 3111 key.type == BTRFS_EXTENT_ITEM_KEY) { 3112 struct btrfs_tree_block_info *bi; 3113 3114 if (item_size < sizeof(*ei) + sizeof(*bi)) { 3115 abort_and_dump(trans, path, 3116 "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", 3117 key.objectid, key.type, key.offset, 3118 path->slots[0], owner_objectid, item_size, 3119 sizeof(*ei) + sizeof(*bi)); 3120 ret = -EUCLEAN; 3121 goto out; 3122 } 3123 bi = (struct btrfs_tree_block_info *)(ei + 1); 3124 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); 3125 } 3126 3127 refs = btrfs_extent_refs(leaf, ei); 3128 if (refs < refs_to_drop) { 3129 abort_and_dump(trans, path, 3130 "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", 3131 refs_to_drop, refs, bytenr, path->slots[0]); 3132 ret = -EUCLEAN; 3133 goto out; 3134 } 3135 refs -= refs_to_drop; 3136 3137 if (refs > 0) { 3138 if (extent_op) 3139 __run_delayed_extent_op(extent_op, leaf, ei); 3140 /* 3141 * In the case of inline back ref, reference count will 3142 * be updated by remove_extent_backref 3143 */ 3144 if (iref) { 3145 if (!found_extent) { 3146 abort_and_dump(trans, path, 3147 "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", 3148 path->slots[0]); 3149 ret = -EUCLEAN; 3150 goto out; 3151 } 3152 } else { 3153 btrfs_set_extent_refs(leaf, ei, refs); 3154 btrfs_mark_buffer_dirty(leaf); 3155 } 3156 if (found_extent) { 3157 ret = remove_extent_backref(trans, extent_root, path, 3158 iref, refs_to_drop, is_data); 3159 if (ret) { 3160 btrfs_abort_transaction(trans, ret); 3161 goto out; 3162 } 3163 } 3164 } else { 3165 /* In this branch refs == 1 */ 3166 if (found_extent) { 3167 if (is_data && refs_to_drop != 3168 extent_data_ref_count(path, iref)) { 3169 abort_and_dump(trans, path, 3170 "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", 3171 
extent_data_ref_count(path, iref), 3172 refs_to_drop, path->slots[0]); 3173 ret = -EUCLEAN; 3174 goto out; 3175 } 3176 if (iref) { 3177 if (path->slots[0] != extent_slot) { 3178 abort_and_dump(trans, path, 3179 "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", 3180 key.objectid, key.type, 3181 key.offset, path->slots[0]); 3182 ret = -EUCLEAN; 3183 goto out; 3184 } 3185 } else { 3186 /* 3187 * No inline ref, we must be at SHARED_* item, 3188 * And it's single ref, it must be: 3189 * | extent_slot ||extent_slot + 1| 3190 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] 3191 */ 3192 if (path->slots[0] != extent_slot + 1) { 3193 abort_and_dump(trans, path, 3194 "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", 3195 path->slots[0]); 3196 ret = -EUCLEAN; 3197 goto out; 3198 } 3199 path->slots[0] = extent_slot; 3200 num_to_del = 2; 3201 } 3202 } 3203 3204 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 3205 num_to_del); 3206 if (ret) { 3207 btrfs_abort_transaction(trans, ret); 3208 goto out; 3209 } 3210 btrfs_release_path(path); 3211 3212 ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); 3213 } 3214 btrfs_release_path(path); 3215 3216 out: 3217 btrfs_free_path(path); 3218 return ret; 3219 } 3220 3221 /* 3222 * when we free an block, it is possible (and likely) that we free the last 3223 * delayed ref for that extent as well. This searches the delayed ref tree for 3224 * a given extent, and if there are no other delayed refs to be processed, it 3225 * removes it from the tree. 3226 */ 3227 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, 3228 u64 bytenr) 3229 { 3230 struct btrfs_delayed_ref_head *head; 3231 struct btrfs_delayed_ref_root *delayed_refs; 3232 int ret = 0; 3233 3234 delayed_refs = &trans->transaction->delayed_refs; 3235 spin_lock(&delayed_refs->lock); 3236 head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); 3237 if (!head) 3238 goto out_delayed_unlock; 3239 3240 spin_lock(&head->lock); 3241 if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 3242 goto out; 3243 3244 if (cleanup_extent_op(head) != NULL) 3245 goto out; 3246 3247 /* 3248 * waiting for the lock here would deadlock. 
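 * We already hold delayed_refs->lock and head->lock at this point.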
If someone else has it 3249 * locked they are already in the process of dropping it anyway 3250 */ 3251 if (!mutex_trylock(&head->mutex)) 3252 goto out; 3253 3254 btrfs_delete_ref_head(delayed_refs, head); 3255 head->processing = false; 3256 3257 spin_unlock(&head->lock); 3258 spin_unlock(&delayed_refs->lock); 3259 3260 BUG_ON(head->extent_op); 3261 if (head->must_insert_reserved) 3262 ret = 1; 3263 3264 btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); 3265 mutex_unlock(&head->mutex); 3266 btrfs_put_delayed_ref_head(head); 3267 return ret; 3268 out: 3269 spin_unlock(&head->lock); 3270 3271 out_delayed_unlock: 3272 spin_unlock(&delayed_refs->lock); 3273 return 0; 3274 } 3275 3276 void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3277 u64 root_id, 3278 struct extent_buffer *buf, 3279 u64 parent, int last_ref) 3280 { 3281 struct btrfs_fs_info *fs_info = trans->fs_info; 3282 struct btrfs_ref generic_ref = { 0 }; 3283 int ret; 3284 3285 btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, 3286 buf->start, buf->len, parent); 3287 btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 3288 root_id, 0, false); 3289 3290 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3291 btrfs_ref_tree_mod(fs_info, &generic_ref); 3292 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); 3293 BUG_ON(ret); /* -ENOMEM */ 3294 } 3295 3296 if (last_ref && btrfs_header_generation(buf) == trans->transid) { 3297 struct btrfs_block_group *cache; 3298 bool must_pin = false; 3299 3300 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3301 ret = check_ref_cleanup(trans, buf->start); 3302 if (!ret) { 3303 btrfs_redirty_list_add(trans->transaction, buf); 3304 goto out; 3305 } 3306 } 3307 3308 cache = btrfs_lookup_block_group(fs_info, buf->start); 3309 3310 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3311 pin_down_extent(trans, cache, buf->start, buf->len, 1); 3312 btrfs_put_block_group(cache); 3313 goto out; 3314 } 3315 3316 /* 3317 * If there are tree mod log users we may have recorded mod log 3318 * operations for this node. If we re-allocate this node we 3319 * could replay operations on this node that happened when it 3320 * existed in a completely different root. For example if it 3321 * was part of root A, then was reallocated to root B, and we 3322 * are doing a btrfs_old_search_slot(root b), we could replay 3323 * operations that happened when the block was part of root A, 3324 * giving us an inconsistent view of the btree. 3325 * 3326 * We are safe from races here because at this point no other 3327 * node or root points to this extent buffer, so if after this 3328 * check a new tree mod log user joins we will not have an 3329 * existing log of operations on this node that we have to 3330 * contend with. 3331 */ 3332 if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) 3333 must_pin = true; 3334 3335 if (must_pin || btrfs_is_zoned(fs_info)) { 3336 btrfs_redirty_list_add(trans->transaction, buf); 3337 pin_down_extent(trans, cache, buf->start, buf->len, 1); 3338 btrfs_put_block_group(cache); 3339 goto out; 3340 } 3341 3342 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 3343 3344 btrfs_add_free_space(cache, buf->start, buf->len); 3345 btrfs_free_reserved_bytes(cache, buf->len, 0); 3346 btrfs_put_block_group(cache); 3347 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 3348 } 3349 out: 3350 if (last_ref) { 3351 /* 3352 * Deleting the buffer, clear the corrupt flag since it doesn't 3353 * matter anymore. 
3354 */ 3355 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 3356 } 3357 } 3358 3359 /* Can return -ENOMEM */ 3360 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) 3361 { 3362 struct btrfs_fs_info *fs_info = trans->fs_info; 3363 int ret; 3364 3365 if (btrfs_is_testing(fs_info)) 3366 return 0; 3367 3368 /* 3369 * tree log blocks never actually go into the extent allocation 3370 * tree, just update pinning info and exit early. 3371 */ 3372 if ((ref->type == BTRFS_REF_METADATA && 3373 ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || 3374 (ref->type == BTRFS_REF_DATA && 3375 ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { 3376 /* unlocks the pinned mutex */ 3377 btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); 3378 ret = 0; 3379 } else if (ref->type == BTRFS_REF_METADATA) { 3380 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); 3381 } else { 3382 ret = btrfs_add_delayed_data_ref(trans, ref, 0); 3383 } 3384 3385 if (!((ref->type == BTRFS_REF_METADATA && 3386 ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || 3387 (ref->type == BTRFS_REF_DATA && 3388 ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) 3389 btrfs_ref_tree_mod(fs_info, ref); 3390 3391 return ret; 3392 } 3393 3394 enum btrfs_loop_type { 3395 /* 3396 * Start caching block groups but do not wait for progress or for them 3397 * to be done. 3398 */ 3399 LOOP_CACHING_NOWAIT, 3400 3401 /* 3402 * Wait for the block group free_space >= the space we're waiting for if 3403 * the block group isn't cached. 3404 */ 3405 LOOP_CACHING_WAIT, 3406 3407 /* 3408 * Allow allocations to happen from block groups that do not yet have a 3409 * size classification. 3410 */ 3411 LOOP_UNSET_SIZE_CLASS, 3412 3413 /* 3414 * Allocate a chunk and then retry the allocation. 3415 */ 3416 LOOP_ALLOC_CHUNK, 3417 3418 /* 3419 * Ignore the size class restrictions for this allocation. 3420 */ 3421 LOOP_WRONG_SIZE_CLASS, 3422 3423 /* 3424 * Ignore the empty size, only try to allocate the number of bytes 3425 * needed for this allocation. 3426 */ 3427 LOOP_NO_EMPTY_SIZE, 3428 }; 3429 3430 static inline void 3431 btrfs_lock_block_group(struct btrfs_block_group *cache, 3432 int delalloc) 3433 { 3434 if (delalloc) 3435 down_read(&cache->data_rwsem); 3436 } 3437 3438 static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, 3439 int delalloc) 3440 { 3441 btrfs_get_block_group(cache); 3442 if (delalloc) 3443 down_read(&cache->data_rwsem); 3444 } 3445 3446 static struct btrfs_block_group *btrfs_lock_cluster( 3447 struct btrfs_block_group *block_group, 3448 struct btrfs_free_cluster *cluster, 3449 int delalloc) 3450 __acquires(&cluster->refill_lock) 3451 { 3452 struct btrfs_block_group *used_bg = NULL; 3453 3454 spin_lock(&cluster->refill_lock); 3455 while (1) { 3456 used_bg = cluster->block_group; 3457 if (!used_bg) 3458 return NULL; 3459 3460 if (used_bg == block_group) 3461 return used_bg; 3462 3463 btrfs_get_block_group(used_bg); 3464 3465 if (!delalloc) 3466 return used_bg; 3467 3468 if (down_read_trylock(&used_bg->data_rwsem)) 3469 return used_bg; 3470 3471 spin_unlock(&cluster->refill_lock); 3472 3473 /* We should only have one-level nested. 
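 * down_read_nested() with SINGLE_DEPTH_NESTING annotates this second
 * data_rwsem acquisition for lockdep.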
*/ 3474 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); 3475 3476 spin_lock(&cluster->refill_lock); 3477 if (used_bg == cluster->block_group) 3478 return used_bg; 3479 3480 up_read(&used_bg->data_rwsem); 3481 btrfs_put_block_group(used_bg); 3482 } 3483 } 3484 3485 static inline void 3486 btrfs_release_block_group(struct btrfs_block_group *cache, 3487 int delalloc) 3488 { 3489 if (delalloc) 3490 up_read(&cache->data_rwsem); 3491 btrfs_put_block_group(cache); 3492 } 3493 3494 /* 3495 * Helper function for find_free_extent(). 3496 * 3497 * Return -ENOENT to inform caller that we need to fall back to unclustered mode. 3498 * Return >0 to inform caller that we found nothing 3499 * Return 0 means we have found a location and set ffe_ctl->found_offset. 3500 */ 3501 static int find_free_extent_clustered(struct btrfs_block_group *bg, 3502 struct find_free_extent_ctl *ffe_ctl, 3503 struct btrfs_block_group **cluster_bg_ret) 3504 { 3505 struct btrfs_block_group *cluster_bg; 3506 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3507 u64 aligned_cluster; 3508 u64 offset; 3509 int ret; 3510 3511 cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); 3512 if (!cluster_bg) 3513 goto refill_cluster; 3514 if (cluster_bg != bg && (cluster_bg->ro || 3515 !block_group_bits(cluster_bg, ffe_ctl->flags))) 3516 goto release_cluster; 3517 3518 offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, 3519 ffe_ctl->num_bytes, cluster_bg->start, 3520 &ffe_ctl->max_extent_size); 3521 if (offset) { 3522 /* We have a block, we're done */ 3523 spin_unlock(&last_ptr->refill_lock); 3524 trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); 3525 *cluster_bg_ret = cluster_bg; 3526 ffe_ctl->found_offset = offset; 3527 return 0; 3528 } 3529 WARN_ON(last_ptr->block_group != cluster_bg); 3530 3531 release_cluster: 3532 /* 3533 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so 3534 * let's just skip it and let the allocator find whatever block it can 3535 * find. If we reach this point, we will have tried the cluster 3536 * allocator plenty of times and not have found anything, so we are 3537 * likely way too fragmented for the clustering stuff to find anything. 3538 * 3539 * However, if the cluster is taken from the current block group, 3540 * release the cluster first, so that we stand a better chance of 3541 * succeeding in the unclustered allocation.
3542 */ 3543 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { 3544 spin_unlock(&last_ptr->refill_lock); 3545 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 3546 return -ENOENT; 3547 } 3548 3549 /* This cluster didn't work out, free it and start over */ 3550 btrfs_return_cluster_to_free_space(NULL, last_ptr); 3551 3552 if (cluster_bg != bg) 3553 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); 3554 3555 refill_cluster: 3556 if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { 3557 spin_unlock(&last_ptr->refill_lock); 3558 return -ENOENT; 3559 } 3560 3561 aligned_cluster = max_t(u64, 3562 ffe_ctl->empty_cluster + ffe_ctl->empty_size, 3563 bg->full_stripe_len); 3564 ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, 3565 ffe_ctl->num_bytes, aligned_cluster); 3566 if (ret == 0) { 3567 /* Now pull our allocation out of this cluster */ 3568 offset = btrfs_alloc_from_cluster(bg, last_ptr, 3569 ffe_ctl->num_bytes, ffe_ctl->search_start, 3570 &ffe_ctl->max_extent_size); 3571 if (offset) { 3572 /* We found one, proceed */ 3573 spin_unlock(&last_ptr->refill_lock); 3574 ffe_ctl->found_offset = offset; 3575 trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); 3576 return 0; 3577 } 3578 } 3579 /* 3580 * At this point we either didn't find a cluster or we weren't able to 3581 * allocate a block from our cluster. Free the cluster we've been 3582 * trying to use, and go to the next block group. 3583 */ 3584 btrfs_return_cluster_to_free_space(NULL, last_ptr); 3585 spin_unlock(&last_ptr->refill_lock); 3586 return 1; 3587 } 3588 3589 /* 3590 * Return >0 to inform caller that we found nothing 3591 * Return 0 when we found a free extent and set ffe_ctl->found_offset 3592 */ 3593 static int find_free_extent_unclustered(struct btrfs_block_group *bg, 3594 struct find_free_extent_ctl *ffe_ctl) 3595 { 3596 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3597 u64 offset; 3598 3599 /* 3600 * We are doing an unclustered allocation, set the fragmented flag so 3601 * we don't bother trying to set up a cluster again until we get more 3602 * space.
3603 */ 3604 if (unlikely(last_ptr)) { 3605 spin_lock(&last_ptr->lock); 3606 last_ptr->fragmented = 1; 3607 spin_unlock(&last_ptr->lock); 3608 } 3609 if (ffe_ctl->cached) { 3610 struct btrfs_free_space_ctl *free_space_ctl; 3611 3612 free_space_ctl = bg->free_space_ctl; 3613 spin_lock(&free_space_ctl->tree_lock); 3614 if (free_space_ctl->free_space < 3615 ffe_ctl->num_bytes + ffe_ctl->empty_cluster + 3616 ffe_ctl->empty_size) { 3617 ffe_ctl->total_free_space = max_t(u64, 3618 ffe_ctl->total_free_space, 3619 free_space_ctl->free_space); 3620 spin_unlock(&free_space_ctl->tree_lock); 3621 return 1; 3622 } 3623 spin_unlock(&free_space_ctl->tree_lock); 3624 } 3625 3626 offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, 3627 ffe_ctl->num_bytes, ffe_ctl->empty_size, 3628 &ffe_ctl->max_extent_size); 3629 if (!offset) 3630 return 1; 3631 ffe_ctl->found_offset = offset; 3632 return 0; 3633 } 3634 3635 static int do_allocation_clustered(struct btrfs_block_group *block_group, 3636 struct find_free_extent_ctl *ffe_ctl, 3637 struct btrfs_block_group **bg_ret) 3638 { 3639 int ret; 3640 3641 /* We want to try and use the cluster allocator, so lets look there */ 3642 if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { 3643 ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); 3644 if (ret >= 0) 3645 return ret; 3646 /* ret == -ENOENT case falls through */ 3647 } 3648 3649 return find_free_extent_unclustered(block_group, ffe_ctl); 3650 } 3651 3652 /* 3653 * Tree-log block group locking 3654 * ============================ 3655 * 3656 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which 3657 * indicates the starting address of a block group, which is reserved only 3658 * for tree-log metadata. 3659 * 3660 * Lock nesting 3661 * ============ 3662 * 3663 * space_info::lock 3664 * block_group::lock 3665 * fs_info::treelog_bg_lock 3666 */ 3667 3668 /* 3669 * Simple allocator for sequential-only block group. It only allows sequential 3670 * allocation. No need to play with trees. This function also reserves the 3671 * bytes as in btrfs_add_reserved_bytes. 3672 */ 3673 static int do_allocation_zoned(struct btrfs_block_group *block_group, 3674 struct find_free_extent_ctl *ffe_ctl, 3675 struct btrfs_block_group **bg_ret) 3676 { 3677 struct btrfs_fs_info *fs_info = block_group->fs_info; 3678 struct btrfs_space_info *space_info = block_group->space_info; 3679 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3680 u64 start = block_group->start; 3681 u64 num_bytes = ffe_ctl->num_bytes; 3682 u64 avail; 3683 u64 bytenr = block_group->start; 3684 u64 log_bytenr; 3685 u64 data_reloc_bytenr; 3686 int ret = 0; 3687 bool skip = false; 3688 3689 ASSERT(btrfs_is_zoned(block_group->fs_info)); 3690 3691 /* 3692 * Do not allow non-tree-log blocks in the dedicated tree-log block 3693 * group, and vice versa. 3694 */ 3695 spin_lock(&fs_info->treelog_bg_lock); 3696 log_bytenr = fs_info->treelog_bg; 3697 if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || 3698 (!ffe_ctl->for_treelog && bytenr == log_bytenr))) 3699 skip = true; 3700 spin_unlock(&fs_info->treelog_bg_lock); 3701 if (skip) 3702 return 1; 3703 3704 /* 3705 * Do not allow non-relocation blocks in the dedicated relocation block 3706 * group, and vice versa. 
3707 */ 3708 spin_lock(&fs_info->relocation_bg_lock); 3709 data_reloc_bytenr = fs_info->data_reloc_bg; 3710 if (data_reloc_bytenr && 3711 ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || 3712 (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) 3713 skip = true; 3714 spin_unlock(&fs_info->relocation_bg_lock); 3715 if (skip) 3716 return 1; 3717 3718 /* Check RO and no space case before trying to activate it */ 3719 spin_lock(&block_group->lock); 3720 if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { 3721 ret = 1; 3722 /* 3723 * May need to clear fs_info->{treelog,data_reloc}_bg. 3724 * Return the error after taking the locks. 3725 */ 3726 } 3727 spin_unlock(&block_group->lock); 3728 3729 /* Metadata block group is activated at write time. */ 3730 if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && 3731 !btrfs_zone_activate(block_group)) { 3732 ret = 1; 3733 /* 3734 * May need to clear fs_info->{treelog,data_reloc}_bg. 3735 * Return the error after taking the locks. 3736 */ 3737 } 3738 3739 spin_lock(&space_info->lock); 3740 spin_lock(&block_group->lock); 3741 spin_lock(&fs_info->treelog_bg_lock); 3742 spin_lock(&fs_info->relocation_bg_lock); 3743 3744 if (ret) 3745 goto out; 3746 3747 ASSERT(!ffe_ctl->for_treelog || 3748 block_group->start == fs_info->treelog_bg || 3749 fs_info->treelog_bg == 0); 3750 ASSERT(!ffe_ctl->for_data_reloc || 3751 block_group->start == fs_info->data_reloc_bg || 3752 fs_info->data_reloc_bg == 0); 3753 3754 if (block_group->ro || 3755 (!ffe_ctl->for_data_reloc && 3756 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { 3757 ret = 1; 3758 goto out; 3759 } 3760 3761 /* 3762 * Do not allow currently using block group to be tree-log dedicated 3763 * block group. 3764 */ 3765 if (ffe_ctl->for_treelog && !fs_info->treelog_bg && 3766 (block_group->used || block_group->reserved)) { 3767 ret = 1; 3768 goto out; 3769 } 3770 3771 /* 3772 * Do not allow currently used block group to be the data relocation 3773 * dedicated block group. 3774 */ 3775 if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && 3776 (block_group->used || block_group->reserved)) { 3777 ret = 1; 3778 goto out; 3779 } 3780 3781 WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); 3782 avail = block_group->zone_capacity - block_group->alloc_offset; 3783 if (avail < num_bytes) { 3784 if (ffe_ctl->max_extent_size < avail) { 3785 /* 3786 * With sequential allocator, free space is always 3787 * contiguous 3788 */ 3789 ffe_ctl->max_extent_size = avail; 3790 ffe_ctl->total_free_space = avail; 3791 } 3792 ret = 1; 3793 goto out; 3794 } 3795 3796 if (ffe_ctl->for_treelog && !fs_info->treelog_bg) 3797 fs_info->treelog_bg = block_group->start; 3798 3799 if (ffe_ctl->for_data_reloc) { 3800 if (!fs_info->data_reloc_bg) 3801 fs_info->data_reloc_bg = block_group->start; 3802 /* 3803 * Do not allow allocations from this block group, unless it is 3804 * for data relocation. Compared to increasing the ->ro, setting 3805 * the ->zoned_data_reloc_ongoing flag still allows nocow 3806 * writers to come in. See btrfs_inc_nocow_writers(). 3807 * 3808 * We need to disable an allocation to avoid an allocation of 3809 * regular (non-relocation data) extent. With mix of relocation 3810 * extents and regular extents, we can dispatch WRITE commands 3811 * (for relocation extents) and ZONE APPEND commands (for 3812 * regular extents) at the same time to the same zone, which 3813 * easily break the write pointer. 
3814 * 3815 * Also, this flag prevents this block group from being zone finished. 3816 */ 3817 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); 3818 } 3819 3820 ffe_ctl->found_offset = start + block_group->alloc_offset; 3821 block_group->alloc_offset += num_bytes; 3822 spin_lock(&ctl->tree_lock); 3823 ctl->free_space -= num_bytes; 3824 spin_unlock(&ctl->tree_lock); 3825 3826 /* 3827 * We do not check if found_offset is aligned to stripesize. The 3828 * address is rewritten anyway when using zone append writes. 3829 */ 3830 3831 ffe_ctl->search_start = ffe_ctl->found_offset; 3832 3833 out: 3834 if (ret && ffe_ctl->for_treelog) 3835 fs_info->treelog_bg = 0; 3836 if (ret && ffe_ctl->for_data_reloc) 3837 fs_info->data_reloc_bg = 0; 3838 spin_unlock(&fs_info->relocation_bg_lock); 3839 spin_unlock(&fs_info->treelog_bg_lock); 3840 spin_unlock(&block_group->lock); 3841 spin_unlock(&space_info->lock); 3842 return ret; 3843 } 3844 3845 static int do_allocation(struct btrfs_block_group *block_group, 3846 struct find_free_extent_ctl *ffe_ctl, 3847 struct btrfs_block_group **bg_ret) 3848 { 3849 switch (ffe_ctl->policy) { 3850 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3851 return do_allocation_clustered(block_group, ffe_ctl, bg_ret); 3852 case BTRFS_EXTENT_ALLOC_ZONED: 3853 return do_allocation_zoned(block_group, ffe_ctl, bg_ret); 3854 default: 3855 BUG(); 3856 } 3857 } 3858 3859 static void release_block_group(struct btrfs_block_group *block_group, 3860 struct find_free_extent_ctl *ffe_ctl, 3861 int delalloc) 3862 { 3863 switch (ffe_ctl->policy) { 3864 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3865 ffe_ctl->retry_uncached = false; 3866 break; 3867 case BTRFS_EXTENT_ALLOC_ZONED: 3868 /* Nothing to do */ 3869 break; 3870 default: 3871 BUG(); 3872 } 3873 3874 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != 3875 ffe_ctl->index); 3876 btrfs_release_block_group(block_group, delalloc); 3877 } 3878 3879 static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, 3880 struct btrfs_key *ins) 3881 { 3882 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 3883 3884 if (!ffe_ctl->use_cluster && last_ptr) { 3885 spin_lock(&last_ptr->lock); 3886 last_ptr->window_start = ins->objectid; 3887 spin_unlock(&last_ptr->lock); 3888 } 3889 } 3890 3891 static void found_extent(struct find_free_extent_ctl *ffe_ctl, 3892 struct btrfs_key *ins) 3893 { 3894 switch (ffe_ctl->policy) { 3895 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3896 found_extent_clustered(ffe_ctl, ins); 3897 break; 3898 case BTRFS_EXTENT_ALLOC_ZONED: 3899 /* Nothing to do */ 3900 break; 3901 default: 3902 BUG(); 3903 } 3904 } 3905 3906 static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, 3907 struct find_free_extent_ctl *ffe_ctl) 3908 { 3909 /* Block group's activeness is not a requirement for METADATA block groups. */ 3910 if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) 3911 return 0; 3912 3913 /* If we can activate a new zone, just allocate a chunk and use it */ 3914 if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) 3915 return 0; 3916 3917 /* 3918 * We already reached the max active zones. Try to finish one block 3919 * group to make room for a new block group. This is only possible 3920 * for a data block group because btrfs_zone_finish() may need to wait 3921 * for a running transaction which can cause a deadlock for metadata 3922 * allocation.
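 *
 * Note on the call below: a return of 1 from btrfs_zone_finish_one_bg() is
 * taken to mean a block group was actually finished, so a zone slot should
 * now be free and the chunk allocation can go ahead; 0 means nothing could
 * be finished and we fall through to the size based checks further down.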
3923 */ 3924 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { 3925 int ret = btrfs_zone_finish_one_bg(fs_info); 3926 3927 if (ret == 1) 3928 return 0; 3929 else if (ret < 0) 3930 return ret; 3931 } 3932 3933 /* 3934 * If we have enough free space left in an already active block group 3935 * and we can't activate any other zone now, do not allow allocating a 3936 * new chunk and let find_free_extent() retry with a smaller size. 3937 */ 3938 if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) 3939 return -ENOSPC; 3940 3941 /* 3942 * Not even min_alloc_size is left in any block group. Since we cannot 3943 * activate a new block group, allocating one may not help. Let's tell the 3944 * caller to try again and hope it makes progress by writing out some 3945 * parts of the region. That is only possible for data block groups, 3946 * where a part of the region can be written. 3947 */ 3948 if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) 3949 return -EAGAIN; 3950 3951 /* 3952 * We cannot activate a new block group and there is not enough space left 3953 * in any block group. So, allocating a new block group may not help. But, 3954 * there is nothing to do anyway, so let's go with it. 3955 */ 3956 return 0; 3957 } 3958 3959 static int can_allocate_chunk(struct btrfs_fs_info *fs_info, 3960 struct find_free_extent_ctl *ffe_ctl) 3961 { 3962 switch (ffe_ctl->policy) { 3963 case BTRFS_EXTENT_ALLOC_CLUSTERED: 3964 return 0; 3965 case BTRFS_EXTENT_ALLOC_ZONED: 3966 return can_allocate_chunk_zoned(fs_info, ffe_ctl); 3967 default: 3968 BUG(); 3969 } 3970 } 3971 3972 /* 3973 * Return >0 means the caller needs to re-search for a free extent. 3974 * Return 0 means we have the needed free extent. 3975 * Return <0 means we failed to locate any free extent. 3976 */ 3977 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, 3978 struct btrfs_key *ins, 3979 struct find_free_extent_ctl *ffe_ctl, 3980 bool full_search) 3981 { 3982 struct btrfs_root *root = fs_info->chunk_root; 3983 int ret; 3984 3985 if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && 3986 ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) 3987 ffe_ctl->orig_have_caching_bg = true; 3988 3989 if (ins->objectid) { 3990 found_extent(ffe_ctl, ins); 3991 return 0; 3992 } 3993 3994 if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) 3995 return 1; 3996 3997 ffe_ctl->index++; 3998 if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) 3999 return 1; 4000 4001 /* See the comments for btrfs_loop_type for an explanation of the phases. */ 4002 if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { 4003 ffe_ctl->index = 0; 4004 /* 4005 * We want to skip the LOOP_CACHING_WAIT step if we don't have 4006 * any uncached bgs and we've already done a full search 4007 * through. 4008 */ 4009 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && 4010 (!ffe_ctl->orig_have_caching_bg && full_search)) 4011 ffe_ctl->loop++; 4012 ffe_ctl->loop++; 4013 4014 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 4015 struct btrfs_trans_handle *trans; 4016 int exist = 0; 4017 4018 /* Check if the allocation policy allows creating a new chunk */ 4019 ret = can_allocate_chunk(fs_info, ffe_ctl); 4020 if (ret) 4021 return ret; 4022 4023 trans = current->journal_info; 4024 if (trans) 4025 exist = 1; 4026 else 4027 trans = btrfs_join_transaction(root); 4028 4029 if (IS_ERR(trans)) { 4030 ret = PTR_ERR(trans); 4031 return ret; 4032 } 4033 4034 ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, 4035 CHUNK_ALLOC_FORCE_FOR_EXTENT); 4036 4037 /* Do not bail out on ENOSPC since we can do more.
*/ 4038 if (ret == -ENOSPC) { 4039 ret = 0; 4040 ffe_ctl->loop++; 4041 } 4042 else if (ret < 0) 4043 btrfs_abort_transaction(trans, ret); 4044 else 4045 ret = 0; 4046 if (!exist) 4047 btrfs_end_transaction(trans); 4048 if (ret) 4049 return ret; 4050 } 4051 4052 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 4053 if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) 4054 return -ENOSPC; 4055 4056 /* 4057 * Don't loop again if we already have no empty_size and 4058 * no empty_cluster. 4059 */ 4060 if (ffe_ctl->empty_size == 0 && 4061 ffe_ctl->empty_cluster == 0) 4062 return -ENOSPC; 4063 ffe_ctl->empty_size = 0; 4064 ffe_ctl->empty_cluster = 0; 4065 } 4066 return 1; 4067 } 4068 return -ENOSPC; 4069 } 4070 4071 static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, 4072 struct btrfs_block_group *bg) 4073 { 4074 if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) 4075 return true; 4076 if (!btrfs_block_group_should_use_size_class(bg)) 4077 return true; 4078 if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) 4079 return true; 4080 if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && 4081 bg->size_class == BTRFS_BG_SZ_NONE) 4082 return true; 4083 return ffe_ctl->size_class == bg->size_class; 4084 } 4085 4086 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, 4087 struct find_free_extent_ctl *ffe_ctl, 4088 struct btrfs_space_info *space_info, 4089 struct btrfs_key *ins) 4090 { 4091 /* 4092 * If our free space is heavily fragmented we may not be able to make 4093 * big contiguous allocations, so instead of doing the expensive search 4094 * for free space, simply return ENOSPC with our max_extent_size so we 4095 * can go ahead and search for a more manageable chunk. 4096 * 4097 * If our max_extent_size is large enough for our allocation simply 4098 * disable clustering since we will likely not be able to find enough 4099 * space to create a cluster and induce latency trying. 4100 */ 4101 if (space_info->max_extent_size) { 4102 spin_lock(&space_info->lock); 4103 if (space_info->max_extent_size && 4104 ffe_ctl->num_bytes > space_info->max_extent_size) { 4105 ins->offset = space_info->max_extent_size; 4106 spin_unlock(&space_info->lock); 4107 return -ENOSPC; 4108 } else if (space_info->max_extent_size) { 4109 ffe_ctl->use_cluster = false; 4110 } 4111 spin_unlock(&space_info->lock); 4112 } 4113 4114 ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, 4115 &ffe_ctl->empty_cluster); 4116 if (ffe_ctl->last_ptr) { 4117 struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; 4118 4119 spin_lock(&last_ptr->lock); 4120 if (last_ptr->block_group) 4121 ffe_ctl->hint_byte = last_ptr->window_start; 4122 if (last_ptr->fragmented) { 4123 /* 4124 * We still set window_start so we can keep track of the 4125 * last place we found an allocation to try and save 4126 * some time. 
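 *
 * Clearing use_cluster below also means do_allocation_clustered() will
 * skip find_free_extent_clustered() entirely and fall back to
 * find_free_extent_unclustered() for every block group in this search.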
4127 */ 4128 ffe_ctl->hint_byte = last_ptr->window_start; 4129 ffe_ctl->use_cluster = false; 4130 } 4131 spin_unlock(&last_ptr->lock); 4132 } 4133 4134 return 0; 4135 } 4136 4137 static int prepare_allocation(struct btrfs_fs_info *fs_info, 4138 struct find_free_extent_ctl *ffe_ctl, 4139 struct btrfs_space_info *space_info, 4140 struct btrfs_key *ins) 4141 { 4142 switch (ffe_ctl->policy) { 4143 case BTRFS_EXTENT_ALLOC_CLUSTERED: 4144 return prepare_allocation_clustered(fs_info, ffe_ctl, 4145 space_info, ins); 4146 case BTRFS_EXTENT_ALLOC_ZONED: 4147 if (ffe_ctl->for_treelog) { 4148 spin_lock(&fs_info->treelog_bg_lock); 4149 if (fs_info->treelog_bg) 4150 ffe_ctl->hint_byte = fs_info->treelog_bg; 4151 spin_unlock(&fs_info->treelog_bg_lock); 4152 } 4153 if (ffe_ctl->for_data_reloc) { 4154 spin_lock(&fs_info->relocation_bg_lock); 4155 if (fs_info->data_reloc_bg) 4156 ffe_ctl->hint_byte = fs_info->data_reloc_bg; 4157 spin_unlock(&fs_info->relocation_bg_lock); 4158 } 4159 return 0; 4160 default: 4161 BUG(); 4162 } 4163 } 4164 4165 /* 4166 * walks the btree of allocated extents and find a hole of a given size. 4167 * The key ins is changed to record the hole: 4168 * ins->objectid == start position 4169 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4170 * ins->offset == the size of the hole. 4171 * Any available blocks before search_start are skipped. 4172 * 4173 * If there is no suitable free space, we will record the max size of 4174 * the free space extent currently. 4175 * 4176 * The overall logic and call chain: 4177 * 4178 * find_free_extent() 4179 * |- Iterate through all block groups 4180 * | |- Get a valid block group 4181 * | |- Try to do clustered allocation in that block group 4182 * | |- Try to do unclustered allocation in that block group 4183 * | |- Check if the result is valid 4184 * | | |- If valid, then exit 4185 * | |- Jump to next block group 4186 * | 4187 * |- Push harder to find free extents 4188 * |- If not found, re-iterate all block groups 4189 */ 4190 static noinline int find_free_extent(struct btrfs_root *root, 4191 struct btrfs_key *ins, 4192 struct find_free_extent_ctl *ffe_ctl) 4193 { 4194 struct btrfs_fs_info *fs_info = root->fs_info; 4195 int ret = 0; 4196 int cache_block_group_error = 0; 4197 struct btrfs_block_group *block_group = NULL; 4198 struct btrfs_space_info *space_info; 4199 bool full_search = false; 4200 4201 WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); 4202 4203 ffe_ctl->search_start = 0; 4204 /* For clustered allocation */ 4205 ffe_ctl->empty_cluster = 0; 4206 ffe_ctl->last_ptr = NULL; 4207 ffe_ctl->use_cluster = true; 4208 ffe_ctl->have_caching_bg = false; 4209 ffe_ctl->orig_have_caching_bg = false; 4210 ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); 4211 ffe_ctl->loop = 0; 4212 ffe_ctl->retry_uncached = false; 4213 ffe_ctl->cached = 0; 4214 ffe_ctl->max_extent_size = 0; 4215 ffe_ctl->total_free_space = 0; 4216 ffe_ctl->found_offset = 0; 4217 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; 4218 ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); 4219 4220 if (btrfs_is_zoned(fs_info)) 4221 ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; 4222 4223 ins->type = BTRFS_EXTENT_ITEM_KEY; 4224 ins->objectid = 0; 4225 ins->offset = 0; 4226 4227 trace_find_free_extent(root, ffe_ctl); 4228 4229 space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); 4230 if (!space_info) { 4231 btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); 4232 return -ENOSPC; 4233 } 4234 4235 ret = prepare_allocation(fs_info, ffe_ctl, 
space_info, ins); 4236 if (ret < 0) 4237 return ret; 4238 4239 ffe_ctl->search_start = max(ffe_ctl->search_start, 4240 first_logical_byte(fs_info)); 4241 ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); 4242 if (ffe_ctl->search_start == ffe_ctl->hint_byte) { 4243 block_group = btrfs_lookup_block_group(fs_info, 4244 ffe_ctl->search_start); 4245 /* 4246 * we don't want to use the block group if it doesn't match our 4247 * allocation bits, or if its not cached. 4248 * 4249 * However if we are re-searching with an ideal block group 4250 * picked out then we don't care that the block group is cached. 4251 */ 4252 if (block_group && block_group_bits(block_group, ffe_ctl->flags) && 4253 block_group->cached != BTRFS_CACHE_NO) { 4254 down_read(&space_info->groups_sem); 4255 if (list_empty(&block_group->list) || 4256 block_group->ro) { 4257 /* 4258 * someone is removing this block group, 4259 * we can't jump into the have_block_group 4260 * target because our list pointers are not 4261 * valid 4262 */ 4263 btrfs_put_block_group(block_group); 4264 up_read(&space_info->groups_sem); 4265 } else { 4266 ffe_ctl->index = btrfs_bg_flags_to_raid_index( 4267 block_group->flags); 4268 btrfs_lock_block_group(block_group, 4269 ffe_ctl->delalloc); 4270 ffe_ctl->hinted = true; 4271 goto have_block_group; 4272 } 4273 } else if (block_group) { 4274 btrfs_put_block_group(block_group); 4275 } 4276 } 4277 search: 4278 trace_find_free_extent_search_loop(root, ffe_ctl); 4279 ffe_ctl->have_caching_bg = false; 4280 if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || 4281 ffe_ctl->index == 0) 4282 full_search = true; 4283 down_read(&space_info->groups_sem); 4284 list_for_each_entry(block_group, 4285 &space_info->block_groups[ffe_ctl->index], list) { 4286 struct btrfs_block_group *bg_ret; 4287 4288 ffe_ctl->hinted = false; 4289 /* If the block group is read-only, we can skip it entirely. */ 4290 if (unlikely(block_group->ro)) { 4291 if (ffe_ctl->for_treelog) 4292 btrfs_clear_treelog_bg(block_group); 4293 if (ffe_ctl->for_data_reloc) 4294 btrfs_clear_data_reloc_bg(block_group); 4295 continue; 4296 } 4297 4298 btrfs_grab_block_group(block_group, ffe_ctl->delalloc); 4299 ffe_ctl->search_start = block_group->start; 4300 4301 /* 4302 * this can happen if we end up cycling through all the 4303 * raid types, but we want to make sure we only allocate 4304 * for the proper type. 4305 */ 4306 if (!block_group_bits(block_group, ffe_ctl->flags)) { 4307 u64 extra = BTRFS_BLOCK_GROUP_DUP | 4308 BTRFS_BLOCK_GROUP_RAID1_MASK | 4309 BTRFS_BLOCK_GROUP_RAID56_MASK | 4310 BTRFS_BLOCK_GROUP_RAID10; 4311 4312 /* 4313 * if they asked for extra copies and this block group 4314 * doesn't provide them, bail. This does allow us to 4315 * fill raid0 from raid1. 4316 */ 4317 if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) 4318 goto loop; 4319 4320 /* 4321 * This block group has different flags than we want. 4322 * It's possible that we have MIXED_GROUP flag but no 4323 * block group is mixed. Just skip such block group. 
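 *
 * For example (illustrative): ffe_ctl->flags may ask for a mixed
 * DATA|METADATA allocation while only plain DATA and plain METADATA
 * block groups exist; block_group_bits() then fails for every group in
 * this list and we simply move on to the next one.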
4324 */ 4325 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4326 continue; 4327 } 4328 4329 have_block_group: 4330 trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); 4331 ffe_ctl->cached = btrfs_block_group_done(block_group); 4332 if (unlikely(!ffe_ctl->cached)) { 4333 ffe_ctl->have_caching_bg = true; 4334 ret = btrfs_cache_block_group(block_group, false); 4335 4336 /* 4337 * If we get ENOMEM here or something else we want to 4338 * try other block groups, because it may not be fatal. 4339 * However if we can't find anything else we need to 4340 * save our return here so that we return the actual 4341 * error that caused problems, not ENOSPC. 4342 */ 4343 if (ret < 0) { 4344 if (!cache_block_group_error) 4345 cache_block_group_error = ret; 4346 ret = 0; 4347 goto loop; 4348 } 4349 ret = 0; 4350 } 4351 4352 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { 4353 if (!cache_block_group_error) 4354 cache_block_group_error = -EIO; 4355 goto loop; 4356 } 4357 4358 if (!find_free_extent_check_size_class(ffe_ctl, block_group)) 4359 goto loop; 4360 4361 bg_ret = NULL; 4362 ret = do_allocation(block_group, ffe_ctl, &bg_ret); 4363 if (ret > 0) 4364 goto loop; 4365 4366 if (bg_ret && bg_ret != block_group) { 4367 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4368 block_group = bg_ret; 4369 } 4370 4371 /* Checks */ 4372 ffe_ctl->search_start = round_up(ffe_ctl->found_offset, 4373 fs_info->stripesize); 4374 4375 /* move on to the next group */ 4376 if (ffe_ctl->search_start + ffe_ctl->num_bytes > 4377 block_group->start + block_group->length) { 4378 btrfs_add_free_space_unused(block_group, 4379 ffe_ctl->found_offset, 4380 ffe_ctl->num_bytes); 4381 goto loop; 4382 } 4383 4384 if (ffe_ctl->found_offset < ffe_ctl->search_start) 4385 btrfs_add_free_space_unused(block_group, 4386 ffe_ctl->found_offset, 4387 ffe_ctl->search_start - ffe_ctl->found_offset); 4388 4389 ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, 4390 ffe_ctl->num_bytes, 4391 ffe_ctl->delalloc, 4392 ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); 4393 if (ret == -EAGAIN) { 4394 btrfs_add_free_space_unused(block_group, 4395 ffe_ctl->found_offset, 4396 ffe_ctl->num_bytes); 4397 goto loop; 4398 } 4399 btrfs_inc_block_group_reservations(block_group); 4400 4401 /* we are all good, lets return */ 4402 ins->objectid = ffe_ctl->search_start; 4403 ins->offset = ffe_ctl->num_bytes; 4404 4405 trace_btrfs_reserve_extent(block_group, ffe_ctl); 4406 btrfs_release_block_group(block_group, ffe_ctl->delalloc); 4407 break; 4408 loop: 4409 if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && 4410 !ffe_ctl->retry_uncached) { 4411 ffe_ctl->retry_uncached = true; 4412 btrfs_wait_block_group_cache_progress(block_group, 4413 ffe_ctl->num_bytes + 4414 ffe_ctl->empty_cluster + 4415 ffe_ctl->empty_size); 4416 goto have_block_group; 4417 } 4418 release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); 4419 cond_resched(); 4420 } 4421 up_read(&space_info->groups_sem); 4422 4423 ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); 4424 if (ret > 0) 4425 goto search; 4426 4427 if (ret == -ENOSPC && !cache_block_group_error) { 4428 /* 4429 * Use ffe_ctl->total_free_space as fallback if we can't find 4430 * any contiguous hole. 
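 *
 * max_extent_size can still be zero here: the per block group shortcuts
 * (e.g. the free_space check at the start of find_free_extent_unclustered())
 * may have bailed out of every group before a contiguous range was ever
 * measured, while still recording the free space they saw in
 * total_free_space.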
4431 */ 4432 if (!ffe_ctl->max_extent_size) 4433 ffe_ctl->max_extent_size = ffe_ctl->total_free_space; 4434 spin_lock(&space_info->lock); 4435 space_info->max_extent_size = ffe_ctl->max_extent_size; 4436 spin_unlock(&space_info->lock); 4437 ins->offset = ffe_ctl->max_extent_size; 4438 } else if (ret == -ENOSPC) { 4439 ret = cache_block_group_error; 4440 } 4441 return ret; 4442 } 4443 4444 /* 4445 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a 4446 * hole that is at least as big as @num_bytes. 4447 * 4448 * @root - The root that will contain this extent 4449 * 4450 * @ram_bytes - The amount of space in ram that @num_bytes take. This 4451 * is used for accounting purposes. This value differs 4452 * from @num_bytes only in the case of compressed extents. 4453 * 4454 * @num_bytes - Number of bytes to allocate on-disk. 4455 * 4456 * @min_alloc_size - Indicates the minimum amount of space that the 4457 * allocator should try to satisfy. In some cases 4458 * @num_bytes may be larger than what is required and if 4459 * the filesystem is fragmented then allocation fails. 4460 * However, the presence of @min_alloc_size gives a 4461 * chance to try and satisfy the smaller allocation. 4462 * 4463 * @empty_size - A hint that you plan on doing more COW. This is the 4464 * size in bytes the allocator should try to find free 4465 * next to the block it returns. This is just a hint and 4466 * may be ignored by the allocator. 4467 * 4468 * @hint_byte - Hint to the allocator to start searching above the byte 4469 * address passed. It might be ignored. 4470 * 4471 * @ins - This key is modified to record the found hole. It will 4472 * have the following values: 4473 * ins->objectid == start position 4474 * ins->flags = BTRFS_EXTENT_ITEM_KEY 4475 * ins->offset == the size of the hole. 4476 * 4477 * @is_data - Boolean flag indicating whether an extent is 4478 * allocated for data (true) or metadata (false) 4479 * 4480 * @delalloc - Boolean flag indicating whether this allocation is for 4481 * delalloc or not. If 'true' data_rwsem of block groups 4482 * is going to be acquired. 4483 * 4484 * 4485 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In 4486 * case -ENOSPC is returned then @ins->offset will contain the size of the 4487 * largest available hole the allocator managed to find. 
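 *
 * A minimal usage sketch, modelled on the metadata caller
 * btrfs_alloc_tree_block() below (illustrative only, error handling
 * trimmed; 'hint' and 'empty_size' are whatever the caller has at hand):
 *
 *	struct btrfs_key ins;
 *	u32 blocksize = fs_info->nodesize;
 *
 *	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
 *				   empty_size, hint, &ins, 0, 0);
 *	if (ret)
 *		return ERR_PTR(ret);
 *
 * On success ins.objectid is the start of the reserved extent and
 * ins.offset its length; if the caller ends up not using the space, the
 * reservation must be returned with btrfs_free_reserved_extent().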
4488 */ 4489 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, 4490 u64 num_bytes, u64 min_alloc_size, 4491 u64 empty_size, u64 hint_byte, 4492 struct btrfs_key *ins, int is_data, int delalloc) 4493 { 4494 struct btrfs_fs_info *fs_info = root->fs_info; 4495 struct find_free_extent_ctl ffe_ctl = {}; 4496 bool final_tried = num_bytes == min_alloc_size; 4497 u64 flags; 4498 int ret; 4499 bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 4500 bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); 4501 4502 flags = get_alloc_profile_by_root(root, is_data); 4503 again: 4504 WARN_ON(num_bytes < fs_info->sectorsize); 4505 4506 ffe_ctl.ram_bytes = ram_bytes; 4507 ffe_ctl.num_bytes = num_bytes; 4508 ffe_ctl.min_alloc_size = min_alloc_size; 4509 ffe_ctl.empty_size = empty_size; 4510 ffe_ctl.flags = flags; 4511 ffe_ctl.delalloc = delalloc; 4512 ffe_ctl.hint_byte = hint_byte; 4513 ffe_ctl.for_treelog = for_treelog; 4514 ffe_ctl.for_data_reloc = for_data_reloc; 4515 4516 ret = find_free_extent(root, ins, &ffe_ctl); 4517 if (!ret && !is_data) { 4518 btrfs_dec_block_group_reservations(fs_info, ins->objectid); 4519 } else if (ret == -ENOSPC) { 4520 if (!final_tried && ins->offset) { 4521 num_bytes = min(num_bytes >> 1, ins->offset); 4522 num_bytes = round_down(num_bytes, 4523 fs_info->sectorsize); 4524 num_bytes = max(num_bytes, min_alloc_size); 4525 ram_bytes = num_bytes; 4526 if (num_bytes == min_alloc_size) 4527 final_tried = true; 4528 goto again; 4529 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 4530 struct btrfs_space_info *sinfo; 4531 4532 sinfo = btrfs_find_space_info(fs_info, flags); 4533 btrfs_err(fs_info, 4534 "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", 4535 flags, num_bytes, for_treelog, for_data_reloc); 4536 if (sinfo) 4537 btrfs_dump_space_info(fs_info, sinfo, 4538 num_bytes, 1); 4539 } 4540 } 4541 4542 return ret; 4543 } 4544 4545 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, 4546 u64 start, u64 len, int delalloc) 4547 { 4548 struct btrfs_block_group *cache; 4549 4550 cache = btrfs_lookup_block_group(fs_info, start); 4551 if (!cache) { 4552 btrfs_err(fs_info, "Unable to find block group for %llu", 4553 start); 4554 return -ENOSPC; 4555 } 4556 4557 btrfs_add_free_space(cache, start, len); 4558 btrfs_free_reserved_bytes(cache, len, delalloc); 4559 trace_btrfs_reserved_extent_free(fs_info, start, len); 4560 4561 btrfs_put_block_group(cache); 4562 return 0; 4563 } 4564 4565 int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, 4566 u64 len) 4567 { 4568 struct btrfs_block_group *cache; 4569 int ret = 0; 4570 4571 cache = btrfs_lookup_block_group(trans->fs_info, start); 4572 if (!cache) { 4573 btrfs_err(trans->fs_info, "unable to find block group for %llu", 4574 start); 4575 return -ENOSPC; 4576 } 4577 4578 ret = pin_down_extent(trans, cache, start, len, 1); 4579 btrfs_put_block_group(cache); 4580 return ret; 4581 } 4582 4583 static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, 4584 u64 num_bytes) 4585 { 4586 struct btrfs_fs_info *fs_info = trans->fs_info; 4587 int ret; 4588 4589 ret = remove_from_free_space_tree(trans, bytenr, num_bytes); 4590 if (ret) 4591 return ret; 4592 4593 ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); 4594 if (ret) { 4595 ASSERT(!ret); 4596 btrfs_err(fs_info, "update block group failed for %llu %llu", 4597 bytenr, num_bytes); 4598 return ret; 4599 } 4600 4601 trace_btrfs_reserved_extent_alloc(fs_info, bytenr, 
num_bytes); 4602 return 0; 4603 } 4604 4605 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4606 u64 parent, u64 root_objectid, 4607 u64 flags, u64 owner, u64 offset, 4608 struct btrfs_key *ins, int ref_mod) 4609 { 4610 struct btrfs_fs_info *fs_info = trans->fs_info; 4611 struct btrfs_root *extent_root; 4612 int ret; 4613 struct btrfs_extent_item *extent_item; 4614 struct btrfs_extent_inline_ref *iref; 4615 struct btrfs_path *path; 4616 struct extent_buffer *leaf; 4617 int type; 4618 u32 size; 4619 4620 if (parent > 0) 4621 type = BTRFS_SHARED_DATA_REF_KEY; 4622 else 4623 type = BTRFS_EXTENT_DATA_REF_KEY; 4624 4625 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 4626 4627 path = btrfs_alloc_path(); 4628 if (!path) 4629 return -ENOMEM; 4630 4631 extent_root = btrfs_extent_root(fs_info, ins->objectid); 4632 ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); 4633 if (ret) { 4634 btrfs_free_path(path); 4635 return ret; 4636 } 4637 4638 leaf = path->nodes[0]; 4639 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4640 struct btrfs_extent_item); 4641 btrfs_set_extent_refs(leaf, extent_item, ref_mod); 4642 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4643 btrfs_set_extent_flags(leaf, extent_item, 4644 flags | BTRFS_EXTENT_FLAG_DATA); 4645 4646 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4647 btrfs_set_extent_inline_ref_type(leaf, iref, type); 4648 if (parent > 0) { 4649 struct btrfs_shared_data_ref *ref; 4650 ref = (struct btrfs_shared_data_ref *)(iref + 1); 4651 btrfs_set_extent_inline_ref_offset(leaf, iref, parent); 4652 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); 4653 } else { 4654 struct btrfs_extent_data_ref *ref; 4655 ref = (struct btrfs_extent_data_ref *)(&iref->offset); 4656 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); 4657 btrfs_set_extent_data_ref_objectid(leaf, ref, owner); 4658 btrfs_set_extent_data_ref_offset(leaf, ref, offset); 4659 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); 4660 } 4661 4662 btrfs_mark_buffer_dirty(path->nodes[0]); 4663 btrfs_free_path(path); 4664 4665 return alloc_reserved_extent(trans, ins->objectid, ins->offset); 4666 } 4667 4668 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, 4669 struct btrfs_delayed_ref_node *node, 4670 struct btrfs_delayed_extent_op *extent_op) 4671 { 4672 struct btrfs_fs_info *fs_info = trans->fs_info; 4673 struct btrfs_root *extent_root; 4674 int ret; 4675 struct btrfs_extent_item *extent_item; 4676 struct btrfs_key extent_key; 4677 struct btrfs_tree_block_info *block_info; 4678 struct btrfs_extent_inline_ref *iref; 4679 struct btrfs_path *path; 4680 struct extent_buffer *leaf; 4681 struct btrfs_delayed_tree_ref *ref; 4682 u32 size = sizeof(*extent_item) + sizeof(*iref); 4683 u64 flags = extent_op->flags_to_set; 4684 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 4685 4686 ref = btrfs_delayed_node_to_tree_ref(node); 4687 4688 extent_key.objectid = node->bytenr; 4689 if (skinny_metadata) { 4690 extent_key.offset = ref->level; 4691 extent_key.type = BTRFS_METADATA_ITEM_KEY; 4692 } else { 4693 extent_key.offset = node->num_bytes; 4694 extent_key.type = BTRFS_EXTENT_ITEM_KEY; 4695 size += sizeof(*block_info); 4696 } 4697 4698 path = btrfs_alloc_path(); 4699 if (!path) 4700 return -ENOMEM; 4701 4702 extent_root = btrfs_extent_root(fs_info, extent_key.objectid); 4703 ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, 4704 size); 4705 if (ret) { 4706 
btrfs_free_path(path); 4707 return ret; 4708 } 4709 4710 leaf = path->nodes[0]; 4711 extent_item = btrfs_item_ptr(leaf, path->slots[0], 4712 struct btrfs_extent_item); 4713 btrfs_set_extent_refs(leaf, extent_item, 1); 4714 btrfs_set_extent_generation(leaf, extent_item, trans->transid); 4715 btrfs_set_extent_flags(leaf, extent_item, 4716 flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); 4717 4718 if (skinny_metadata) { 4719 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 4720 } else { 4721 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 4722 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); 4723 btrfs_set_tree_block_level(leaf, block_info, ref->level); 4724 iref = (struct btrfs_extent_inline_ref *)(block_info + 1); 4725 } 4726 4727 if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 4728 btrfs_set_extent_inline_ref_type(leaf, iref, 4729 BTRFS_SHARED_BLOCK_REF_KEY); 4730 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); 4731 } else { 4732 btrfs_set_extent_inline_ref_type(leaf, iref, 4733 BTRFS_TREE_BLOCK_REF_KEY); 4734 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); 4735 } 4736 4737 btrfs_mark_buffer_dirty(leaf); 4738 btrfs_free_path(path); 4739 4740 return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); 4741 } 4742 4743 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 4744 struct btrfs_root *root, u64 owner, 4745 u64 offset, u64 ram_bytes, 4746 struct btrfs_key *ins) 4747 { 4748 struct btrfs_ref generic_ref = { 0 }; 4749 4750 BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); 4751 4752 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 4753 ins->objectid, ins->offset, 0); 4754 btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, 4755 offset, 0, false); 4756 btrfs_ref_tree_mod(root->fs_info, &generic_ref); 4757 4758 return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); 4759 } 4760 4761 /* 4762 * this is used by the tree logging recovery code. It records that 4763 * an extent has been allocated and makes sure to clear the free 4764 * space cache bits as well 4765 */ 4766 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, 4767 u64 root_objectid, u64 owner, u64 offset, 4768 struct btrfs_key *ins) 4769 { 4770 struct btrfs_fs_info *fs_info = trans->fs_info; 4771 int ret; 4772 struct btrfs_block_group *block_group; 4773 struct btrfs_space_info *space_info; 4774 4775 /* 4776 * Mixed block groups will exclude before processing the log so we only 4777 * need to do the exclude dance if this fs isn't mixed. 
4778 */ 4779 if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 4780 ret = __exclude_logged_extent(fs_info, ins->objectid, 4781 ins->offset); 4782 if (ret) 4783 return ret; 4784 } 4785 4786 block_group = btrfs_lookup_block_group(fs_info, ins->objectid); 4787 if (!block_group) 4788 return -EINVAL; 4789 4790 space_info = block_group->space_info; 4791 spin_lock(&space_info->lock); 4792 spin_lock(&block_group->lock); 4793 space_info->bytes_reserved += ins->offset; 4794 block_group->reserved += ins->offset; 4795 spin_unlock(&block_group->lock); 4796 spin_unlock(&space_info->lock); 4797 4798 ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, 4799 offset, ins, 1); 4800 if (ret) 4801 btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); 4802 btrfs_put_block_group(block_group); 4803 return ret; 4804 } 4805 4806 static struct extent_buffer * 4807 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4808 u64 bytenr, int level, u64 owner, 4809 enum btrfs_lock_nesting nest) 4810 { 4811 struct btrfs_fs_info *fs_info = root->fs_info; 4812 struct extent_buffer *buf; 4813 u64 lockdep_owner = owner; 4814 4815 buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); 4816 if (IS_ERR(buf)) 4817 return buf; 4818 4819 /* 4820 * Extra safety check in case the extent tree is corrupted and extent 4821 * allocator chooses to use a tree block which is already used and 4822 * locked. 4823 */ 4824 if (buf->lock_owner == current->pid) { 4825 btrfs_err_rl(fs_info, 4826 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", 4827 buf->start, btrfs_header_owner(buf), current->pid); 4828 free_extent_buffer(buf); 4829 return ERR_PTR(-EUCLEAN); 4830 } 4831 4832 /* 4833 * The reloc trees are just snapshots, so we need them to appear to be 4834 * just like any other fs tree WRT lockdep. 4835 * 4836 * The exception however is in replace_path() in relocation, where we 4837 * hold the lock on the original fs root and then search for the reloc 4838 * root. At that point we need to make sure any reloc root buffers are 4839 * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make 4840 * lockdep happy. 4841 */ 4842 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && 4843 !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) 4844 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 4845 4846 /* btrfs_clear_buffer_dirty() accesses generation field. */ 4847 btrfs_set_header_generation(buf, trans->transid); 4848 4849 /* 4850 * This needs to stay, because we could allocate a freed block from an 4851 * old tree into a new tree, so we need to make sure this new block is 4852 * set to the appropriate level and owner. 
4853 */ 4854 btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); 4855 4856 __btrfs_tree_lock(buf, nest); 4857 btrfs_clear_buffer_dirty(trans, buf); 4858 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 4859 clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); 4860 4861 set_extent_buffer_uptodate(buf); 4862 4863 memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); 4864 btrfs_set_header_level(buf, level); 4865 btrfs_set_header_bytenr(buf, buf->start); 4866 btrfs_set_header_generation(buf, trans->transid); 4867 btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); 4868 btrfs_set_header_owner(buf, owner); 4869 write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); 4870 write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); 4871 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 4872 buf->log_index = root->log_transid % 2; 4873 /* 4874 * we allow two log transactions at a time, use different 4875 * EXTENT bit to differentiate dirty pages. 4876 */ 4877 if (buf->log_index == 0) 4878 set_extent_bit(&root->dirty_log_pages, buf->start, 4879 buf->start + buf->len - 1, 4880 EXTENT_DIRTY, NULL); 4881 else 4882 set_extent_bit(&root->dirty_log_pages, buf->start, 4883 buf->start + buf->len - 1, 4884 EXTENT_NEW, NULL); 4885 } else { 4886 buf->log_index = -1; 4887 set_extent_bit(&trans->transaction->dirty_pages, buf->start, 4888 buf->start + buf->len - 1, EXTENT_DIRTY, NULL); 4889 } 4890 /* this returns a buffer locked for blocking */ 4891 return buf; 4892 } 4893 4894 /* 4895 * finds a free extent and does all the dirty work required for allocation 4896 * returns the tree buffer or an ERR_PTR on error. 4897 */ 4898 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, 4899 struct btrfs_root *root, 4900 u64 parent, u64 root_objectid, 4901 const struct btrfs_disk_key *key, 4902 int level, u64 hint, 4903 u64 empty_size, 4904 enum btrfs_lock_nesting nest) 4905 { 4906 struct btrfs_fs_info *fs_info = root->fs_info; 4907 struct btrfs_key ins; 4908 struct btrfs_block_rsv *block_rsv; 4909 struct extent_buffer *buf; 4910 struct btrfs_delayed_extent_op *extent_op; 4911 struct btrfs_ref generic_ref = { 0 }; 4912 u64 flags = 0; 4913 int ret; 4914 u32 blocksize = fs_info->nodesize; 4915 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 4916 4917 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4918 if (btrfs_is_testing(fs_info)) { 4919 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 4920 level, root_objectid, nest); 4921 if (!IS_ERR(buf)) 4922 root->alloc_bytenr += blocksize; 4923 return buf; 4924 } 4925 #endif 4926 4927 block_rsv = btrfs_use_block_rsv(trans, root, blocksize); 4928 if (IS_ERR(block_rsv)) 4929 return ERR_CAST(block_rsv); 4930 4931 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, 4932 empty_size, hint, &ins, 0, 0); 4933 if (ret) 4934 goto out_unuse; 4935 4936 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, 4937 root_objectid, nest); 4938 if (IS_ERR(buf)) { 4939 ret = PTR_ERR(buf); 4940 goto out_free_reserved; 4941 } 4942 4943 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 4944 if (parent == 0) 4945 parent = ins.objectid; 4946 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 4947 } else 4948 BUG_ON(parent > 0); 4949 4950 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 4951 extent_op = btrfs_alloc_delayed_extent_op(); 4952 if (!extent_op) { 4953 ret = -ENOMEM; 4954 goto out_free_buf; 4955 } 4956 if (key) 4957 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 4958 else 4959 
memset(&extent_op->key, 0, sizeof(extent_op->key)); 4960 extent_op->flags_to_set = flags; 4961 extent_op->update_key = skinny_metadata ? false : true; 4962 extent_op->update_flags = true; 4963 extent_op->level = level; 4964 4965 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, 4966 ins.objectid, ins.offset, parent); 4967 btrfs_init_tree_ref(&generic_ref, level, root_objectid, 4968 root->root_key.objectid, false); 4969 btrfs_ref_tree_mod(fs_info, &generic_ref); 4970 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); 4971 if (ret) 4972 goto out_free_delayed; 4973 } 4974 return buf; 4975 4976 out_free_delayed: 4977 btrfs_free_delayed_extent_op(extent_op); 4978 out_free_buf: 4979 btrfs_tree_unlock(buf); 4980 free_extent_buffer(buf); 4981 out_free_reserved: 4982 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); 4983 out_unuse: 4984 btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); 4985 return ERR_PTR(ret); 4986 } 4987 4988 struct walk_control { 4989 u64 refs[BTRFS_MAX_LEVEL]; 4990 u64 flags[BTRFS_MAX_LEVEL]; 4991 struct btrfs_key update_progress; 4992 struct btrfs_key drop_progress; 4993 int drop_level; 4994 int stage; 4995 int level; 4996 int shared_level; 4997 int update_ref; 4998 int keep_locks; 4999 int reada_slot; 5000 int reada_count; 5001 int restarted; 5002 }; 5003 5004 #define DROP_REFERENCE 1 5005 #define UPDATE_BACKREF 2 5006 5007 static noinline void reada_walk_down(struct btrfs_trans_handle *trans, 5008 struct btrfs_root *root, 5009 struct walk_control *wc, 5010 struct btrfs_path *path) 5011 { 5012 struct btrfs_fs_info *fs_info = root->fs_info; 5013 u64 bytenr; 5014 u64 generation; 5015 u64 refs; 5016 u64 flags; 5017 u32 nritems; 5018 struct btrfs_key key; 5019 struct extent_buffer *eb; 5020 int ret; 5021 int slot; 5022 int nread = 0; 5023 5024 if (path->slots[wc->level] < wc->reada_slot) { 5025 wc->reada_count = wc->reada_count * 2 / 3; 5026 wc->reada_count = max(wc->reada_count, 2); 5027 } else { 5028 wc->reada_count = wc->reada_count * 3 / 2; 5029 wc->reada_count = min_t(int, wc->reada_count, 5030 BTRFS_NODEPTRS_PER_BLOCK(fs_info)); 5031 } 5032 5033 eb = path->nodes[wc->level]; 5034 nritems = btrfs_header_nritems(eb); 5035 5036 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 5037 if (nread >= wc->reada_count) 5038 break; 5039 5040 cond_resched(); 5041 bytenr = btrfs_node_blockptr(eb, slot); 5042 generation = btrfs_node_ptr_generation(eb, slot); 5043 5044 if (slot == path->slots[wc->level]) 5045 goto reada; 5046 5047 if (wc->stage == UPDATE_BACKREF && 5048 generation <= root->root_key.offset) 5049 continue; 5050 5051 /* We don't lock the tree block, it's OK to be racy here */ 5052 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, 5053 wc->level - 1, 1, &refs, 5054 &flags); 5055 /* We don't care about errors in readahead. 
*/ 5056 if (ret < 0) 5057 continue; 5058 BUG_ON(refs == 0); 5059 5060 if (wc->stage == DROP_REFERENCE) { 5061 if (refs == 1) 5062 goto reada; 5063 5064 if (wc->level == 1 && 5065 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5066 continue; 5067 if (!wc->update_ref || 5068 generation <= root->root_key.offset) 5069 continue; 5070 btrfs_node_key_to_cpu(eb, &key, slot); 5071 ret = btrfs_comp_cpu_keys(&key, 5072 &wc->update_progress); 5073 if (ret < 0) 5074 continue; 5075 } else { 5076 if (wc->level == 1 && 5077 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5078 continue; 5079 } 5080 reada: 5081 btrfs_readahead_node_child(eb, slot); 5082 nread++; 5083 } 5084 wc->reada_slot = slot; 5085 } 5086 5087 /* 5088 * helper to process tree block while walking down the tree. 5089 * 5090 * when wc->stage == UPDATE_BACKREF, this function updates 5091 * back refs for pointers in the block. 5092 * 5093 * NOTE: return value 1 means we should stop walking down. 5094 */ 5095 static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5096 struct btrfs_root *root, 5097 struct btrfs_path *path, 5098 struct walk_control *wc, int lookup_info) 5099 { 5100 struct btrfs_fs_info *fs_info = root->fs_info; 5101 int level = wc->level; 5102 struct extent_buffer *eb = path->nodes[level]; 5103 u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; 5104 int ret; 5105 5106 if (wc->stage == UPDATE_BACKREF && 5107 btrfs_header_owner(eb) != root->root_key.objectid) 5108 return 1; 5109 5110 /* 5111 * when reference count of tree block is 1, it won't increase 5112 * again. once full backref flag is set, we never clear it. 5113 */ 5114 if (lookup_info && 5115 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5116 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { 5117 BUG_ON(!path->locks[level]); 5118 ret = btrfs_lookup_extent_info(trans, fs_info, 5119 eb->start, level, 1, 5120 &wc->refs[level], 5121 &wc->flags[level]); 5122 BUG_ON(ret == -ENOMEM); 5123 if (ret) 5124 return ret; 5125 BUG_ON(wc->refs[level] == 0); 5126 } 5127 5128 if (wc->stage == DROP_REFERENCE) { 5129 if (wc->refs[level] > 1) 5130 return 1; 5131 5132 if (path->locks[level] && !wc->keep_locks) { 5133 btrfs_tree_unlock_rw(eb, path->locks[level]); 5134 path->locks[level] = 0; 5135 } 5136 return 0; 5137 } 5138 5139 /* wc->stage == UPDATE_BACKREF */ 5140 if (!(wc->flags[level] & flag)) { 5141 BUG_ON(!path->locks[level]); 5142 ret = btrfs_inc_ref(trans, root, eb, 1); 5143 BUG_ON(ret); /* -ENOMEM */ 5144 ret = btrfs_dec_ref(trans, root, eb, 0); 5145 BUG_ON(ret); /* -ENOMEM */ 5146 ret = btrfs_set_disk_extent_flags(trans, eb, flag); 5147 BUG_ON(ret); /* -ENOMEM */ 5148 wc->flags[level] |= flag; 5149 } 5150 5151 /* 5152 * the block is shared by multiple trees, so it's not good to 5153 * keep the tree lock 5154 */ 5155 if (path->locks[level] && level > 0) { 5156 btrfs_tree_unlock_rw(eb, path->locks[level]); 5157 path->locks[level] = 0; 5158 } 5159 return 0; 5160 } 5161 5162 /* 5163 * This is used to verify a ref exists for this root to deal with a bug where we 5164 * would have a drop_progress key that hadn't been updated properly. 
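 *
 * Without this check, resuming a partially dropped snapshot with a stale
 * drop_progress could drop a reference this root no longer holds and
 * corrupt the extent tree; see the wc->restarted handling in
 * do_walk_down(), which skips the free when no ref is found.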
5165 */ 5166 static int check_ref_exists(struct btrfs_trans_handle *trans, 5167 struct btrfs_root *root, u64 bytenr, u64 parent, 5168 int level) 5169 { 5170 struct btrfs_path *path; 5171 struct btrfs_extent_inline_ref *iref; 5172 int ret; 5173 5174 path = btrfs_alloc_path(); 5175 if (!path) 5176 return -ENOMEM; 5177 5178 ret = lookup_extent_backref(trans, path, &iref, bytenr, 5179 root->fs_info->nodesize, parent, 5180 root->root_key.objectid, level, 0); 5181 btrfs_free_path(path); 5182 if (ret == -ENOENT) 5183 return 0; 5184 if (ret < 0) 5185 return ret; 5186 return 1; 5187 } 5188 5189 /* 5190 * helper to process tree block pointer. 5191 * 5192 * when wc->stage == DROP_REFERENCE, this function checks 5193 * reference count of the block pointed to. if the block 5194 * is shared and we need update back refs for the subtree 5195 * rooted at the block, this function changes wc->stage to 5196 * UPDATE_BACKREF. if the block is shared and there is no 5197 * need to update back, this function drops the reference 5198 * to the block. 5199 * 5200 * NOTE: return value 1 means we should stop walking down. 5201 */ 5202 static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5203 struct btrfs_root *root, 5204 struct btrfs_path *path, 5205 struct walk_control *wc, int *lookup_info) 5206 { 5207 struct btrfs_fs_info *fs_info = root->fs_info; 5208 u64 bytenr; 5209 u64 generation; 5210 u64 parent; 5211 struct btrfs_tree_parent_check check = { 0 }; 5212 struct btrfs_key key; 5213 struct btrfs_ref ref = { 0 }; 5214 struct extent_buffer *next; 5215 int level = wc->level; 5216 int reada = 0; 5217 int ret = 0; 5218 bool need_account = false; 5219 5220 generation = btrfs_node_ptr_generation(path->nodes[level], 5221 path->slots[level]); 5222 /* 5223 * if the lower level block was created before the snapshot 5224 * was created, we know there is no need to update back refs 5225 * for the subtree 5226 */ 5227 if (wc->stage == UPDATE_BACKREF && 5228 generation <= root->root_key.offset) { 5229 *lookup_info = 1; 5230 return 1; 5231 } 5232 5233 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5234 5235 check.level = level - 1; 5236 check.transid = generation; 5237 check.owner_root = root->root_key.objectid; 5238 check.has_first_key = true; 5239 btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, 5240 path->slots[level]); 5241 5242 next = find_extent_buffer(fs_info, bytenr); 5243 if (!next) { 5244 next = btrfs_find_create_tree_block(fs_info, bytenr, 5245 root->root_key.objectid, level - 1); 5246 if (IS_ERR(next)) 5247 return PTR_ERR(next); 5248 reada = 1; 5249 } 5250 btrfs_tree_lock(next); 5251 5252 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, 5253 &wc->refs[level - 1], 5254 &wc->flags[level - 1]); 5255 if (ret < 0) 5256 goto out_unlock; 5257 5258 if (unlikely(wc->refs[level - 1] == 0)) { 5259 btrfs_err(fs_info, "Missing references."); 5260 ret = -EIO; 5261 goto out_unlock; 5262 } 5263 *lookup_info = 0; 5264 5265 if (wc->stage == DROP_REFERENCE) { 5266 if (wc->refs[level - 1] > 1) { 5267 need_account = true; 5268 if (level == 1 && 5269 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5270 goto skip; 5271 5272 if (!wc->update_ref || 5273 generation <= root->root_key.offset) 5274 goto skip; 5275 5276 btrfs_node_key_to_cpu(path->nodes[level], &key, 5277 path->slots[level]); 5278 ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); 5279 if (ret < 0) 5280 goto skip; 5281 5282 wc->stage = UPDATE_BACKREF; 5283 wc->shared_level = level - 1; 5284 } 5285 } else { 5286 
if (level == 1 && 5287 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 5288 goto skip; 5289 } 5290 5291 if (!btrfs_buffer_uptodate(next, generation, 0)) { 5292 btrfs_tree_unlock(next); 5293 free_extent_buffer(next); 5294 next = NULL; 5295 *lookup_info = 1; 5296 } 5297 5298 if (!next) { 5299 if (reada && level == 1) 5300 reada_walk_down(trans, root, wc, path); 5301 next = read_tree_block(fs_info, bytenr, &check); 5302 if (IS_ERR(next)) { 5303 return PTR_ERR(next); 5304 } else if (!extent_buffer_uptodate(next)) { 5305 free_extent_buffer(next); 5306 return -EIO; 5307 } 5308 btrfs_tree_lock(next); 5309 } 5310 5311 level--; 5312 ASSERT(level == btrfs_header_level(next)); 5313 if (level != btrfs_header_level(next)) { 5314 btrfs_err(root->fs_info, "mismatched level"); 5315 ret = -EIO; 5316 goto out_unlock; 5317 } 5318 path->nodes[level] = next; 5319 path->slots[level] = 0; 5320 path->locks[level] = BTRFS_WRITE_LOCK; 5321 wc->level = level; 5322 if (wc->level == 1) 5323 wc->reada_slot = 0; 5324 return 0; 5325 skip: 5326 wc->refs[level - 1] = 0; 5327 wc->flags[level - 1] = 0; 5328 if (wc->stage == DROP_REFERENCE) { 5329 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5330 parent = path->nodes[level]->start; 5331 } else { 5332 ASSERT(root->root_key.objectid == 5333 btrfs_header_owner(path->nodes[level])); 5334 if (root->root_key.objectid != 5335 btrfs_header_owner(path->nodes[level])) { 5336 btrfs_err(root->fs_info, 5337 "mismatched block owner"); 5338 ret = -EIO; 5339 goto out_unlock; 5340 } 5341 parent = 0; 5342 } 5343 5344 /* 5345 * If we had a drop_progress we need to verify the refs are set 5346 * as expected. If we find our ref then we know that from here 5347 * on out everything should be correct, and we can clear the 5348 * ->restarted flag. 5349 */ 5350 if (wc->restarted) { 5351 ret = check_ref_exists(trans, root, bytenr, parent, 5352 level - 1); 5353 if (ret < 0) 5354 goto out_unlock; 5355 if (ret == 0) 5356 goto no_delete; 5357 ret = 0; 5358 wc->restarted = 0; 5359 } 5360 5361 /* 5362 * Reloc tree doesn't contribute to qgroup numbers, and we have 5363 * already accounted them at merge time (replace_path), 5364 * thus we could skip expensive subtree trace here. 5365 */ 5366 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 5367 need_account) { 5368 ret = btrfs_qgroup_trace_subtree(trans, next, 5369 generation, level - 1); 5370 if (ret) { 5371 btrfs_err_rl(fs_info, 5372 "Error %d accounting shared subtree. Quota is out of sync, rescan required.", 5373 ret); 5374 } 5375 } 5376 5377 /* 5378 * We need to update the next key in our walk control so we can 5379 * update the drop_progress key accordingly. We don't care if 5380 * find_next_key doesn't find a key because that means we're at 5381 * the end and are going to clean up now. 5382 */ 5383 wc->drop_level = level; 5384 find_next_key(path, level, &wc->drop_progress); 5385 5386 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 5387 fs_info->nodesize, parent); 5388 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 5389 0, false); 5390 ret = btrfs_free_extent(trans, &ref); 5391 if (ret) 5392 goto out_unlock; 5393 } 5394 no_delete: 5395 *lookup_info = 1; 5396 ret = 1; 5397 5398 out_unlock: 5399 btrfs_tree_unlock(next); 5400 free_extent_buffer(next); 5401 5402 return ret; 5403 } 5404 5405 /* 5406 * helper to process tree block while walking up the tree. 5407 * 5408 * when wc->stage == DROP_REFERENCE, this function drops 5409 * reference count on the block. 
5410 * 5411 * when wc->stage == UPDATE_BACKREF, this function changes 5412 * wc->stage back to DROP_REFERENCE if we changed wc->stage 5413 * to UPDATE_BACKREF previously while processing the block. 5414 * 5415 * NOTE: return value 1 means we should stop walking up. 5416 */ 5417 static noinline int walk_up_proc(struct btrfs_trans_handle *trans, 5418 struct btrfs_root *root, 5419 struct btrfs_path *path, 5420 struct walk_control *wc) 5421 { 5422 struct btrfs_fs_info *fs_info = root->fs_info; 5423 int ret; 5424 int level = wc->level; 5425 struct extent_buffer *eb = path->nodes[level]; 5426 u64 parent = 0; 5427 5428 if (wc->stage == UPDATE_BACKREF) { 5429 BUG_ON(wc->shared_level < level); 5430 if (level < wc->shared_level) 5431 goto out; 5432 5433 ret = find_next_key(path, level + 1, &wc->update_progress); 5434 if (ret > 0) 5435 wc->update_ref = 0; 5436 5437 wc->stage = DROP_REFERENCE; 5438 wc->shared_level = -1; 5439 path->slots[level] = 0; 5440 5441 /* 5442 * check reference count again if the block isn't locked. 5443 * we should start walking down the tree again if reference 5444 * count is one. 5445 */ 5446 if (!path->locks[level]) { 5447 BUG_ON(level == 0); 5448 btrfs_tree_lock(eb); 5449 path->locks[level] = BTRFS_WRITE_LOCK; 5450 5451 ret = btrfs_lookup_extent_info(trans, fs_info, 5452 eb->start, level, 1, 5453 &wc->refs[level], 5454 &wc->flags[level]); 5455 if (ret < 0) { 5456 btrfs_tree_unlock_rw(eb, path->locks[level]); 5457 path->locks[level] = 0; 5458 return ret; 5459 } 5460 BUG_ON(wc->refs[level] == 0); 5461 if (wc->refs[level] == 1) { 5462 btrfs_tree_unlock_rw(eb, path->locks[level]); 5463 path->locks[level] = 0; 5464 return 1; 5465 } 5466 } 5467 } 5468 5469 /* wc->stage == DROP_REFERENCE */ 5470 BUG_ON(wc->refs[level] > 1 && !path->locks[level]); 5471 5472 if (wc->refs[level] == 1) { 5473 if (level == 0) { 5474 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5475 ret = btrfs_dec_ref(trans, root, eb, 1); 5476 else 5477 ret = btrfs_dec_ref(trans, root, eb, 0); 5478 BUG_ON(ret); /* -ENOMEM */ 5479 if (is_fstree(root->root_key.objectid)) { 5480 ret = btrfs_qgroup_trace_leaf_items(trans, eb); 5481 if (ret) { 5482 btrfs_err_rl(fs_info, 5483 "error %d accounting leaf items, quota is out of sync, rescan required", 5484 ret); 5485 } 5486 } 5487 } 5488 /* Make block locked assertion in btrfs_clear_buffer_dirty happy. 
*/ 5489 if (!path->locks[level]) { 5490 btrfs_tree_lock(eb); 5491 path->locks[level] = BTRFS_WRITE_LOCK; 5492 } 5493 btrfs_clear_buffer_dirty(trans, eb); 5494 } 5495 5496 if (eb == root->node) { 5497 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5498 parent = eb->start; 5499 else if (root->root_key.objectid != btrfs_header_owner(eb)) 5500 goto owner_mismatch; 5501 } else { 5502 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 5503 parent = path->nodes[level + 1]->start; 5504 else if (root->root_key.objectid != 5505 btrfs_header_owner(path->nodes[level + 1])) 5506 goto owner_mismatch; 5507 } 5508 5509 btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, 5510 wc->refs[level] == 1); 5511 out: 5512 wc->refs[level] = 0; 5513 wc->flags[level] = 0; 5514 return 0; 5515 5516 owner_mismatch: 5517 btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", 5518 btrfs_header_owner(eb), root->root_key.objectid); 5519 return -EUCLEAN; 5520 } 5521 5522 static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5523 struct btrfs_root *root, 5524 struct btrfs_path *path, 5525 struct walk_control *wc) 5526 { 5527 int level = wc->level; 5528 int lookup_info = 1; 5529 int ret = 0; 5530 5531 while (level >= 0) { 5532 ret = walk_down_proc(trans, root, path, wc, lookup_info); 5533 if (ret) 5534 break; 5535 5536 if (level == 0) 5537 break; 5538 5539 if (path->slots[level] >= 5540 btrfs_header_nritems(path->nodes[level])) 5541 break; 5542 5543 ret = do_walk_down(trans, root, path, wc, &lookup_info); 5544 if (ret > 0) { 5545 path->slots[level]++; 5546 continue; 5547 } else if (ret < 0) 5548 break; 5549 level = wc->level; 5550 } 5551 return (ret == 1) ? 0 : ret; 5552 } 5553 5554 static noinline int walk_up_tree(struct btrfs_trans_handle *trans, 5555 struct btrfs_root *root, 5556 struct btrfs_path *path, 5557 struct walk_control *wc, int max_level) 5558 { 5559 int level = wc->level; 5560 int ret; 5561 5562 path->slots[level] = btrfs_header_nritems(path->nodes[level]); 5563 while (level < max_level && path->nodes[level]) { 5564 wc->level = level; 5565 if (path->slots[level] + 1 < 5566 btrfs_header_nritems(path->nodes[level])) { 5567 path->slots[level]++; 5568 return 0; 5569 } else { 5570 ret = walk_up_proc(trans, root, path, wc); 5571 if (ret > 0) 5572 return 0; 5573 if (ret < 0) 5574 return ret; 5575 5576 if (path->locks[level]) { 5577 btrfs_tree_unlock_rw(path->nodes[level], 5578 path->locks[level]); 5579 path->locks[level] = 0; 5580 } 5581 free_extent_buffer(path->nodes[level]); 5582 path->nodes[level] = NULL; 5583 level++; 5584 } 5585 } 5586 return 1; 5587 } 5588 5589 /* 5590 * drop a subvolume tree. 5591 * 5592 * this function traverses the tree freeing any blocks that are only 5593 * referenced by the tree. 5594 * 5595 * when a shared tree block is found, this function decreases its 5596 * reference count by one. if update_ref is true, this function 5597 * also makes sure backrefs for the shared block and all lower level 5598 * blocks are properly updated.
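 *
 * Typical callers are the cleaner thread (btrfs_clean_one_deleted_snapshot())
 * with for_reloc == 0, and the relocation code once it is done with a reloc
 * root, with for_reloc == 1. A rough sketch of the cleaner style call
 * (illustrative only, the real caller picks update_ref from the root's
 * backref rev):
 *
 *	ret = btrfs_drop_snapshot(root, 0, 0);
 *
 * If that returns -EAGAIN the drop was interrupted: the root is put back on
 * the dead roots list below and the walk resumes later from the
 * drop_progress key saved in the root item.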
5599 * 5600 * If called with for_reloc == 0, may exit early with -EAGAIN 5601 */ 5602 int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) 5603 { 5604 const bool is_reloc_root = (root->root_key.objectid == 5605 BTRFS_TREE_RELOC_OBJECTID); 5606 struct btrfs_fs_info *fs_info = root->fs_info; 5607 struct btrfs_path *path; 5608 struct btrfs_trans_handle *trans; 5609 struct btrfs_root *tree_root = fs_info->tree_root; 5610 struct btrfs_root_item *root_item = &root->root_item; 5611 struct walk_control *wc; 5612 struct btrfs_key key; 5613 int err = 0; 5614 int ret; 5615 int level; 5616 bool root_dropped = false; 5617 bool unfinished_drop = false; 5618 5619 btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); 5620 5621 path = btrfs_alloc_path(); 5622 if (!path) { 5623 err = -ENOMEM; 5624 goto out; 5625 } 5626 5627 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5628 if (!wc) { 5629 btrfs_free_path(path); 5630 err = -ENOMEM; 5631 goto out; 5632 } 5633 5634 /* 5635 * Use join to avoid potential EINTR from transaction start. See 5636 * wait_reserve_ticket and the whole reservation callchain. 5637 */ 5638 if (for_reloc) 5639 trans = btrfs_join_transaction(tree_root); 5640 else 5641 trans = btrfs_start_transaction(tree_root, 0); 5642 if (IS_ERR(trans)) { 5643 err = PTR_ERR(trans); 5644 goto out_free; 5645 } 5646 5647 err = btrfs_run_delayed_items(trans); 5648 if (err) 5649 goto out_end_trans; 5650 5651 /* 5652 * This will help us catch people modifying the fs tree while we're 5653 * dropping it. It is unsafe to mess with the fs tree while it's being 5654 * dropped as we unlock the root node and parent nodes as we walk down 5655 * the tree, assuming nothing will change. If something does change 5656 * then we'll have stale information and drop references to blocks we've 5657 * already dropped. 
5658 */ 5659 set_bit(BTRFS_ROOT_DELETING, &root->state); 5660 unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); 5661 5662 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5663 level = btrfs_header_level(root->node); 5664 path->nodes[level] = btrfs_lock_root_node(root); 5665 path->slots[level] = 0; 5666 path->locks[level] = BTRFS_WRITE_LOCK; 5667 memset(&wc->update_progress, 0, 5668 sizeof(wc->update_progress)); 5669 } else { 5670 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 5671 memcpy(&wc->update_progress, &key, 5672 sizeof(wc->update_progress)); 5673 5674 level = btrfs_root_drop_level(root_item); 5675 BUG_ON(level == 0); 5676 path->lowest_level = level; 5677 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5678 path->lowest_level = 0; 5679 if (ret < 0) { 5680 err = ret; 5681 goto out_end_trans; 5682 } 5683 WARN_ON(ret > 0); 5684 5685 /* 5686 * unlock our path, this is safe because only this 5687 * function is allowed to delete this snapshot 5688 */ 5689 btrfs_unlock_up_safe(path, 0); 5690 5691 level = btrfs_header_level(root->node); 5692 while (1) { 5693 btrfs_tree_lock(path->nodes[level]); 5694 path->locks[level] = BTRFS_WRITE_LOCK; 5695 5696 ret = btrfs_lookup_extent_info(trans, fs_info, 5697 path->nodes[level]->start, 5698 level, 1, &wc->refs[level], 5699 &wc->flags[level]); 5700 if (ret < 0) { 5701 err = ret; 5702 goto out_end_trans; 5703 } 5704 BUG_ON(wc->refs[level] == 0); 5705 5706 if (level == btrfs_root_drop_level(root_item)) 5707 break; 5708 5709 btrfs_tree_unlock(path->nodes[level]); 5710 path->locks[level] = 0; 5711 WARN_ON(wc->refs[level] != 1); 5712 level--; 5713 } 5714 } 5715 5716 wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); 5717 wc->level = level; 5718 wc->shared_level = -1; 5719 wc->stage = DROP_REFERENCE; 5720 wc->update_ref = update_ref; 5721 wc->keep_locks = 0; 5722 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); 5723 5724 while (1) { 5725 5726 ret = walk_down_tree(trans, root, path, wc); 5727 if (ret < 0) { 5728 btrfs_abort_transaction(trans, ret); 5729 err = ret; 5730 break; 5731 } 5732 5733 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); 5734 if (ret < 0) { 5735 btrfs_abort_transaction(trans, ret); 5736 err = ret; 5737 break; 5738 } 5739 5740 if (ret > 0) { 5741 BUG_ON(wc->stage != DROP_REFERENCE); 5742 break; 5743 } 5744 5745 if (wc->stage == DROP_REFERENCE) { 5746 wc->drop_level = wc->level; 5747 btrfs_node_key_to_cpu(path->nodes[wc->drop_level], 5748 &wc->drop_progress, 5749 path->slots[wc->drop_level]); 5750 } 5751 btrfs_cpu_key_to_disk(&root_item->drop_progress, 5752 &wc->drop_progress); 5753 btrfs_set_root_drop_level(root_item, wc->drop_level); 5754 5755 BUG_ON(wc->level == 0); 5756 if (btrfs_should_end_transaction(trans) || 5757 (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { 5758 ret = btrfs_update_root(trans, tree_root, 5759 &root->root_key, 5760 root_item); 5761 if (ret) { 5762 btrfs_abort_transaction(trans, ret); 5763 err = ret; 5764 goto out_end_trans; 5765 } 5766 5767 if (!is_reloc_root) 5768 btrfs_set_last_root_drop_gen(fs_info, trans->transid); 5769 5770 btrfs_end_transaction_throttle(trans); 5771 if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { 5772 btrfs_debug(fs_info, 5773 "drop snapshot early exit"); 5774 err = -EAGAIN; 5775 goto out_free; 5776 } 5777 5778 /* 5779 * Use join to avoid potential EINTR from transaction 5780 * start. See wait_reserve_ticket and the whole 5781 * reservation callchain. 
*/
5783 if (for_reloc)
5784 trans = btrfs_join_transaction(tree_root);
5785 else
5786 trans = btrfs_start_transaction(tree_root, 0);
5787 if (IS_ERR(trans)) {
5788 err = PTR_ERR(trans);
5789 goto out_free;
5790 }
5791 }
5792 }
5793 btrfs_release_path(path);
5794 if (err)
5795 goto out_end_trans;
5796
5797 ret = btrfs_del_root(trans, &root->root_key);
5798 if (ret) {
5799 btrfs_abort_transaction(trans, ret);
5800 err = ret;
5801 goto out_end_trans;
5802 }
5803
5804 if (!is_reloc_root) {
5805 ret = btrfs_find_root(tree_root, &root->root_key, path,
5806 NULL, NULL);
5807 if (ret < 0) {
5808 btrfs_abort_transaction(trans, ret);
5809 err = ret;
5810 goto out_end_trans;
5811 } else if (ret > 0) {
5812 /* if we fail to delete the orphan item this time
5813 * around, it'll get picked up the next time.
5814 *
5815 * The most common failure here is just -ENOENT.
5816 */
5817 btrfs_del_orphan_item(trans, tree_root,
5818 root->root_key.objectid);
5819 }
5820 }
5821
5822 /*
5823 * This subvolume is going to be completely dropped, and won't be
5824 * recorded as a dirty root, thus its pertrans meta rsv will not be
5825 * freed at transaction commit time. So free it here manually.
5826 */
5827 btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
5828 btrfs_qgroup_free_meta_all_pertrans(root);
5829
5830 if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
5831 btrfs_add_dropped_root(trans, root);
5832 else
5833 btrfs_put_root(root);
5834 root_dropped = true;
5835 out_end_trans:
5836 if (!is_reloc_root)
5837 btrfs_set_last_root_drop_gen(fs_info, trans->transid);
5838
5839 btrfs_end_transaction_throttle(trans);
5840 out_free:
5841 kfree(wc);
5842 btrfs_free_path(path);
5843 out:
5844 /*
5845 * If we were an unfinished drop root, check to see if there are any
5846 * other pending drops, and if not, clear the state and wake up any
5847 * waiters.
5848 */
5849 if (!err && unfinished_drop)
5850 btrfs_maybe_wake_unfinished_drop(fs_info);
5851
5852 /*
5853 * So if we need to stop dropping the snapshot for whatever reason, we
5854 * need to make sure to add it back to the dead root list so that we
5855 * keep trying to do the work later. This also cleans up the root if we
5856 * don't have it in the radix (like when we recover after a power fail
5857 * or unmount) so we don't leak memory.
5858 */
5859 if (!for_reloc && !root_dropped)
5860 btrfs_add_dead_root(root);
5861 return err;
5862 }
5863
5864 /*
5865 * drop subtree rooted at tree block 'node'.
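 *
 * The subtree is walked with the same walk_down_tree()/walk_up_tree()
 * machinery as btrfs_drop_snapshot(), with the walk bounded at the level
 * of 'parent'.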
5865 *
5866 * NOTE: this function will unlock and release tree block 'node'.
5867 * It is only used by the relocation code.
5868 */
5869 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5870 struct btrfs_root *root,
5871 struct extent_buffer *node,
5872 struct extent_buffer *parent)
5873 {
5874 struct btrfs_fs_info *fs_info = root->fs_info;
5875 struct btrfs_path *path;
5876 struct walk_control *wc;
5877 int level;
5878 int parent_level;
5879 int ret = 0;
5880 int wret;
5881
5882 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5883
5884 path = btrfs_alloc_path();
5885 if (!path)
5886 return -ENOMEM;
5887
5888 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5889 if (!wc) {
5890 btrfs_free_path(path);
5891 return -ENOMEM;
5892 }
5893
5894 btrfs_assert_tree_write_locked(parent);
5895 parent_level = btrfs_header_level(parent);
5896 atomic_inc(&parent->refs);
5897 path->nodes[parent_level] = parent;
5898 path->slots[parent_level] = btrfs_header_nritems(parent);
5899
5900 btrfs_assert_tree_write_locked(node);
5901 level = btrfs_header_level(node);
5902 path->nodes[level] = node;
5903 path->slots[level] = 0;
5904 path->locks[level] = BTRFS_WRITE_LOCK;
5905
5906 wc->refs[parent_level] = 1;
5907 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5908 wc->level = level;
5909 wc->shared_level = -1;
5910 wc->stage = DROP_REFERENCE;
5911 wc->update_ref = 0;
5912 wc->keep_locks = 1;
5913 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5914
5915 while (1) {
5916 wret = walk_down_tree(trans, root, path, wc);
5917 if (wret < 0) {
5918 ret = wret;
5919 break;
5920 }
5921
5922 wret = walk_up_tree(trans, root, path, wc, parent_level);
5923 if (wret < 0)
5924 ret = wret;
5925 if (wret != 0)
5926 break;
5927 }
5928
5929 kfree(wc);
5930 btrfs_free_path(path);
5931 return ret;
5932 }
5933
5934 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5935 u64 start, u64 end)
5936 {
5937 return unpin_extent_range(fs_info, start, end, false);
5938 }
5939
5940 /*
5941 * It used to be that old block groups would be left around forever.
5942 * Iterating over them would be enough to trim unused space. Since we
5943 * now automatically remove them, we also need to iterate over unallocated
5944 * space.
5945 *
5946 * We don't want a transaction for this since the discard may take a
5947 * substantial amount of time. We don't require that a transaction be
5948 * running, but we do need to take a running transaction into account
5949 * to ensure that we're not discarding chunks that were released or
5950 * allocated in the current transaction.
5951 *
5952 * Holding the chunks lock will prevent other threads from allocating
5953 * or releasing chunks, but it won't prevent a running transaction
5954 * from committing and releasing the memory that the pending chunks
5955 * list head uses. For that, we need to take a reference to the
5956 * transaction and hold the commit root sem. We only need to hold
5957 * it while performing the free space search since we have already
5958 * held back allocations.
5959 */
5960 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5961 {
5962 u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
5963 int ret;
5964
5965 *trimmed = 0;
5966
5967 /* Discard not supported = nothing to do. */
5968 if (!bdev_max_discard_sectors(device->bdev))
5969 return 0;
5970
5971 /* Not writable = nothing to do. */
5972 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
5973 return 0;
5974
5975 /* No free space = nothing to do.
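 * (bytes_used is the amount of space on this device that is already
 * allocated to chunks).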
*/
5976 if (device->total_bytes <= device->bytes_used)
5977 return 0;
5978
5979 ret = 0;
5980
5981 while (1) {
5982 struct btrfs_fs_info *fs_info = device->fs_info;
5983 u64 bytes;
5984
5985 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5986 if (ret)
5987 break;
5988
5989 find_first_clear_extent_bit(&device->alloc_state, start,
5990 &start, &end,
5991 CHUNK_TRIMMED | CHUNK_ALLOCATED);
5992
5993 /* Check if there are any CHUNK_* bits left */
5994 if (start > device->total_bytes) {
5995 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5996 btrfs_warn_in_rcu(fs_info,
5997 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5998 start, end - start + 1,
5999 btrfs_dev_name(device),
6000 device->total_bytes);
6001 mutex_unlock(&fs_info->chunk_mutex);
6002 ret = 0;
6003 break;
6004 }
6005
6006 /* Ensure we skip the reserved space on each device. */
6007 start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
6008
6009 /*
6010 * If find_first_clear_extent_bit() finds a range that spans the
6011 * end of the device, it will set end to -1; in that case it's up
6012 * to the caller to trim the value to the size of the device.
6013 */
6014 end = min(end, device->total_bytes - 1);
6015
6016 len = end - start + 1;
6017
6018 /* We didn't find any extents */
6019 if (!len) {
6020 mutex_unlock(&fs_info->chunk_mutex);
6021 ret = 0;
6022 break;
6023 }
6024
6025 ret = btrfs_issue_discard(device->bdev, start, len,
6026 &bytes);
6027 if (!ret)
6028 set_extent_bit(&device->alloc_state, start,
6029 start + bytes - 1, CHUNK_TRIMMED, NULL);
6030 mutex_unlock(&fs_info->chunk_mutex);
6031
6032 if (ret)
6033 break;
6034
6035 start += len;
6036 *trimmed += bytes;
6037
6038 if (fatal_signal_pending(current)) {
6039 ret = -ERESTARTSYS;
6040 break;
6041 }
6042
6043 cond_resched();
6044 }
6045
6046 return ret;
6047 }
6048
6049 /*
6050 * Trim the whole filesystem by:
6051 * 1) trimming the free space in each block group
6052 * 2) trimming the unallocated space on each device
6053 *
6054 * This will also continue trimming even if a block group or device encounters
6055 * an error. The return value will be the last error, or 0 if nothing bad
6056 * happens.
6057 */
6058 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
6059 {
6060 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6061 struct btrfs_block_group *cache = NULL;
6062 struct btrfs_device *device;
6063 u64 group_trimmed;
6064 u64 range_end = U64_MAX;
6065 u64 start;
6066 u64 end;
6067 u64 trimmed = 0;
6068 u64 bg_failed = 0;
6069 u64 dev_failed = 0;
6070 int bg_ret = 0;
6071 int dev_ret = 0;
6072 int ret = 0;
6073
6074 if (range->start == U64_MAX)
6075 return -EINVAL;
6076
6077 /*
6078 * Check range overflow if range->len is set.
6079 * The default range->len is U64_MAX.
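 * If range->len is left at U64_MAX, no overflow check is needed and
 * range_end simply stays at U64_MAX, so every block group from
 * range->start onwards is considered.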
6080 */ 6081 if (range->len != U64_MAX && 6082 check_add_overflow(range->start, range->len, &range_end)) 6083 return -EINVAL; 6084 6085 cache = btrfs_lookup_first_block_group(fs_info, range->start); 6086 for (; cache; cache = btrfs_next_block_group(cache)) { 6087 if (cache->start >= range_end) { 6088 btrfs_put_block_group(cache); 6089 break; 6090 } 6091 6092 start = max(range->start, cache->start); 6093 end = min(range_end, cache->start + cache->length); 6094 6095 if (end - start >= range->minlen) { 6096 if (!btrfs_block_group_done(cache)) { 6097 ret = btrfs_cache_block_group(cache, true); 6098 if (ret) { 6099 bg_failed++; 6100 bg_ret = ret; 6101 continue; 6102 } 6103 } 6104 ret = btrfs_trim_block_group(cache, 6105 &group_trimmed, 6106 start, 6107 end, 6108 range->minlen); 6109 6110 trimmed += group_trimmed; 6111 if (ret) { 6112 bg_failed++; 6113 bg_ret = ret; 6114 continue; 6115 } 6116 } 6117 } 6118 6119 if (bg_failed) 6120 btrfs_warn(fs_info, 6121 "failed to trim %llu block group(s), last error %d", 6122 bg_failed, bg_ret); 6123 6124 mutex_lock(&fs_devices->device_list_mutex); 6125 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6126 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 6127 continue; 6128 6129 ret = btrfs_trim_free_extents(device, &group_trimmed); 6130 if (ret) { 6131 dev_failed++; 6132 dev_ret = ret; 6133 break; 6134 } 6135 6136 trimmed += group_trimmed; 6137 } 6138 mutex_unlock(&fs_devices->device_list_mutex); 6139 6140 if (dev_failed) 6141 btrfs_warn(fs_info, 6142 "failed to trim %llu device(s), last error %d", 6143 dev_failed, dev_ret); 6144 range->len = trimmed; 6145 if (bg_ret) 6146 return bg_ret; 6147 return dev_ret; 6148 } 6149
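
/*
 * Illustrative sketch only (not part of the original file): a minimal
 * example of how a caller could drive btrfs_trim_fs() to trim the whole
 * filesystem, roughly what a FITRIM-style caller would do.  The function
 * name example_trim_whole_fs() and the way 'minlen' is chosen are
 * assumptions made for the example; the real FITRIM handling lives
 * outside this file and performs additional permission and device checks.
 */
#if 0	/* example only, never compiled */
static int example_trim_whole_fs(struct btrfs_fs_info *fs_info, u64 minlen)
{
	struct fstrim_range range = {
		.start = 0,		/* begin at the start of the address space */
		.len = U64_MAX,		/* no length limit: consider everything */
		.minlen = minlen,	/* skip free extents smaller than this */
	};
	int ret;

	ret = btrfs_trim_fs(fs_info, &range);
	if (ret)
		return ret;

	/* On return, range.len holds the total number of bytes trimmed. */
	btrfs_info(fs_info, "example trim: %llu bytes trimmed", range.len);
	return 0;
}
#endif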