// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 * 		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
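/*
 * Illustrative sketch only (not real code; all names are defined in this
 * file): roughly how the stages above map onto the functions here, assuming
 * a caller drives them the way the balance/shrink paths do:
 *
 *	mark target block group read-only;	// step 1: no new allocations
 *	while (stage has work)
 *		relocate_block_group(rc);	// steps 2.1-2.3: record extents,
 *						// build reloc trees, relocate
 *	merge_reloc_roots(rc);			// step 2.3: swap with fs trees
 *	clean_dirty_subvols(rc);		// step 3: drop reloc trees
 */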
#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * present a tree block to process
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bit(&rc->processed_blocks, node->bytenr,
			       node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
	}
	node->processed = 1;
}

static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}
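/*
 * Illustrative note: the anonymous leading struct in mapping_node and
 * tree_block mirrors struct rb_simple_node (misc.h), which is what lets the
 * rb_simple_search()/rb_simple_insert() helpers key on @bytenr. A minimal
 * lookup (simplified from find_reloc_root() below, which also takes the
 * tree lock) looks like:
 *
 *	rb_node = rb_simple_search(&tree->rb_root, bytenr);
 *	if (rb_node)
 *		node = rb_entry(rb_node, struct mapping_node, rb_node);
 */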
/*
 * walk up backref nodes until we reach a node that presents a tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find start of next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
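/*
 * Illustrative use of the two walkers (a simplified sketch of the loop in
 * select_one_root() further below): together they enumerate every path from
 * a block up to a tree root, one root candidate at a time:
 *
 *	int index = 0;
 *
 *	next = node;
 *	while (1) {
 *		next = walk_up_backref(next, edges, &index);
 *		root = next->root;	// candidate root for this path
 *		// ... examine root ...
 *		next = walk_down_backref(edges, &index);
 *		if (!next || next->level <= node->level)
 *			break;
 *	}
 */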
static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered as valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the backref
	 * node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					  struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}
/*
 * Build backref tree for a given tree block. Root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process.
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}
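/*
 * Illustrative shape of what the cache ends up holding (hypothetical
 * example): for a block B referenced directly by subvolume tree A and by
 * snapshot tree S through an intermediate node N, the backref "tree"
 * (really a graph) looks like:
 *
 *	root(A)      root(S)	<- leaves of the backref tree: tree roots
 *	    \          |
 *	     \       node(N)
 *	      \       /
 *	       node(B)		<- root of the backref tree: the block
 *
 * Edges point from LOWER (B) to UPPER (N, roots); walk_up_backref() climbs
 * them and walk_down_backref() backtracks to the start of the next path.
 */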
/*
 * helper to add backref node for the newly created snapshot.
 * the backref node is created by cloning backref node that
 * corresponds to root of source tree
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      const struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}
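/*
 * Illustrative summary (simplified, locking elided) of how a reloc root's
 * mapping is keyed over its lifetime by __add_reloc_root() above and
 * __update_reloc_root()/__del_reloc_root() below:
 *
 *	__add_reloc_root(root);		// insert, keyed by commit_root->start
 *	__update_reloc_root(root);	// re-key to root->node->start after
 *					// the root node has changed
 *	__del_reloc_root(root);		// erase and drop the list reference
 */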
/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because whoever is
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the
		 * transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}
/*
 * create reloc tree for a given fs tree. reloc tree is just a
 * snapshot of the fs tree with special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree.
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots; we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}
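/*
 * Reference-counting sketch (illustrative) of the two references the
 * comment above describes:
 *
 *	reloc_root = create_reloc_root(trans, root, objectid);
 *					// one ref from btrfs_read_tree_root()
 *	__add_reloc_root(reloc_root);	// that ref now belongs to the
 *					// rc->reloc_roots list
 *	root->reloc_root = btrfs_grab_root(reloc_root);
 *					// second ref for the fs root pointer
 */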
/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * helper to find first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
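/*
 * Worked example (hypothetical numbers): the data reloc inode's index_cnt
 * is the logical address its file offset 0 corresponds to. If index_cnt is
 * 1G and an old data extent lived at bytenr 1G+4M, the file extent item at
 * file offset 4M in the reloc inode describes the relocated copy, and its
 * disk_bytenr is the new location returned above.
 */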
/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(BTRFS_I(inode));
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end,
						      &cached_state);
				if (!ret)
					continue;

				btrfs_drop_extent_map_range(BTRFS_I(inode),
							    key.offset, end, true);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end, &cached_state);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(trans, leaf);
	if (inode)
		btrfs_add_delayed_iput(BTRFS_I(inode));
	return ret;
}
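/*
 * Illustrative summary of the per-item bookkeeping above: every rewritten
 * file extent item is paired with two delayed ref updates so the extent
 * tree stays consistent with the new pointer:
 *
 *	btrfs_inc_extent_ref(trans, &ref);	// on new_bytenr, now referenced
 *	btrfs_free_extent(trans, &ref);		// on old bytenr, reference gone
 *
 * both initialized with the same owner/offset, and with a parent set only
 * for reloc trees (which always use full backrefs).
 */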
static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(trans, parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(trans, path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
				    true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}
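/*
 * Sketch of one successful replace_path() pass (illustrative): after the
 * pointer swap the fs tree points at the new block and the reloc tree keeps
 * the old one, and four delayed ref updates re-balance ownership:
 *
 *	fs tree slot:    old_bytenr -> new_bytenr
 *	reloc tree slot: new_bytenr -> old_bytenr
 *
 *	inc  ref(old_bytenr, parent = reloc tree node)	// now in reloc tree
 *	inc  ref(new_bytenr, root = fs tree)		// now in fs tree
 *	free ref(new_bytenr, parent = reloc tree node)	// left the reloc tree
 *	free ref(old_bytenr, root = fs tree)		// left the fs tree
 */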
/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose key is in the range of
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		iput(inode);

		if (objectid > max_key->objectid)
			break;

		inode = find_next_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(BTRFS_I(inode));

		if (ino > max_key->objectid) {
			iput(inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
	}
	return 0;
}
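/*
 * Worked example (hypothetical keys): with min_key = (ino, EXTENT_DATA,
 * 4096) and max_key = (ino, EXTENT_DATA, 65536), the loop above locks byte
 * range [4096, 65535] of that inode, drops the extent maps in it, and
 * unlocks - i.e. max_key->offset is exclusive, matching the
 * [min_key, max_key) interval in the comment.
 */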
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root. If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree. Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * doubled since we have two trees to COW (e.g. with the default 16KiB
	 * nodesize and a level 3 root that is 2 * 3 * 16KiB = 96KiB).
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		reloc_root->last_trans = trans->transid;
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case where only one block in the fs tree needs to be
	 * relocated and the block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}
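/*
 * Shape of the merge loop above (illustrative, error handling elided):
 *
 *	while (1) {
 *		refill rc->block_rsv;		// 2 * reserve_level * nodesize
 *		trans = btrfs_start_transaction(root, 0);
 *		walk_down_reloc_tree(...);	// find next changed subtree
 *		replace_path(...);		// swap blocks into the fs tree
 *		walk_up_reloc_tree(...);	// advance to the next slot
 *		save drop_progress in the reloc root item;
 *		btrfs_end_transaction_throttle(trans);
 *	}
 */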
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}

		if (unlikely(root->reloc_root != reloc_root)) {
			if (root->reloc_root) {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
					  root->root_key.objectid,
					  root->reloc_root->root_key.objectid,
					  root->reloc_root->root_key.type,
					  root->reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &root->reloc_root->root_item),
					  reloc_root->root_key.objectid,
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			} else {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
					  root->root_key.objectid,
					  reloc_root->root_key.objectid,
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			}
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(root);
			btrfs_abort_transaction(trans, -EUCLEAN);
			if (!err)
				err = -EUCLEAN;
			break;
		}

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}
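/*
 * Illustrative outline of merge_reloc_roots() below (simplified): the list
 * is spliced away under reloc_mutex so new reloc roots can keep being added
 * while we work, and we loop until a splice comes back empty:
 *
 *	again:
 *		splice rc->reloc_roots onto a private list (reloc_mutex);
 *		for each reloc root: merge_reloc_root() or queue cleanup;
 *		if (we found any)
 *			goto again;
 */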
If we fail to start the transaction in 2025 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 2026 * have actually been removed from the reloc_root_tree rb tree. This is 2027 * fine because we're bailing here, and we hold a reference on the root 2028 * for the list that holds it, so these roots will be cleaned up when we 2029 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 2030 * will be cleaned up on unmount. 2031 * 2032 * The remaining nodes will be cleaned up by free_reloc_control. 2033 */ 2034 } 2035 2036 static void free_block_list(struct rb_root *blocks) 2037 { 2038 struct tree_block *block; 2039 struct rb_node *rb_node; 2040 while ((rb_node = rb_first(blocks))) { 2041 block = rb_entry(rb_node, struct tree_block, rb_node); 2042 rb_erase(rb_node, blocks); 2043 kfree(block); 2044 } 2045 } 2046 2047 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2048 struct btrfs_root *reloc_root) 2049 { 2050 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2051 struct btrfs_root *root; 2052 int ret; 2053 2054 if (reloc_root->last_trans == trans->transid) 2055 return 0; 2056 2057 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); 2058 2059 /* 2060 * This should succeed, since we can't have a reloc root without having 2061 * already looked up the actual root and created the reloc root for this 2062 * root. 2063 * 2064 * However if there's some sort of corruption where we have a ref to a 2065 * reloc root without a corresponding root this could return ENOENT. 2066 */ 2067 if (IS_ERR(root)) { 2068 ASSERT(0); 2069 return PTR_ERR(root); 2070 } 2071 if (root->reloc_root != reloc_root) { 2072 ASSERT(0); 2073 btrfs_err(fs_info, 2074 "root %llu has two reloc roots associated with it", 2075 reloc_root->root_key.offset); 2076 btrfs_put_root(root); 2077 return -EUCLEAN; 2078 } 2079 ret = btrfs_record_root_in_trans(trans, root); 2080 btrfs_put_root(root); 2081 2082 return ret; 2083 } 2084 2085 static noinline_for_stack 2086 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2087 struct reloc_control *rc, 2088 struct btrfs_backref_node *node, 2089 struct btrfs_backref_edge *edges[]) 2090 { 2091 struct btrfs_backref_node *next; 2092 struct btrfs_root *root; 2093 int index = 0; 2094 int ret; 2095 2096 next = node; 2097 while (1) { 2098 cond_resched(); 2099 next = walk_up_backref(next, edges, &index); 2100 root = next->root; 2101 2102 /* 2103 * If there is no root, then our references for this block are 2104 * incomplete, as we should be able to walk all the way up to a 2105 * block that is owned by a root. 2106 * 2107 * This path is only for SHAREABLE roots, so if we come upon a 2108 * non-SHAREABLE root then we have backrefs that resolve 2109 * improperly. 2110 * 2111 * Both of these cases indicate file system corruption, or a bug 2112 * in the backref walking code. 
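		 * In both cases the ASSERT() below is for developer builds,
		 * while the -EUCLEAN return errors out gracefully for a
		 * normal user hitting corruption.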
		 */
		if (!root) {
			ASSERT(0);
			btrfs_err(trans->fs_info,
		"bytenr %llu doesn't have a backref path ending in a root",
				  node->bytenr);
			return ERR_PTR(-EUCLEAN);
		}
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
			ASSERT(0);
			btrfs_err(trans->fs_info,
	"bytenr %llu has multiple refs with one ending in a non-shareable root",
				  node->bytenr);
			return ERR_PTR(-EUCLEAN);
		}

		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
			ret = record_reloc_root_in_trans(trans, root);
			if (ret)
				return ERR_PTR(ret);
			break;
		}

		ret = btrfs_record_root_in_trans(trans, root);
		if (ret)
			return ERR_PTR(ret);
		root = root->reloc_root;

		/*
		 * We could have raced with another thread which failed, so
		 * root->reloc_root may not be set; return -ENOENT in that
		 * case.
		 */
		if (!root)
			return ERR_PTR(-ENOENT);

		if (next->new_bytenr != root->node->start) {
			/*
			 * We just created the reloc root, so we shouldn't have
			 * ->new_bytenr set and this shouldn't be in the changed
			 *  list.  If it is then we have multiple roots pointing
			 *  at the same bytenr which indicates corruption, or
			 *  we've made a mistake in the backref walking code.
			 */
			ASSERT(next->new_bytenr == 0);
			ASSERT(list_empty(&next->list));
			if (next->new_bytenr || !list_empty(&next->list)) {
				btrfs_err(trans->fs_info,
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
					  node->bytenr, next->bytenr);
				return ERR_PTR(-EUCLEAN);
			}

			next->new_bytenr = root->node->start;
			btrfs_put_root(next->root);
			next->root = btrfs_grab_root(root);
			ASSERT(next->root);
			list_add_tail(&next->list,
				      &rc->backref_cache.changed);
			mark_block_processed(rc, next);
			break;
		}

		WARN_ON(1);
		root = NULL;
		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}
	if (!root) {
		/*
		 * This can happen if there's fs corruption or if there's a bug
		 * in the backref lookup code.
		 */
		ASSERT(0);
		return ERR_PTR(-ENOENT);
	}

	next = node;
	/* setup backref node path for btrfs_reloc_cow_block */
	while (1) {
		rc->backref_cache.path[next->level] = next;
		if (--index < 0)
			break;
		next = edges[index]->node[UPPER];
	}
	return root;
}

/*
 * Select a tree root for relocation.
 *
 * Return NULL if the block is shareable but not the root block of any tree;
 * do_relocation() must be used in that case.
 * Return the root pointer if the block belongs to a non-shareable tree, or is
 * the root block of a shareable tree.
 * Return -ENOENT if the block is only the root block of reloc trees.
 */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	struct btrfs_root *fs_root = NULL;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;

		/*
		 * This can occur if we have incomplete extent refs leading all
		 * the way up a particular path; in this case return -EUCLEAN.
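		 * Note that relocate_tree_block() only special cases -ENOENT
		 * from us; -EUCLEAN is propagated to the caller as a hard
		 * error.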
		 */
		if (!root)
			return ERR_PTR(-EUCLEAN);

		/* No other choice for non-shareable tree */
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
			return root;

		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}

static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
			struct btrfs_backref_node *node, int reserve)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(reserve && node->processed);

	while (next) {
		cond_resched();
		while (1) {
			if (next->processed && (reserve || next != node))
				break;

			num_bytes += fs_info->nodesize;

			if (list_empty(&next->upper))
				break;

			edge = list_entry(next->upper.next,
					  struct btrfs_backref_edge, list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}

static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 num_bytes;
	int ret;
	u64 tmp;

	num_bytes = calcu_metadata_size(rc, node, 1) * 2;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;

	/*
	 * We are under a transaction here so we can only do limited flushing.
	 * If we get an enospc just kick back -EAGAIN so we know to drop the
	 * transaction and try to refill when we can flush all the things.
	 */
	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret) {
		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
		while (tmp <= rc->reserved_bytes)
			tmp <<= 1;
		/*
		 * Only one thread can access block_rsv at this point, so we
		 * don't need to hold a lock to protect it.  We expand the
		 * reservation size here to allow enough space for relocation,
		 * and we will return early in the enospc case.
		 */
		rc->block_rsv->size = tmp + fs_info->nodesize *
				      RELOCATION_RESERVED_NODES;
		return -EAGAIN;
	}

	return 0;
}

/*
 * relocate a tree block, and then update pointers in upper level blocks that
 * reference the block so they point to the new location.
 *
 * if called by link_to_upper, the block has already been relocated.
 * in that case this function just updates pointers.
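 *
 * the 'lowest' argument tells these two cases apart: it is set when the
 * block itself still has to be relocated, and clear when only the pointers
 * in the upper level blocks need to be updated.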
2333 */ 2334 static int do_relocation(struct btrfs_trans_handle *trans, 2335 struct reloc_control *rc, 2336 struct btrfs_backref_node *node, 2337 struct btrfs_key *key, 2338 struct btrfs_path *path, int lowest) 2339 { 2340 struct btrfs_backref_node *upper; 2341 struct btrfs_backref_edge *edge; 2342 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2343 struct btrfs_root *root; 2344 struct extent_buffer *eb; 2345 u32 blocksize; 2346 u64 bytenr; 2347 int slot; 2348 int ret = 0; 2349 2350 /* 2351 * If we are lowest then this is the first time we're processing this 2352 * block, and thus shouldn't have an eb associated with it yet. 2353 */ 2354 ASSERT(!lowest || !node->eb); 2355 2356 path->lowest_level = node->level + 1; 2357 rc->backref_cache.path[node->level] = node; 2358 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2359 struct btrfs_ref ref = { 0 }; 2360 2361 cond_resched(); 2362 2363 upper = edge->node[UPPER]; 2364 root = select_reloc_root(trans, rc, upper, edges); 2365 if (IS_ERR(root)) { 2366 ret = PTR_ERR(root); 2367 goto next; 2368 } 2369 2370 if (upper->eb && !upper->locked) { 2371 if (!lowest) { 2372 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2373 if (ret < 0) 2374 goto next; 2375 BUG_ON(ret); 2376 bytenr = btrfs_node_blockptr(upper->eb, slot); 2377 if (node->eb->start == bytenr) 2378 goto next; 2379 } 2380 btrfs_backref_drop_node_buffer(upper); 2381 } 2382 2383 if (!upper->eb) { 2384 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2385 if (ret) { 2386 if (ret > 0) 2387 ret = -ENOENT; 2388 2389 btrfs_release_path(path); 2390 break; 2391 } 2392 2393 if (!upper->eb) { 2394 upper->eb = path->nodes[upper->level]; 2395 path->nodes[upper->level] = NULL; 2396 } else { 2397 BUG_ON(upper->eb != path->nodes[upper->level]); 2398 } 2399 2400 upper->locked = 1; 2401 path->locks[upper->level] = 0; 2402 2403 slot = path->slots[upper->level]; 2404 btrfs_release_path(path); 2405 } else { 2406 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2407 if (ret < 0) 2408 goto next; 2409 BUG_ON(ret); 2410 } 2411 2412 bytenr = btrfs_node_blockptr(upper->eb, slot); 2413 if (lowest) { 2414 if (bytenr != node->bytenr) { 2415 btrfs_err(root->fs_info, 2416 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2417 bytenr, node->bytenr, slot, 2418 upper->eb->start); 2419 ret = -EIO; 2420 goto next; 2421 } 2422 } else { 2423 if (node->eb->start == bytenr) 2424 goto next; 2425 } 2426 2427 blocksize = root->fs_info->nodesize; 2428 eb = btrfs_read_node_slot(upper->eb, slot); 2429 if (IS_ERR(eb)) { 2430 ret = PTR_ERR(eb); 2431 goto next; 2432 } 2433 btrfs_tree_lock(eb); 2434 2435 if (!node->eb) { 2436 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2437 slot, &eb, BTRFS_NESTING_COW); 2438 btrfs_tree_unlock(eb); 2439 free_extent_buffer(eb); 2440 if (ret < 0) 2441 goto next; 2442 /* 2443 * We've just COWed this block, it should have updated 2444 * the correct backref node entry. 
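			 * (the COW path ends up in btrfs_reloc_cow_block(),
			 * which finds this node via rc->backref_cache.path
			 * and points node->eb at the new buffer)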
			 */
			ASSERT(node->eb == eb);
		} else {
			btrfs_set_node_blockptr(upper->eb, slot,
						node->eb->start);
			btrfs_set_node_ptr_generation(upper->eb, slot,
						      trans->transid);
			btrfs_mark_buffer_dirty(trans, upper->eb);

			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
					       node->eb->start, blocksize,
					       upper->eb->start);
			btrfs_init_tree_ref(&ref, node->level,
					    btrfs_header_owner(upper->eb),
					    root->root_key.objectid, false);
			ret = btrfs_inc_extent_ref(trans, &ref);
			if (!ret)
				ret = btrfs_drop_subtree(trans, root, eb,
							 upper->eb);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
next:
		if (!upper->pending)
			btrfs_backref_drop_node_buffer(upper);
		else
			btrfs_backref_unlock_node_buffer(upper);
		if (ret)
			break;
	}

	if (!ret && node->pending) {
		btrfs_backref_drop_node_buffer(node);
		list_move_tail(&node->list, &rc->backref_cache.changed);
		node->pending = 0;
	}

	path->lowest_level = 0;

	/*
	 * We should have allocated all of our space in the block rsv and thus
	 * shouldn't ENOSPC.
	 */
	ASSERT(ret != -ENOSPC);
	return ret;
}

static int link_to_upper(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_path *path)
{
	struct btrfs_key key;

	btrfs_node_key_to_cpu(node->eb, &key, 0);
	return do_relocation(trans, rc, node, &key, path, 0);
}

static int finish_pending_nodes(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_path *path, int err)
{
	LIST_HEAD(list);
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node;
	int level;
	int ret;

	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		while (!list_empty(&cache->pending[level])) {
			node = list_entry(cache->pending[level].next,
					  struct btrfs_backref_node, list);
			list_move_tail(&node->list, &list);
			BUG_ON(!node->pending);

			if (!err) {
				ret = link_to_upper(trans, rc, node, path);
				if (ret < 0)
					err = ret;
			}
		}
		list_splice_init(&list, &cache->pending[level]);
	}
	return err;
}

/*
 * mark a block and all blocks that directly/indirectly reference the block
 * as processed.
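 *
 * processed blocks are recorded in rc->processed_blocks, which is what
 * tree_block_processed() and find_next_extent() check later on.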
2534 */ 2535 static void update_processed_blocks(struct reloc_control *rc, 2536 struct btrfs_backref_node *node) 2537 { 2538 struct btrfs_backref_node *next = node; 2539 struct btrfs_backref_edge *edge; 2540 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2541 int index = 0; 2542 2543 while (next) { 2544 cond_resched(); 2545 while (1) { 2546 if (next->processed) 2547 break; 2548 2549 mark_block_processed(rc, next); 2550 2551 if (list_empty(&next->upper)) 2552 break; 2553 2554 edge = list_entry(next->upper.next, 2555 struct btrfs_backref_edge, list[LOWER]); 2556 edges[index++] = edge; 2557 next = edge->node[UPPER]; 2558 } 2559 next = walk_down_backref(edges, &index); 2560 } 2561 } 2562 2563 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2564 { 2565 u32 blocksize = rc->extent_root->fs_info->nodesize; 2566 2567 if (test_range_bit(&rc->processed_blocks, bytenr, 2568 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2569 return 1; 2570 return 0; 2571 } 2572 2573 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2574 struct tree_block *block) 2575 { 2576 struct btrfs_tree_parent_check check = { 2577 .level = block->level, 2578 .owner_root = block->owner, 2579 .transid = block->key.offset 2580 }; 2581 struct extent_buffer *eb; 2582 2583 eb = read_tree_block(fs_info, block->bytenr, &check); 2584 if (IS_ERR(eb)) 2585 return PTR_ERR(eb); 2586 if (!extent_buffer_uptodate(eb)) { 2587 free_extent_buffer(eb); 2588 return -EIO; 2589 } 2590 if (block->level == 0) 2591 btrfs_item_key_to_cpu(eb, &block->key, 0); 2592 else 2593 btrfs_node_key_to_cpu(eb, &block->key, 0); 2594 free_extent_buffer(eb); 2595 block->key_ready = 1; 2596 return 0; 2597 } 2598 2599 /* 2600 * helper function to relocate a tree block 2601 */ 2602 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2603 struct reloc_control *rc, 2604 struct btrfs_backref_node *node, 2605 struct btrfs_key *key, 2606 struct btrfs_path *path) 2607 { 2608 struct btrfs_root *root; 2609 int ret = 0; 2610 2611 if (!node) 2612 return 0; 2613 2614 /* 2615 * If we fail here we want to drop our backref_node because we are going 2616 * to start over and regenerate the tree for it. 2617 */ 2618 ret = reserve_metadata_space(trans, rc, node); 2619 if (ret) 2620 goto out; 2621 2622 BUG_ON(node->processed); 2623 root = select_one_root(node); 2624 if (IS_ERR(root)) { 2625 ret = PTR_ERR(root); 2626 2627 /* See explanation in select_one_root for the -EUCLEAN case. */ 2628 ASSERT(ret == -ENOENT); 2629 if (ret == -ENOENT) { 2630 ret = 0; 2631 update_processed_blocks(rc, node); 2632 } 2633 goto out; 2634 } 2635 2636 if (root) { 2637 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2638 /* 2639 * This block was the root block of a root, and this is 2640 * the first time we're processing the block and thus it 2641 * should not have had the ->new_bytenr modified and 2642 * should have not been included on the changed list. 2643 * 2644 * However in the case of corruption we could have 2645 * multiple refs pointing to the same block improperly, 2646 * and thus we would trip over these checks. ASSERT() 2647 * for the developer case, because it could indicate a 2648 * bug in the backref code, however error out for a 2649 * normal user in the case of corruption. 
2650 */ 2651 ASSERT(node->new_bytenr == 0); 2652 ASSERT(list_empty(&node->list)); 2653 if (node->new_bytenr || !list_empty(&node->list)) { 2654 btrfs_err(root->fs_info, 2655 "bytenr %llu has improper references to it", 2656 node->bytenr); 2657 ret = -EUCLEAN; 2658 goto out; 2659 } 2660 ret = btrfs_record_root_in_trans(trans, root); 2661 if (ret) 2662 goto out; 2663 /* 2664 * Another thread could have failed, need to check if we 2665 * have reloc_root actually set. 2666 */ 2667 if (!root->reloc_root) { 2668 ret = -ENOENT; 2669 goto out; 2670 } 2671 root = root->reloc_root; 2672 node->new_bytenr = root->node->start; 2673 btrfs_put_root(node->root); 2674 node->root = btrfs_grab_root(root); 2675 ASSERT(node->root); 2676 list_add_tail(&node->list, &rc->backref_cache.changed); 2677 } else { 2678 path->lowest_level = node->level; 2679 if (root == root->fs_info->chunk_root) 2680 btrfs_reserve_chunk_metadata(trans, false); 2681 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2682 btrfs_release_path(path); 2683 if (root == root->fs_info->chunk_root) 2684 btrfs_trans_release_chunk_metadata(trans); 2685 if (ret > 0) 2686 ret = 0; 2687 } 2688 if (!ret) 2689 update_processed_blocks(rc, node); 2690 } else { 2691 ret = do_relocation(trans, rc, node, key, path, 1); 2692 } 2693 out: 2694 if (ret || node->level == 0 || node->cowonly) 2695 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2696 return ret; 2697 } 2698 2699 /* 2700 * relocate a list of blocks 2701 */ 2702 static noinline_for_stack 2703 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2704 struct reloc_control *rc, struct rb_root *blocks) 2705 { 2706 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2707 struct btrfs_backref_node *node; 2708 struct btrfs_path *path; 2709 struct tree_block *block; 2710 struct tree_block *next; 2711 int ret; 2712 int err = 0; 2713 2714 path = btrfs_alloc_path(); 2715 if (!path) { 2716 err = -ENOMEM; 2717 goto out_free_blocks; 2718 } 2719 2720 /* Kick in readahead for tree blocks with missing keys */ 2721 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2722 if (!block->key_ready) 2723 btrfs_readahead_tree_block(fs_info, block->bytenr, 2724 block->owner, 0, 2725 block->level); 2726 } 2727 2728 /* Get first keys */ 2729 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2730 if (!block->key_ready) { 2731 err = get_tree_block_key(fs_info, block); 2732 if (err) 2733 goto out_free_path; 2734 } 2735 } 2736 2737 /* Do tree relocation */ 2738 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2739 node = build_backref_tree(trans, rc, &block->key, 2740 block->level, block->bytenr); 2741 if (IS_ERR(node)) { 2742 err = PTR_ERR(node); 2743 goto out; 2744 } 2745 2746 ret = relocate_tree_block(trans, rc, node, &block->key, 2747 path); 2748 if (ret < 0) { 2749 err = ret; 2750 break; 2751 } 2752 } 2753 out: 2754 err = finish_pending_nodes(trans, rc, path, err); 2755 2756 out_free_path: 2757 btrfs_free_path(path); 2758 out_free_blocks: 2759 free_block_list(blocks); 2760 return err; 2761 } 2762 2763 static noinline_for_stack int prealloc_file_extent_cluster( 2764 struct btrfs_inode *inode, 2765 const struct file_extent_cluster *cluster) 2766 { 2767 u64 alloc_hint = 0; 2768 u64 start; 2769 u64 end; 2770 u64 offset = inode->index_cnt; 2771 u64 num_bytes; 2772 int nr; 2773 int ret = 0; 2774 u64 i_size = i_size_read(&inode->vfs_inode); 2775 u64 prealloc_start = cluster->start - offset; 2776 u64 prealloc_end = cluster->end - offset; 2777 u64 cur_offset 
= prealloc_start;

	/*
	 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
	 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
	 * the btrfs_do_readpage() call of a previously relocated file cluster.
	 *
	 * If the current cluster starts in the above range, btrfs_do_readpage()
	 * will skip the read, and relocate_one_page() will later write back
	 * the padding zeros as new data, causing data corruption.
	 *
	 * Here we have to manually invalidate the range [i_size, PAGE_END + 1).
	 */
	if (!PAGE_ALIGNED(i_size)) {
		struct address_space *mapping = inode->vfs_inode.i_mapping;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		const u32 sectorsize = fs_info->sectorsize;
		struct page *page;

		ASSERT(sectorsize < PAGE_SIZE);
		ASSERT(IS_ALIGNED(i_size, sectorsize));

		/*
		 * Subpage can't handle page with DIRTY but without UPTODATE
		 * bit as it can lead to the following deadlock:
		 *
		 * btrfs_read_folio()
		 * | Page already *locked*
		 * |- btrfs_lock_and_flush_ordered_range()
		 *    |- btrfs_start_ordered_extent()
		 *       |- extent_write_cache_pages()
		 *          |- lock_page()
		 *             We try to lock the page we already hold.
		 *
		 * Here we just write back the whole data reloc inode, so that
		 * we are guaranteed to have no dirty range in the page, and
		 * are safe to clear the uptodate bits.
		 *
		 * This shouldn't cause too much overhead, as we need to write
		 * the data back anyway.
		 */
		ret = filemap_write_and_wait(mapping);
		if (ret < 0)
			return ret;

		clear_extent_bits(&inode->io_tree, i_size,
				  round_up(i_size, PAGE_SIZE) - 1,
				  EXTENT_UPTODATE);
		page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
		/*
		 * If page is freed we don't need to do anything then, as we
		 * will re-read the whole page anyway.
2829 */ 2830 if (page) { 2831 btrfs_subpage_clear_uptodate(fs_info, page, i_size, 2832 round_up(i_size, PAGE_SIZE) - i_size); 2833 unlock_page(page); 2834 put_page(page); 2835 } 2836 } 2837 2838 BUG_ON(cluster->start != cluster->boundary[0]); 2839 ret = btrfs_alloc_data_chunk_ondemand(inode, 2840 prealloc_end + 1 - prealloc_start); 2841 if (ret) 2842 return ret; 2843 2844 btrfs_inode_lock(inode, 0); 2845 for (nr = 0; nr < cluster->nr; nr++) { 2846 struct extent_state *cached_state = NULL; 2847 2848 start = cluster->boundary[nr] - offset; 2849 if (nr + 1 < cluster->nr) 2850 end = cluster->boundary[nr + 1] - 1 - offset; 2851 else 2852 end = cluster->end - offset; 2853 2854 lock_extent(&inode->io_tree, start, end, &cached_state); 2855 num_bytes = end + 1 - start; 2856 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2857 num_bytes, num_bytes, 2858 end + 1, &alloc_hint); 2859 cur_offset = end + 1; 2860 unlock_extent(&inode->io_tree, start, end, &cached_state); 2861 if (ret) 2862 break; 2863 } 2864 btrfs_inode_unlock(inode, 0); 2865 2866 if (cur_offset < prealloc_end) 2867 btrfs_free_reserved_data_space_noquota(inode->root->fs_info, 2868 prealloc_end + 1 - cur_offset); 2869 return ret; 2870 } 2871 2872 static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, 2873 u64 start, u64 end, u64 block_start) 2874 { 2875 struct extent_map *em; 2876 struct extent_state *cached_state = NULL; 2877 int ret = 0; 2878 2879 em = alloc_extent_map(); 2880 if (!em) 2881 return -ENOMEM; 2882 2883 em->start = start; 2884 em->len = end + 1 - start; 2885 em->block_len = em->len; 2886 em->block_start = block_start; 2887 set_bit(EXTENT_FLAG_PINNED, &em->flags); 2888 2889 lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); 2890 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); 2891 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); 2892 free_extent_map(em); 2893 2894 return ret; 2895 } 2896 2897 /* 2898 * Allow error injection to test balance/relocation cancellation 2899 */ 2900 noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) 2901 { 2902 return atomic_read(&fs_info->balance_cancel_req) || 2903 atomic_read(&fs_info->reloc_cancel_req) || 2904 fatal_signal_pending(current); 2905 } 2906 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2907 2908 static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, 2909 int cluster_nr) 2910 { 2911 /* Last extent, use cluster end directly */ 2912 if (cluster_nr >= cluster->nr - 1) 2913 return cluster->end; 2914 2915 /* Use next boundary start*/ 2916 return cluster->boundary[cluster_nr + 1] - 1; 2917 } 2918 2919 static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, 2920 const struct file_extent_cluster *cluster, 2921 int *cluster_nr, unsigned long page_index) 2922 { 2923 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2924 u64 offset = BTRFS_I(inode)->index_cnt; 2925 const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; 2926 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2927 struct page *page; 2928 u64 page_start; 2929 u64 page_end; 2930 u64 cur; 2931 int ret; 2932 2933 ASSERT(page_index <= last_index); 2934 page = find_lock_page(inode->i_mapping, page_index); 2935 if (!page) { 2936 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 2937 page_index, last_index + 1 - page_index); 2938 page = find_or_create_page(inode->i_mapping, page_index, mask); 2939 if (!page) 2940 return -ENOMEM; 2941 } 
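
	/* Either way we now hold a locked page for page_index. */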
2942 2943 if (PageReadahead(page)) 2944 page_cache_async_readahead(inode->i_mapping, ra, NULL, 2945 page_folio(page), page_index, 2946 last_index + 1 - page_index); 2947 2948 if (!PageUptodate(page)) { 2949 btrfs_read_folio(NULL, page_folio(page)); 2950 lock_page(page); 2951 if (!PageUptodate(page)) { 2952 ret = -EIO; 2953 goto release_page; 2954 } 2955 } 2956 2957 /* 2958 * We could have lost page private when we dropped the lock to read the 2959 * page above, make sure we set_page_extent_mapped here so we have any 2960 * of the subpage blocksize stuff we need in place. 2961 */ 2962 ret = set_page_extent_mapped(page); 2963 if (ret < 0) 2964 goto release_page; 2965 2966 page_start = page_offset(page); 2967 page_end = page_start + PAGE_SIZE - 1; 2968 2969 /* 2970 * Start from the cluster, as for subpage case, the cluster can start 2971 * inside the page. 2972 */ 2973 cur = max(page_start, cluster->boundary[*cluster_nr] - offset); 2974 while (cur <= page_end) { 2975 struct extent_state *cached_state = NULL; 2976 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2977 u64 extent_end = get_cluster_boundary_end(cluster, 2978 *cluster_nr) - offset; 2979 u64 clamped_start = max(page_start, extent_start); 2980 u64 clamped_end = min(page_end, extent_end); 2981 u32 clamped_len = clamped_end + 1 - clamped_start; 2982 2983 /* Reserve metadata for this range */ 2984 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2985 clamped_len, clamped_len, 2986 false); 2987 if (ret) 2988 goto release_page; 2989 2990 /* Mark the range delalloc and dirty for later writeback */ 2991 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 2992 &cached_state); 2993 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 2994 clamped_end, 0, &cached_state); 2995 if (ret) { 2996 clear_extent_bit(&BTRFS_I(inode)->io_tree, 2997 clamped_start, clamped_end, 2998 EXTENT_LOCKED | EXTENT_BOUNDARY, 2999 &cached_state); 3000 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3001 clamped_len, true); 3002 btrfs_delalloc_release_extents(BTRFS_I(inode), 3003 clamped_len); 3004 goto release_page; 3005 } 3006 btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); 3007 3008 /* 3009 * Set the boundary if it's inside the page. 3010 * Data relocation requires the destination extents to have the 3011 * same size as the source. 3012 * EXTENT_BOUNDARY bit prevents current extent from being merged 3013 * with previous extent. 3014 */ 3015 if (in_range(cluster->boundary[*cluster_nr] - offset, 3016 page_start, PAGE_SIZE)) { 3017 u64 boundary_start = cluster->boundary[*cluster_nr] - 3018 offset; 3019 u64 boundary_end = boundary_start + 3020 fs_info->sectorsize - 1; 3021 3022 set_extent_bit(&BTRFS_I(inode)->io_tree, 3023 boundary_start, boundary_end, 3024 EXTENT_BOUNDARY, NULL); 3025 } 3026 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 3027 &cached_state); 3028 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 3029 cur += clamped_len; 3030 3031 /* Crossed extent end, go to next extent */ 3032 if (cur >= extent_end) { 3033 (*cluster_nr)++; 3034 /* Just finished the last extent of the cluster, exit. 
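			 * The rest of the page, if any, is past the cluster
			 * end and is not touched here.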
*/ 3035 if (*cluster_nr >= cluster->nr) 3036 break; 3037 } 3038 } 3039 unlock_page(page); 3040 put_page(page); 3041 3042 balance_dirty_pages_ratelimited(inode->i_mapping); 3043 btrfs_throttle(fs_info); 3044 if (btrfs_should_cancel_balance(fs_info)) 3045 ret = -ECANCELED; 3046 return ret; 3047 3048 release_page: 3049 unlock_page(page); 3050 put_page(page); 3051 return ret; 3052 } 3053 3054 static int relocate_file_extent_cluster(struct inode *inode, 3055 const struct file_extent_cluster *cluster) 3056 { 3057 u64 offset = BTRFS_I(inode)->index_cnt; 3058 unsigned long index; 3059 unsigned long last_index; 3060 struct file_ra_state *ra; 3061 int cluster_nr = 0; 3062 int ret = 0; 3063 3064 if (!cluster->nr) 3065 return 0; 3066 3067 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3068 if (!ra) 3069 return -ENOMEM; 3070 3071 ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); 3072 if (ret) 3073 goto out; 3074 3075 file_ra_state_init(ra, inode->i_mapping); 3076 3077 ret = setup_relocation_extent_mapping(inode, cluster->start - offset, 3078 cluster->end - offset, cluster->start); 3079 if (ret) 3080 goto out; 3081 3082 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3083 for (index = (cluster->start - offset) >> PAGE_SHIFT; 3084 index <= last_index && !ret; index++) 3085 ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); 3086 if (ret == 0) 3087 WARN_ON(cluster_nr != cluster->nr); 3088 out: 3089 kfree(ra); 3090 return ret; 3091 } 3092 3093 static noinline_for_stack int relocate_data_extent(struct inode *inode, 3094 const struct btrfs_key *extent_key, 3095 struct file_extent_cluster *cluster) 3096 { 3097 int ret; 3098 3099 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3100 ret = relocate_file_extent_cluster(inode, cluster); 3101 if (ret) 3102 return ret; 3103 cluster->nr = 0; 3104 } 3105 3106 if (!cluster->nr) 3107 cluster->start = extent_key->objectid; 3108 else 3109 BUG_ON(cluster->nr >= MAX_EXTENTS); 3110 cluster->end = extent_key->objectid + extent_key->offset - 1; 3111 cluster->boundary[cluster->nr] = extent_key->objectid; 3112 cluster->nr++; 3113 3114 if (cluster->nr >= MAX_EXTENTS) { 3115 ret = relocate_file_extent_cluster(inode, cluster); 3116 if (ret) 3117 return ret; 3118 cluster->nr = 0; 3119 } 3120 return 0; 3121 } 3122 3123 /* 3124 * helper to add a tree block to the list. 
3125 * the major work is getting the generation and level of the block 3126 */ 3127 static int add_tree_block(struct reloc_control *rc, 3128 const struct btrfs_key *extent_key, 3129 struct btrfs_path *path, 3130 struct rb_root *blocks) 3131 { 3132 struct extent_buffer *eb; 3133 struct btrfs_extent_item *ei; 3134 struct btrfs_tree_block_info *bi; 3135 struct tree_block *block; 3136 struct rb_node *rb_node; 3137 u32 item_size; 3138 int level = -1; 3139 u64 generation; 3140 u64 owner = 0; 3141 3142 eb = path->nodes[0]; 3143 item_size = btrfs_item_size(eb, path->slots[0]); 3144 3145 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3146 item_size >= sizeof(*ei) + sizeof(*bi)) { 3147 unsigned long ptr = 0, end; 3148 3149 ei = btrfs_item_ptr(eb, path->slots[0], 3150 struct btrfs_extent_item); 3151 end = (unsigned long)ei + item_size; 3152 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3153 bi = (struct btrfs_tree_block_info *)(ei + 1); 3154 level = btrfs_tree_block_level(eb, bi); 3155 ptr = (unsigned long)(bi + 1); 3156 } else { 3157 level = (int)extent_key->offset; 3158 ptr = (unsigned long)(ei + 1); 3159 } 3160 generation = btrfs_extent_generation(eb, ei); 3161 3162 /* 3163 * We're reading random blocks without knowing their owner ahead 3164 * of time. This is ok most of the time, as all reloc roots and 3165 * fs roots have the same lock type. However normal trees do 3166 * not, and the only way to know ahead of time is to read the 3167 * inline ref offset. We know it's an fs root if 3168 * 3169 * 1. There's more than one ref. 3170 * 2. There's a SHARED_DATA_REF_KEY set. 3171 * 3. FULL_BACKREF is set on the flags. 3172 * 3173 * Otherwise it's safe to assume that the ref offset == the 3174 * owner of this block, so we can use that when calling 3175 * read_tree_block. 
3176 */ 3177 if (btrfs_extent_refs(eb, ei) == 1 && 3178 !(btrfs_extent_flags(eb, ei) & 3179 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3180 ptr < end) { 3181 struct btrfs_extent_inline_ref *iref; 3182 int type; 3183 3184 iref = (struct btrfs_extent_inline_ref *)ptr; 3185 type = btrfs_get_extent_inline_ref_type(eb, iref, 3186 BTRFS_REF_TYPE_BLOCK); 3187 if (type == BTRFS_REF_TYPE_INVALID) 3188 return -EINVAL; 3189 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3190 owner = btrfs_extent_inline_ref_offset(eb, iref); 3191 } 3192 } else { 3193 btrfs_print_leaf(eb); 3194 btrfs_err(rc->block_group->fs_info, 3195 "unrecognized tree backref at tree block %llu slot %u", 3196 eb->start, path->slots[0]); 3197 btrfs_release_path(path); 3198 return -EUCLEAN; 3199 } 3200 3201 btrfs_release_path(path); 3202 3203 BUG_ON(level == -1); 3204 3205 block = kmalloc(sizeof(*block), GFP_NOFS); 3206 if (!block) 3207 return -ENOMEM; 3208 3209 block->bytenr = extent_key->objectid; 3210 block->key.objectid = rc->extent_root->fs_info->nodesize; 3211 block->key.offset = generation; 3212 block->level = level; 3213 block->key_ready = 0; 3214 block->owner = owner; 3215 3216 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3217 if (rb_node) 3218 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3219 -EEXIST); 3220 3221 return 0; 3222 } 3223 3224 /* 3225 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3226 */ 3227 static int __add_tree_block(struct reloc_control *rc, 3228 u64 bytenr, u32 blocksize, 3229 struct rb_root *blocks) 3230 { 3231 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3232 struct btrfs_path *path; 3233 struct btrfs_key key; 3234 int ret; 3235 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3236 3237 if (tree_block_processed(bytenr, rc)) 3238 return 0; 3239 3240 if (rb_simple_search(blocks, bytenr)) 3241 return 0; 3242 3243 path = btrfs_alloc_path(); 3244 if (!path) 3245 return -ENOMEM; 3246 again: 3247 key.objectid = bytenr; 3248 if (skinny) { 3249 key.type = BTRFS_METADATA_ITEM_KEY; 3250 key.offset = (u64)-1; 3251 } else { 3252 key.type = BTRFS_EXTENT_ITEM_KEY; 3253 key.offset = blocksize; 3254 } 3255 3256 path->search_commit_root = 1; 3257 path->skip_locking = 1; 3258 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3259 if (ret < 0) 3260 goto out; 3261 3262 if (ret > 0 && skinny) { 3263 if (path->slots[0]) { 3264 path->slots[0]--; 3265 btrfs_item_key_to_cpu(path->nodes[0], &key, 3266 path->slots[0]); 3267 if (key.objectid == bytenr && 3268 (key.type == BTRFS_METADATA_ITEM_KEY || 3269 (key.type == BTRFS_EXTENT_ITEM_KEY && 3270 key.offset == blocksize))) 3271 ret = 0; 3272 } 3273 3274 if (ret) { 3275 skinny = false; 3276 btrfs_release_path(path); 3277 goto again; 3278 } 3279 } 3280 if (ret) { 3281 ASSERT(ret == 1); 3282 btrfs_print_leaf(path->nodes[0]); 3283 btrfs_err(fs_info, 3284 "tree block extent item (%llu) is not found in extent tree", 3285 bytenr); 3286 WARN_ON(1); 3287 ret = -EINVAL; 3288 goto out; 3289 } 3290 3291 ret = add_tree_block(rc, &key, path, blocks); 3292 out: 3293 btrfs_free_path(path); 3294 return ret; 3295 } 3296 3297 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3298 struct btrfs_block_group *block_group, 3299 struct inode *inode, 3300 u64 ino) 3301 { 3302 struct btrfs_root *root = fs_info->tree_root; 3303 struct btrfs_trans_handle *trans; 3304 int ret = 0; 3305 3306 if (inode) 3307 goto truncate; 3308 3309 inode = btrfs_iget(fs_info->sb, ino, root); 3310 if (IS_ERR(inode)) 3311 return 
-ENOENT; 3312 3313 truncate: 3314 ret = btrfs_check_trunc_cache_free_space(fs_info, 3315 &fs_info->global_block_rsv); 3316 if (ret) 3317 goto out; 3318 3319 trans = btrfs_join_transaction(root); 3320 if (IS_ERR(trans)) { 3321 ret = PTR_ERR(trans); 3322 goto out; 3323 } 3324 3325 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3326 3327 btrfs_end_transaction(trans); 3328 btrfs_btree_balance_dirty(fs_info); 3329 out: 3330 iput(inode); 3331 return ret; 3332 } 3333 3334 /* 3335 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3336 * cache inode, to avoid free space cache data extent blocking data relocation. 3337 */ 3338 static int delete_v1_space_cache(struct extent_buffer *leaf, 3339 struct btrfs_block_group *block_group, 3340 u64 data_bytenr) 3341 { 3342 u64 space_cache_ino; 3343 struct btrfs_file_extent_item *ei; 3344 struct btrfs_key key; 3345 bool found = false; 3346 int i; 3347 int ret; 3348 3349 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3350 return 0; 3351 3352 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3353 u8 type; 3354 3355 btrfs_item_key_to_cpu(leaf, &key, i); 3356 if (key.type != BTRFS_EXTENT_DATA_KEY) 3357 continue; 3358 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3359 type = btrfs_file_extent_type(leaf, ei); 3360 3361 if ((type == BTRFS_FILE_EXTENT_REG || 3362 type == BTRFS_FILE_EXTENT_PREALLOC) && 3363 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3364 found = true; 3365 space_cache_ino = key.objectid; 3366 break; 3367 } 3368 } 3369 if (!found) 3370 return -ENOENT; 3371 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3372 space_cache_ino); 3373 return ret; 3374 } 3375 3376 /* 3377 * helper to find all tree blocks that reference a given data extent 3378 */ 3379 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3380 const struct btrfs_key *extent_key, 3381 struct btrfs_path *path, 3382 struct rb_root *blocks) 3383 { 3384 struct btrfs_backref_walk_ctx ctx = { 0 }; 3385 struct ulist_iterator leaf_uiter; 3386 struct ulist_node *ref_node = NULL; 3387 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3388 int ret = 0; 3389 3390 btrfs_release_path(path); 3391 3392 ctx.bytenr = extent_key->objectid; 3393 ctx.skip_inode_ref_list = true; 3394 ctx.fs_info = rc->extent_root->fs_info; 3395 3396 ret = btrfs_find_all_leafs(&ctx); 3397 if (ret < 0) 3398 return ret; 3399 3400 ULIST_ITER_INIT(&leaf_uiter); 3401 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3402 struct btrfs_tree_parent_check check = { 0 }; 3403 struct extent_buffer *eb; 3404 3405 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3406 if (IS_ERR(eb)) { 3407 ret = PTR_ERR(eb); 3408 break; 3409 } 3410 ret = delete_v1_space_cache(eb, rc->block_group, 3411 extent_key->objectid); 3412 free_extent_buffer(eb); 3413 if (ret < 0) 3414 break; 3415 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3416 if (ret < 0) 3417 break; 3418 } 3419 if (ret < 0) 3420 free_block_list(blocks); 3421 ulist_free(ctx.refs); 3422 return ret; 3423 } 3424 3425 /* 3426 * helper to find next unprocessed extent 3427 */ 3428 static noinline_for_stack 3429 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3430 struct btrfs_key *extent_key) 3431 { 3432 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3433 struct btrfs_key key; 3434 struct extent_buffer *leaf; 3435 u64 start, end, last; 3436 int ret; 3437 3438 last = rc->block_group->start + 
rc->block_group->length; 3439 while (1) { 3440 bool block_found; 3441 3442 cond_resched(); 3443 if (rc->search_start >= last) { 3444 ret = 1; 3445 break; 3446 } 3447 3448 key.objectid = rc->search_start; 3449 key.type = BTRFS_EXTENT_ITEM_KEY; 3450 key.offset = 0; 3451 3452 path->search_commit_root = 1; 3453 path->skip_locking = 1; 3454 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3455 0, 0); 3456 if (ret < 0) 3457 break; 3458 next: 3459 leaf = path->nodes[0]; 3460 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3461 ret = btrfs_next_leaf(rc->extent_root, path); 3462 if (ret != 0) 3463 break; 3464 leaf = path->nodes[0]; 3465 } 3466 3467 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3468 if (key.objectid >= last) { 3469 ret = 1; 3470 break; 3471 } 3472 3473 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3474 key.type != BTRFS_METADATA_ITEM_KEY) { 3475 path->slots[0]++; 3476 goto next; 3477 } 3478 3479 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3480 key.objectid + key.offset <= rc->search_start) { 3481 path->slots[0]++; 3482 goto next; 3483 } 3484 3485 if (key.type == BTRFS_METADATA_ITEM_KEY && 3486 key.objectid + fs_info->nodesize <= 3487 rc->search_start) { 3488 path->slots[0]++; 3489 goto next; 3490 } 3491 3492 block_found = find_first_extent_bit(&rc->processed_blocks, 3493 key.objectid, &start, &end, 3494 EXTENT_DIRTY, NULL); 3495 3496 if (block_found && start <= key.objectid) { 3497 btrfs_release_path(path); 3498 rc->search_start = end + 1; 3499 } else { 3500 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3501 rc->search_start = key.objectid + key.offset; 3502 else 3503 rc->search_start = key.objectid + 3504 fs_info->nodesize; 3505 memcpy(extent_key, &key, sizeof(key)); 3506 return 0; 3507 } 3508 } 3509 btrfs_release_path(path); 3510 return ret; 3511 } 3512 3513 static void set_reloc_control(struct reloc_control *rc) 3514 { 3515 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3516 3517 mutex_lock(&fs_info->reloc_mutex); 3518 fs_info->reloc_ctl = rc; 3519 mutex_unlock(&fs_info->reloc_mutex); 3520 } 3521 3522 static void unset_reloc_control(struct reloc_control *rc) 3523 { 3524 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3525 3526 mutex_lock(&fs_info->reloc_mutex); 3527 fs_info->reloc_ctl = NULL; 3528 mutex_unlock(&fs_info->reloc_mutex); 3529 } 3530 3531 static noinline_for_stack 3532 int prepare_to_relocate(struct reloc_control *rc) 3533 { 3534 struct btrfs_trans_handle *trans; 3535 int ret; 3536 3537 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3538 BTRFS_BLOCK_RSV_TEMP); 3539 if (!rc->block_rsv) 3540 return -ENOMEM; 3541 3542 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3543 rc->search_start = rc->block_group->start; 3544 rc->extents_found = 0; 3545 rc->nodes_relocated = 0; 3546 rc->merging_rsv_size = 0; 3547 rc->reserved_bytes = 0; 3548 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3549 RELOCATION_RESERVED_NODES; 3550 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3551 rc->block_rsv, rc->block_rsv->size, 3552 BTRFS_RESERVE_FLUSH_ALL); 3553 if (ret) 3554 return ret; 3555 3556 rc->create_reloc_tree = 1; 3557 set_reloc_control(rc); 3558 3559 trans = btrfs_join_transaction(rc->extent_root); 3560 if (IS_ERR(trans)) { 3561 unset_reloc_control(rc); 3562 /* 3563 * extent tree is not a ref_cow tree and has no reloc_root to 3564 * cleanup. And callers are responsible to free the above 3565 * block rsv. 
3566 */ 3567 return PTR_ERR(trans); 3568 } 3569 3570 ret = btrfs_commit_transaction(trans); 3571 if (ret) 3572 unset_reloc_control(rc); 3573 3574 return ret; 3575 } 3576 3577 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3578 { 3579 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3580 struct rb_root blocks = RB_ROOT; 3581 struct btrfs_key key; 3582 struct btrfs_trans_handle *trans = NULL; 3583 struct btrfs_path *path; 3584 struct btrfs_extent_item *ei; 3585 u64 flags; 3586 int ret; 3587 int err = 0; 3588 int progress = 0; 3589 3590 path = btrfs_alloc_path(); 3591 if (!path) 3592 return -ENOMEM; 3593 path->reada = READA_FORWARD; 3594 3595 ret = prepare_to_relocate(rc); 3596 if (ret) { 3597 err = ret; 3598 goto out_free; 3599 } 3600 3601 while (1) { 3602 rc->reserved_bytes = 0; 3603 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3604 rc->block_rsv->size, 3605 BTRFS_RESERVE_FLUSH_ALL); 3606 if (ret) { 3607 err = ret; 3608 break; 3609 } 3610 progress++; 3611 trans = btrfs_start_transaction(rc->extent_root, 0); 3612 if (IS_ERR(trans)) { 3613 err = PTR_ERR(trans); 3614 trans = NULL; 3615 break; 3616 } 3617 restart: 3618 if (rc->backref_cache.last_trans != trans->transid) 3619 btrfs_backref_release_cache(&rc->backref_cache); 3620 rc->backref_cache.last_trans = trans->transid; 3621 3622 ret = find_next_extent(rc, path, &key); 3623 if (ret < 0) 3624 err = ret; 3625 if (ret != 0) 3626 break; 3627 3628 rc->extents_found++; 3629 3630 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3631 struct btrfs_extent_item); 3632 flags = btrfs_extent_flags(path->nodes[0], ei); 3633 3634 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3635 ret = add_tree_block(rc, &key, path, &blocks); 3636 } else if (rc->stage == UPDATE_DATA_PTRS && 3637 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3638 ret = add_data_references(rc, &key, path, &blocks); 3639 } else { 3640 btrfs_release_path(path); 3641 ret = 0; 3642 } 3643 if (ret < 0) { 3644 err = ret; 3645 break; 3646 } 3647 3648 if (!RB_EMPTY_ROOT(&blocks)) { 3649 ret = relocate_tree_blocks(trans, rc, &blocks); 3650 if (ret < 0) { 3651 if (ret != -EAGAIN) { 3652 err = ret; 3653 break; 3654 } 3655 rc->extents_found--; 3656 rc->search_start = key.objectid; 3657 } 3658 } 3659 3660 btrfs_end_transaction_throttle(trans); 3661 btrfs_btree_balance_dirty(fs_info); 3662 trans = NULL; 3663 3664 if (rc->stage == MOVE_DATA_EXTENTS && 3665 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3666 rc->found_file_extent = 1; 3667 ret = relocate_data_extent(rc->data_inode, 3668 &key, &rc->cluster); 3669 if (ret < 0) { 3670 err = ret; 3671 break; 3672 } 3673 } 3674 if (btrfs_should_cancel_balance(fs_info)) { 3675 err = -ECANCELED; 3676 break; 3677 } 3678 } 3679 if (trans && progress && err == -ENOSPC) { 3680 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3681 if (ret == 1) { 3682 err = 0; 3683 progress = 0; 3684 goto restart; 3685 } 3686 } 3687 3688 btrfs_release_path(path); 3689 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 3690 3691 if (trans) { 3692 btrfs_end_transaction_throttle(trans); 3693 btrfs_btree_balance_dirty(fs_info); 3694 } 3695 3696 if (!err) { 3697 ret = relocate_file_extent_cluster(rc->data_inode, 3698 &rc->cluster); 3699 if (ret < 0) 3700 err = ret; 3701 } 3702 3703 rc->create_reloc_tree = 0; 3704 set_reloc_control(rc); 3705 3706 btrfs_backref_release_cache(&rc->backref_cache); 3707 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3708 3709 /* 3710 * Even in the case when the relocation is cancelled, we 
should still go
	 * through prepare_to_merge() and merge_reloc_roots().
	 *
	 * For error (including cancelled balance), prepare_to_merge() will
	 * mark all reloc trees orphan, then queue them for cleanup in
	 * merge_reloc_roots().
	 */
	err = prepare_to_merge(rc, err);

	merge_reloc_roots(rc);

	rc->merge_reloc_tree = 0;
	unset_reloc_control(rc);
	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

	/* get rid of pinned extents */
	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}
	ret = btrfs_commit_transaction(trans);
	if (ret && !err)
		err = ret;
out_free:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
	btrfs_free_block_rsv(fs_info, rc->block_rsv);
	btrfs_free_path(path);
	return err;
}

static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_inode_item *item;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
	btrfs_set_inode_generation(leaf, item, 1);
	btrfs_set_inode_size(leaf, item, 0);
	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
					  BTRFS_INODE_PREALLOC);
	btrfs_mark_buffer_dirty(trans, leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static void delete_orphan_inode(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}
	ret = btrfs_del_item(trans, root, path);
out:
	if (ret)
		btrfs_abort_transaction(trans, ret);
	btrfs_free_path(path);
}

/*
 * helper to create inode for data relocation.
3804 * the inode is in data relocation tree and its link count is 0 3805 */ 3806 static noinline_for_stack struct inode *create_reloc_inode( 3807 struct btrfs_fs_info *fs_info, 3808 const struct btrfs_block_group *group) 3809 { 3810 struct inode *inode = NULL; 3811 struct btrfs_trans_handle *trans; 3812 struct btrfs_root *root; 3813 u64 objectid; 3814 int err = 0; 3815 3816 root = btrfs_grab_root(fs_info->data_reloc_root); 3817 trans = btrfs_start_transaction(root, 6); 3818 if (IS_ERR(trans)) { 3819 btrfs_put_root(root); 3820 return ERR_CAST(trans); 3821 } 3822 3823 err = btrfs_get_free_objectid(root, &objectid); 3824 if (err) 3825 goto out; 3826 3827 err = __insert_orphan_inode(trans, root, objectid); 3828 if (err) 3829 goto out; 3830 3831 inode = btrfs_iget(fs_info->sb, objectid, root); 3832 if (IS_ERR(inode)) { 3833 delete_orphan_inode(trans, root, objectid); 3834 err = PTR_ERR(inode); 3835 inode = NULL; 3836 goto out; 3837 } 3838 BTRFS_I(inode)->index_cnt = group->start; 3839 3840 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 3841 out: 3842 btrfs_put_root(root); 3843 btrfs_end_transaction(trans); 3844 btrfs_btree_balance_dirty(fs_info); 3845 if (err) { 3846 iput(inode); 3847 inode = ERR_PTR(err); 3848 } 3849 return inode; 3850 } 3851 3852 /* 3853 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3854 * has been requested meanwhile and don't start in that case. 3855 * 3856 * Return: 3857 * 0 success 3858 * -EINPROGRESS operation is already in progress, that's probably a bug 3859 * -ECANCELED cancellation request was set before the operation started 3860 */ 3861 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3862 { 3863 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3864 /* This should not happen */ 3865 btrfs_err(fs_info, "reloc already running, cannot start"); 3866 return -EINPROGRESS; 3867 } 3868 3869 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3870 btrfs_info(fs_info, "chunk relocation canceled on start"); 3871 /* 3872 * On cancel, clear all requests but let the caller mark 3873 * the end after cleanup operations. 3874 */ 3875 atomic_set(&fs_info->reloc_cancel_req, 0); 3876 return -ECANCELED; 3877 } 3878 return 0; 3879 } 3880 3881 /* 3882 * Mark end of chunk relocation that is cancellable and wake any waiters. 
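 * Pairs with reloc_chunk_start().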
3883 */ 3884 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3885 { 3886 /* Requested after start, clear bit first so any waiters can continue */ 3887 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3888 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3889 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3890 atomic_set(&fs_info->reloc_cancel_req, 0); 3891 } 3892 3893 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3894 { 3895 struct reloc_control *rc; 3896 3897 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3898 if (!rc) 3899 return NULL; 3900 3901 INIT_LIST_HEAD(&rc->reloc_roots); 3902 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3903 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); 3904 mapping_tree_init(&rc->reloc_root_tree); 3905 extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 3906 return rc; 3907 } 3908 3909 static void free_reloc_control(struct reloc_control *rc) 3910 { 3911 struct mapping_node *node, *tmp; 3912 3913 free_reloc_roots(&rc->reloc_roots); 3914 rbtree_postorder_for_each_entry_safe(node, tmp, 3915 &rc->reloc_root_tree.rb_root, rb_node) 3916 kfree(node); 3917 3918 kfree(rc); 3919 } 3920 3921 /* 3922 * Print the block group being relocated 3923 */ 3924 static void describe_relocation(struct btrfs_fs_info *fs_info, 3925 struct btrfs_block_group *block_group) 3926 { 3927 char buf[128] = {'\0'}; 3928 3929 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3930 3931 btrfs_info(fs_info, 3932 "relocating block group %llu flags %s", 3933 block_group->start, buf); 3934 } 3935 3936 static const char *stage_to_string(int stage) 3937 { 3938 if (stage == MOVE_DATA_EXTENTS) 3939 return "move data extents"; 3940 if (stage == UPDATE_DATA_PTRS) 3941 return "update data pointers"; 3942 return "unknown"; 3943 } 3944 3945 /* 3946 * function to relocate all extents in a block group. 3947 */ 3948 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 3949 { 3950 struct btrfs_block_group *bg; 3951 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3952 struct reloc_control *rc; 3953 struct inode *inode; 3954 struct btrfs_path *path; 3955 int ret; 3956 int rw = 0; 3957 int err = 0; 3958 3959 /* 3960 * This only gets set if we had a half-deleted snapshot on mount. We 3961 * cannot allow relocation to start while we're still trying to clean up 3962 * these pending deletions. 3963 */ 3964 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3965 if (ret) 3966 return ret; 3967 3968 /* We may have been woken up by close_ctree, so bail if we're closing. */ 3969 if (btrfs_fs_closing(fs_info)) 3970 return -EINTR; 3971 3972 bg = btrfs_lookup_block_group(fs_info, group_start); 3973 if (!bg) 3974 return -ENOENT; 3975 3976 /* 3977 * Relocation of a data block group creates ordered extents. Without 3978 * sb_start_write(), we can freeze the filesystem while unfinished 3979 * ordered extents are left. Such ordered extents can cause a deadlock 3980 * e.g. when syncfs() is waiting for their completion but they can't 3981 * finish because they block when joining a transaction, due to the 3982 * fact that the freeze locks are being held in write mode. 
3983 */ 3984 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) 3985 ASSERT(sb_write_started(fs_info->sb)); 3986 3987 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3988 btrfs_put_block_group(bg); 3989 return -ETXTBSY; 3990 } 3991 3992 rc = alloc_reloc_control(fs_info); 3993 if (!rc) { 3994 btrfs_put_block_group(bg); 3995 return -ENOMEM; 3996 } 3997 3998 ret = reloc_chunk_start(fs_info); 3999 if (ret < 0) { 4000 err = ret; 4001 goto out_put_bg; 4002 } 4003 4004 rc->extent_root = extent_root; 4005 rc->block_group = bg; 4006 4007 ret = btrfs_inc_block_group_ro(rc->block_group, true); 4008 if (ret) { 4009 err = ret; 4010 goto out; 4011 } 4012 rw = 1; 4013 4014 path = btrfs_alloc_path(); 4015 if (!path) { 4016 err = -ENOMEM; 4017 goto out; 4018 } 4019 4020 inode = lookup_free_space_inode(rc->block_group, path); 4021 btrfs_free_path(path); 4022 4023 if (!IS_ERR(inode)) 4024 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4025 else 4026 ret = PTR_ERR(inode); 4027 4028 if (ret && ret != -ENOENT) { 4029 err = ret; 4030 goto out; 4031 } 4032 4033 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4034 if (IS_ERR(rc->data_inode)) { 4035 err = PTR_ERR(rc->data_inode); 4036 rc->data_inode = NULL; 4037 goto out; 4038 } 4039 4040 describe_relocation(fs_info, rc->block_group); 4041 4042 btrfs_wait_block_group_reservations(rc->block_group); 4043 btrfs_wait_nocow_writers(rc->block_group); 4044 btrfs_wait_ordered_roots(fs_info, U64_MAX, 4045 rc->block_group->start, 4046 rc->block_group->length); 4047 4048 ret = btrfs_zone_finish(rc->block_group); 4049 WARN_ON(ret && ret != -EAGAIN); 4050 4051 while (1) { 4052 int finishes_stage; 4053 4054 mutex_lock(&fs_info->cleaner_mutex); 4055 ret = relocate_block_group(rc); 4056 mutex_unlock(&fs_info->cleaner_mutex); 4057 if (ret < 0) 4058 err = ret; 4059 4060 finishes_stage = rc->stage; 4061 /* 4062 * We may have gotten ENOSPC after we already dirtied some 4063 * extents. If writeout happens while we're relocating a 4064 * different block group we could end up hitting the 4065 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 4066 * btrfs_reloc_cow_block. Make sure we write everything out 4067 * properly so we don't trip over this problem, and then break 4068 * out of the loop if we hit an error. 
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}
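/*
 * Recovery overview (summarizing the flow of btrfs_recover_relocation()
 * below): any root item left under BTRFS_TREE_RELOC_OBJECTID in the tree
 * root is a reloc tree from an interrupted relocation.  Reloc trees whose
 * source fs tree no longer exists are turned into garbage roots via
 * mark_garbage_root() so they can be dropped later; the rest are
 * re-attached to their fs trees and merged, just as the final stage of an
 * uninterrupted relocation would have done.
 */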
/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with their corresponding fs
 * trees, which is important for preserving the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_end;
	}

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		err = __add_reloc_root(reloc_root);
		ASSERT(err != -EEXIST);
		if (err) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_clean;
	}
	err = btrfs_commit_transaction(trans);
out_clean:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* Cleanup orphan inodes in the data relocation tree. */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		err = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return err;
}

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the existing checksums properly handles nodatasum extents, and
 * also saves the CPU time of recalculating the checksums.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, 0, false);
	if (ret)
		return ret;

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_entry(list.next, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to, say, the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on-disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes, since it's all
		 * disk length.
		 */
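		/*
		 * Worked example with hypothetical numbers: suppose the source
		 * prealloc extent starts at disk_bytenr 1M and the csum item
		 * found covers the bytes at 1M + 64K, i.e. sums->logical ==
		 * 1M + 64K.  If the relocated copy was allocated at
		 * ordered->disk_bytenr == 8M, the cloned csum must describe
		 * 8M + 64K, which is exactly
		 * ordered->disk_bytenr + sums->logical - disk_bytenr.
		 */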
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		btrfs_backref_drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * Called before creating a snapshot.  It calculates the metadata
 * reservation required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees.  The space used by
	 * merging a reloc tree is twice the size of the relocated tree nodes
	 * in the worst case: half for COWing the reloc tree, half for COWing
	 * the fs tree.  The space used by COWing the reloc tree will be freed
	 * after the tree is dropped.  If we create a snapshot, COWing the fs
	 * tree may use more space than it frees, so we need to reserve extra
	 * space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}
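/*
 * Example of the reservation math above, with hypothetical numbers: if 8M
 * worth of tree nodes have been relocated so far (rc->nodes_relocated ==
 * 8M), merging may temporarily need up to 16M of metadata space, 8M to COW
 * the reloc tree and 8M to COW the fs tree.  The reloc tree half comes back
 * when the reloc tree is dropped, so a snapshot created in the middle of
 * the merge only needs the extra rc->nodes_relocated bytes added to its
 * reservation above.
 */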
/*
 * Called after a snapshot is created.  It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(); we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root,
				       new_root->root_key.objectid);
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}

/*
 * Get the current bytenr for the block group which is being relocated.
 *
 * Return U64_MAX if there is no running relocation.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
	u64 logical = U64_MAX;

	lockdep_assert_held(&fs_info->reloc_mutex);

	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
		logical = fs_info->reloc_ctl->block_group->start;
	return logical;
}
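/*
 * Illustrative caller sketch (hypothetical, error handling omitted): the
 * lockdep assertion above means callers must hold reloc_mutex across the
 * call, e.g.
 *
 *	mutex_lock(&fs_info->reloc_mutex);
 *	bytenr = btrfs_get_reloc_bg_bytenr(fs_info);
 *	mutex_unlock(&fs_info->reloc_mutex);
 *
 * Returning U64_MAX when nothing is being relocated makes "is this bytenr
 * inside the relocating block group" range checks fail naturally.
 */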