// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents	| BG A: deleted
 *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
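/*
 * A condensed sketch of the flow described above (illustrative only, error
 * handling and locking omitted; btrfs_relocate_block_group() and
 * relocate_block_group() are defined later in this file):
 *
 *	btrfs_relocate_block_group(fs_info, group_start)
 *	   mark the target block group read-only	(step 1)
 *	   loop:
 *	      relocate_block_group(rc)
 *	         record extents to relocate		(step 2.1)
 *	         build reloc trees / data reloc tree	(step 2.2)
 *	         merge_reloc_roots()			(step 2.3)
 *	   clean_dirty_subvols(rc)			(step 3)
 */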
#define RELOCATION_RESERVED_NODES	256

/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * Represents a tree block to process.
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bits(&rc->processed_blocks, node->bytenr,
				node->bytenr + blocksize - 1, EXTENT_DIRTY);
	}
	node->processed = 1;
}


static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}
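/*
 * Illustrative use of the mapping tree (a sketch, not a separate API; the
 * rb_simple_search()/rb_simple_insert() helpers come from misc.h and key
 * the rb-tree by mapping_node::bytenr):
 *
 *	struct mapping_node *mn = kmalloc(sizeof(*mn), GFP_NOFS);
 *
 *	mn->bytenr = root->commit_root->start;
 *	mn->data = root;
 *	spin_lock(&tree->lock);
 *	rb_simple_insert(&tree->rb_root, mn->bytenr, &mn->rb_node);
 *	spin_unlock(&tree->lock);
 *
 * Lookup is the mirror image: rb_simple_search() under the same lock, then
 * rb_entry() to get back from the rb_node to the mapping_node.
 */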
/*
 * walk up backref nodes until we reach the node that represents the tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

static void update_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;

	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * Detached nodes are used to avoid unnecessary backref
	 * lookup. Transaction commit changes the extent tree,
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * Some nodes can be left in the pending list if there were
	 * errors during processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static bool reloc_root_is_dead(struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}
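/*
 * The DEAD_RELOC_TREE bit and root->reloc_root are written on one CPU and
 * read on another, so the two sides pair memory barriers. A minimal sketch
 * of the protocol (the real writers are btrfs_update_reloc_root() and
 * clean_dirty_subvols() below):
 *
 *	writer					reader
 *	------					------
 *	set_bit(BTRFS_ROOT_DEAD_RELOC_TREE)	smp_rmb()
 *	smp_wmb()				test_bit(BTRFS_ROOT_DEAD_RELOC_TREE)
 *	__del_reloc_root(reloc_root)		if set, do not dereference
 *						root->reloc_root
 */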
/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return 1;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}
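/*
 * find_reloc_root() returns a referenced root (or NULL), so callers follow
 * the usual grab/put discipline. A minimal usage sketch:
 *
 *	struct btrfs_root *reloc_root;
 *
 *	reloc_root = find_reloc_root(fs_info, bytenr);
 *	if (reloc_root) {
 *		... use reloc_root ...
 *		btrfs_put_root(reloc_root);
 *	}
 */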
/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					  struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block. Root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when the tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
						  cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}
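/*
 * Callers of build_backref_tree() (relocate_tree_block(), later in this
 * file) follow the usual ERR_PTR convention. A minimal sketch, assuming
 * @rc, @key, @level and @bytenr are already set up:
 *
 *	struct btrfs_backref_node *node;
 *
 *	node = build_backref_tree(rc, &key, level, bytenr);
 *	if (IS_ERR(node))
 *		return PTR_ERR(node);
 *	... relocate the block, then walk node->upper to its referencers ...
 */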
/*
 * helper to add a backref node for the newly created snapshot. The backref
 * node is created by cloning the backref node that corresponds to the root
 * of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}
/*
 * helper to delete the 'address of tree root -> reloc tree' mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree' mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}
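/*
 * Note the key lifecycle of the mapping handled by the helpers above: a
 * reloc root is inserted keyed by root->commit_root->start
 * (__add_reloc_root), searched and erased by that same key, and re-keyed to
 * root->node->start once the reloc root's node has been COWed
 * (__update_reloc_root). This keeps the mapping in sync with the tree root
 * bytenr that backref walking will actually find on disk.
 */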
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}
/*
 * create reloc tree for a given fs tree. reloc tree is just a
 * snapshot of the fs tree with special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * helper to find the first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}
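/*
 * find_next_inode() is the building block for iterating all cached inodes
 * of a subvolume in ascending inode number order. A minimal sketch of the
 * pattern (replace_file_extents() and invalidate_extent_cache() below are
 * the real users):
 *
 *	u64 objectid = first_objectid;
 *	struct inode *inode;
 *
 *	while ((inode = find_next_inode(root, objectid))) {
 *		objectid = btrfs_ino(BTRFS_I(inode)) + 1;
 *		... use inode ...
 *		iput(inode);
 *	}
 */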
/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
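/*
 * Worked example for the translation above (illustrative numbers only):
 * the data reloc inode is laid out so that file offset == original bytenr
 * minus BTRFS_I(reloc_inode)->index_cnt. With index_cnt == 1G and an old
 * extent at bytenr 1G+8M, the lookup is done at file offset 8M, and the new
 * location is the disk_bytenr of the file extent item found there.
 */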
/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end,
						      &cached_state);
				if (!ret)
					continue;

				btrfs_drop_extent_map_range(BTRFS_I(inode),
							    key.offset, end, true);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end, &cached_state);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							 path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Info qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
				    true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}
/*
 * walk down reloc tree to find the relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose key is in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   struct btrfs_key *min_key,
				   struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		iput(inode);

		if (objectid > max_key->objectid)
			break;

		inode = find_next_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(BTRFS_I(inode));

		if (ino > max_key->objectid) {
			iput(inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}
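/*
 * merge_reloc_root() below drives the helpers above in a simple loop; a
 * condensed sketch (reservations, transactions and error handling omitted):
 *
 *	while (1) {
 *		ret = walk_down_reloc_tree(reloc_root, path, &level);
 *		if (ret > 0)
 *			break;
 *		if (the block is still shared with the fs tree)
 *			replace_path(trans, rc, root, reloc_root, path,
 *				     &next_key, level, max_level);
 *		ret = walk_up_reloc_tree(reloc_root, path, &level);
 *		if (ret > 0)
 *			break;
 *	}
 */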
/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root. If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}
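/*
 * Worked example for the metadata reservation used by merge_reloc_root()
 * below (illustrative numbers only): with a 16K nodesize and a reloc root
 * at level 3, min_reserved = 16K * 3 * 2 = 96K, i.e. enough to COW one
 * path in the reloc tree plus one path in the subvolume tree.
 */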
/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree. Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * and * 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		reloc_root->last_trans = trans->transid;
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case where only one block in the fs tree needs to be
	 * relocated and the block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}

static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}
		ASSERT(root->reloc_root == reloc_root);

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}
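/*
 * merge_reloc_roots() below drains rc->reloc_roots with the common
 * splice-and-process pattern, so that reloc_mutex is not held across the
 * merge itself; a minimal sketch:
 *
 *	LIST_HEAD(reloc_roots);
 *
 *	mutex_lock(&fs_info->reloc_mutex);
 *	list_splice_init(&rc->reloc_roots, &reloc_roots);
 *	mutex_unlock(&fs_info->reloc_mutex);
 *
 *	while (!list_empty(&reloc_roots)) {
 *		... pick the first entry and merge or queue it for cleanup ...
 *	}
 */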
2004 */ 2005 ASSERT(0); 2006 ret = -EINVAL; 2007 goto out; 2008 } 2009 ret = merge_reloc_root(rc, root); 2010 btrfs_put_root(root); 2011 if (ret) { 2012 if (list_empty(&reloc_root->root_list)) 2013 list_add_tail(&reloc_root->root_list, 2014 &reloc_roots); 2015 goto out; 2016 } 2017 } else { 2018 if (!IS_ERR(root)) { 2019 if (root->reloc_root == reloc_root) { 2020 root->reloc_root = NULL; 2021 btrfs_put_root(reloc_root); 2022 } 2023 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, 2024 &root->state); 2025 btrfs_put_root(root); 2026 } 2027 2028 list_del_init(&reloc_root->root_list); 2029 /* Don't forget to queue this reloc root for cleanup */ 2030 list_add_tail(&reloc_root->reloc_dirty_list, 2031 &rc->dirty_subvol_roots); 2032 } 2033 } 2034 2035 if (found) { 2036 found = 0; 2037 goto again; 2038 } 2039 out: 2040 if (ret) { 2041 btrfs_handle_fs_error(fs_info, ret, NULL); 2042 free_reloc_roots(&reloc_roots); 2043 2044 /* new reloc root may be added */ 2045 mutex_lock(&fs_info->reloc_mutex); 2046 list_splice_init(&rc->reloc_roots, &reloc_roots); 2047 mutex_unlock(&fs_info->reloc_mutex); 2048 free_reloc_roots(&reloc_roots); 2049 } 2050 2051 /* 2052 * We used to have 2053 * 2054 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2055 * 2056 * here, but it's wrong. If we fail to start the transaction in 2057 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 2058 * have actually been removed from the reloc_root_tree rb tree. This is 2059 * fine because we're bailing here, and we hold a reference on the root 2060 * for the list that holds it, so these roots will be cleaned up when we 2061 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 2062 * will be cleaned up on unmount. 2063 * 2064 * The remaining nodes will be cleaned up by free_reloc_control. 2065 */ 2066 } 2067 2068 static void free_block_list(struct rb_root *blocks) 2069 { 2070 struct tree_block *block; 2071 struct rb_node *rb_node; 2072 while ((rb_node = rb_first(blocks))) { 2073 block = rb_entry(rb_node, struct tree_block, rb_node); 2074 rb_erase(rb_node, blocks); 2075 kfree(block); 2076 } 2077 } 2078 2079 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2080 struct btrfs_root *reloc_root) 2081 { 2082 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2083 struct btrfs_root *root; 2084 int ret; 2085 2086 if (reloc_root->last_trans == trans->transid) 2087 return 0; 2088 2089 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); 2090 2091 /* 2092 * This should succeed, since we can't have a reloc root without having 2093 * already looked up the actual root and created the reloc root for this 2094 * root. 2095 * 2096 * However if there's some sort of corruption where we have a ref to a 2097 * reloc root without a corresponding root this could return ENOENT. 
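* * A rough sketch of the pairing this lookup relies on: a reloc root stores the objectid of the fs root it shadows in its root_key.offset, and that fs root points back via root->reloc_root. So btrfs_get_fs_root() below must both succeed and hand back the root whose ->reloc_root is this reloc_root.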
2098 */ 2099 if (IS_ERR(root)) { 2100 ASSERT(0); 2101 return PTR_ERR(root); 2102 } 2103 if (root->reloc_root != reloc_root) { 2104 ASSERT(0); 2105 btrfs_err(fs_info, 2106 "root %llu has two reloc roots associated with it", 2107 reloc_root->root_key.offset); 2108 btrfs_put_root(root); 2109 return -EUCLEAN; 2110 } 2111 ret = btrfs_record_root_in_trans(trans, root); 2112 btrfs_put_root(root); 2113 2114 return ret; 2115 } 2116 2117 static noinline_for_stack 2118 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2119 struct reloc_control *rc, 2120 struct btrfs_backref_node *node, 2121 struct btrfs_backref_edge *edges[]) 2122 { 2123 struct btrfs_backref_node *next; 2124 struct btrfs_root *root; 2125 int index = 0; 2126 int ret; 2127 2128 next = node; 2129 while (1) { 2130 cond_resched(); 2131 next = walk_up_backref(next, edges, &index); 2132 root = next->root; 2133 2134 /* 2135 * If there is no root, then our references for this block are 2136 * incomplete, as we should be able to walk all the way up to a 2137 * block that is owned by a root. 2138 * 2139 * This path is only for SHAREABLE roots, so if we come upon a 2140 * non-SHAREABLE root then we have backrefs that resolve 2141 * improperly. 2142 * 2143 * Both of these cases indicate file system corruption, or a bug 2144 * in the backref walking code. 2145 */ 2146 if (!root) { 2147 ASSERT(0); 2148 btrfs_err(trans->fs_info, 2149 "bytenr %llu doesn't have a backref path ending in a root", 2150 node->bytenr); 2151 return ERR_PTR(-EUCLEAN); 2152 } 2153 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2154 ASSERT(0); 2155 btrfs_err(trans->fs_info, 2156 "bytenr %llu has multiple refs with one ending in a non-shareable root", 2157 node->bytenr); 2158 return ERR_PTR(-EUCLEAN); 2159 } 2160 2161 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2162 ret = record_reloc_root_in_trans(trans, root); 2163 if (ret) 2164 return ERR_PTR(ret); 2165 break; 2166 } 2167 2168 ret = btrfs_record_root_in_trans(trans, root); 2169 if (ret) 2170 return ERR_PTR(ret); 2171 root = root->reloc_root; 2172 2173 /* 2174 * We could have raced with another thread which failed, so 2175 * root->reloc_root may not be set, return ENOENT in this case. 2176 */ 2177 if (!root) 2178 return ERR_PTR(-ENOENT); 2179 2180 if (next->new_bytenr != root->node->start) { 2181 /* 2182 * We just created the reloc root, so we shouldn't have 2183 * ->new_bytenr set and this shouldn't be in the changed 2184 * list. If it is then we have multiple roots pointing 2185 * at the same bytenr which indicates corruption, or 2186 * we've made a mistake in the backref walking code. 2187 */ 2188 ASSERT(next->new_bytenr == 0); 2189 ASSERT(list_empty(&next->list)); 2190 if (next->new_bytenr || !list_empty(&next->list)) { 2191 btrfs_err(trans->fs_info, 2192 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2193 node->bytenr, next->bytenr); 2194 return ERR_PTR(-EUCLEAN); 2195 } 2196 2197 next->new_bytenr = root->node->start; 2198 btrfs_put_root(next->root); 2199 next->root = btrfs_grab_root(root); 2200 ASSERT(next->root); 2201 list_add_tail(&next->list, 2202 &rc->backref_cache.changed); 2203 mark_block_processed(rc, next); 2204 break; 2205 } 2206 2207 WARN_ON(1); 2208 root = NULL; 2209 next = walk_down_backref(edges, &index); 2210 if (!next || next->level <= node->level) 2211 break; 2212 } 2213 if (!root) { 2214 /* 2215 * This can happen if there's fs corruption or if there's a bug 2216 * in the backref lookup code. 
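* * Concretely, root is only left NULL here when we took the WARN_ON(1) branch above (next->new_bytenr already matched the reloc root's node) and walk_down_backref() then found no lower branch to restart from.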
2217 */ 2218 ASSERT(0); 2219 return ERR_PTR(-ENOENT); 2220 } 2221 2222 next = node; 2223 /* setup backref node path for btrfs_reloc_cow_block */ 2224 while (1) { 2225 rc->backref_cache.path[next->level] = next; 2226 if (--index < 0) 2227 break; 2228 next = edges[index]->node[UPPER]; 2229 } 2230 return root; 2231 } 2232 2233 /* 2234 * Select a tree root for relocation. 2235 * 2236 * Return NULL if the block is not shareable. We should use do_relocation() in 2237 * this case. 2238 * 2239 * Return a tree root pointer if the block is shareable. 2240 * Return -ENOENT if the block is root of reloc tree. 2241 */ 2242 static noinline_for_stack 2243 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2244 { 2245 struct btrfs_backref_node *next; 2246 struct btrfs_root *root; 2247 struct btrfs_root *fs_root = NULL; 2248 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2249 int index = 0; 2250 2251 next = node; 2252 while (1) { 2253 cond_resched(); 2254 next = walk_up_backref(next, edges, &index); 2255 root = next->root; 2256 2257 /* 2258 * This can occur if we have incomplete extent refs leading all 2259 * the way up a particular path, in this case return -EUCLEAN. 2260 */ 2261 if (!root) 2262 return ERR_PTR(-EUCLEAN); 2263 2264 /* No other choice for non-shareable tree */ 2265 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 2266 return root; 2267 2268 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2269 fs_root = root; 2270 2271 if (next != node) 2272 return NULL; 2273 2274 next = walk_down_backref(edges, &index); 2275 if (!next || next->level <= node->level) 2276 break; 2277 } 2278 2279 if (!fs_root) 2280 return ERR_PTR(-ENOENT); 2281 return fs_root; 2282 } 2283 2284 static noinline_for_stack 2285 u64 calcu_metadata_size(struct reloc_control *rc, 2286 struct btrfs_backref_node *node, int reserve) 2287 { 2288 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2289 struct btrfs_backref_node *next = node; 2290 struct btrfs_backref_edge *edge; 2291 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2292 u64 num_bytes = 0; 2293 int index = 0; 2294 2295 BUG_ON(reserve && node->processed); 2296 2297 while (next) { 2298 cond_resched(); 2299 while (1) { 2300 if (next->processed && (reserve || next != node)) 2301 break; 2302 2303 num_bytes += fs_info->nodesize; 2304 2305 if (list_empty(&next->upper)) 2306 break; 2307 2308 edge = list_entry(next->upper.next, 2309 struct btrfs_backref_edge, list[LOWER]); 2310 edges[index++] = edge; 2311 next = edge->node[UPPER]; 2312 } 2313 next = walk_down_backref(edges, &index); 2314 } 2315 return num_bytes; 2316 } 2317 2318 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2319 struct reloc_control *rc, 2320 struct btrfs_backref_node *node) 2321 { 2322 struct btrfs_root *root = rc->extent_root; 2323 struct btrfs_fs_info *fs_info = root->fs_info; 2324 u64 num_bytes; 2325 int ret; 2326 u64 tmp; 2327 2328 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2329 2330 trans->block_rsv = rc->block_rsv; 2331 rc->reserved_bytes += num_bytes; 2332 2333 /* 2334 * We are under a transaction here so we can only do limited flushing. 2335 * If we get an enospc just kick back -EAGAIN so we know to drop the 2336 * transaction and try to refill when we can flush all the things. 
2337 */ 2338 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, 2339 BTRFS_RESERVE_FLUSH_LIMIT); 2340 if (ret) { 2341 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2342 while (tmp <= rc->reserved_bytes) 2343 tmp <<= 1; 2344 /* 2345 * Only one thread can access block_rsv at this point, 2346 * so we don't need to hold a lock to protect it. 2347 * We expand the reservation size here to allow enough 2348 * space for relocation, and we will return early in 2349 * the enospc case. 2350 */ 2351 rc->block_rsv->size = tmp + fs_info->nodesize * 2352 RELOCATION_RESERVED_NODES; 2353 return -EAGAIN; 2354 } 2355 2356 return 0; 2357 } 2358 2359 /* 2360 * relocate a block tree, and then update pointers in upper level 2361 * blocks that reference the block to point to the new location. 2362 * 2363 * if called by link_to_upper, the block has already been relocated. 2364 * in that case this function just updates pointers. 2365 */ 2366 static int do_relocation(struct btrfs_trans_handle *trans, 2367 struct reloc_control *rc, 2368 struct btrfs_backref_node *node, 2369 struct btrfs_key *key, 2370 struct btrfs_path *path, int lowest) 2371 { 2372 struct btrfs_backref_node *upper; 2373 struct btrfs_backref_edge *edge; 2374 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2375 struct btrfs_root *root; 2376 struct extent_buffer *eb; 2377 u32 blocksize; 2378 u64 bytenr; 2379 int slot; 2380 int ret = 0; 2381 2382 /* 2383 * If we are lowest then this is the first time we're processing this 2384 * block, and thus shouldn't have an eb associated with it yet. 2385 */ 2386 ASSERT(!lowest || !node->eb); 2387 2388 path->lowest_level = node->level + 1; 2389 rc->backref_cache.path[node->level] = node; 2390 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2391 struct btrfs_ref ref = { 0 }; 2392 2393 cond_resched(); 2394 2395 upper = edge->node[UPPER]; 2396 root = select_reloc_root(trans, rc, upper, edges); 2397 if (IS_ERR(root)) { 2398 ret = PTR_ERR(root); 2399 goto next; 2400 } 2401 2402 if (upper->eb && !upper->locked) { 2403 if (!lowest) { 2404 ret = btrfs_bin_search(upper->eb, key, &slot); 2405 if (ret < 0) 2406 goto next; 2407 BUG_ON(ret); 2408 bytenr = btrfs_node_blockptr(upper->eb, slot); 2409 if (node->eb->start == bytenr) 2410 goto next; 2411 } 2412 btrfs_backref_drop_node_buffer(upper); 2413 } 2414 2415 if (!upper->eb) { 2416 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2417 if (ret) { 2418 if (ret > 0) 2419 ret = -ENOENT; 2420 2421 btrfs_release_path(path); 2422 break; 2423 } 2424 2425 if (!upper->eb) { 2426 upper->eb = path->nodes[upper->level]; 2427 path->nodes[upper->level] = NULL; 2428 } else { 2429 BUG_ON(upper->eb != path->nodes[upper->level]); 2430 } 2431 2432 upper->locked = 1; 2433 path->locks[upper->level] = 0; 2434 2435 slot = path->slots[upper->level]; 2436 btrfs_release_path(path); 2437 } else { 2438 ret = btrfs_bin_search(upper->eb, key, &slot); 2439 if (ret < 0) 2440 goto next; 2441 BUG_ON(ret); 2442 } 2443 2444 bytenr = btrfs_node_blockptr(upper->eb, slot); 2445 if (lowest) { 2446 if (bytenr != node->bytenr) { 2447 btrfs_err(root->fs_info, 2448 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2449 bytenr, node->bytenr, slot, 2450 upper->eb->start); 2451 ret = -EIO; 2452 goto next; 2453 } 2454 } else { 2455 if (node->eb->start == bytenr) 2456 goto next; 2457 } 2458 2459 blocksize = root->fs_info->nodesize; 2460 eb = btrfs_read_node_slot(upper->eb, slot); 2461 if (IS_ERR(eb)) { 2462 ret = PTR_ERR(eb); 2463 goto next; 2464 }
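/* Two cases from here on: if the lowest block has not been COWed yet (!node->eb), COWing it under upper->eb both relocates it and updates the parent slot in one step; otherwise the block is already relocated and we only re-point the parent slot at node->eb and fix up the reference counts. */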
2465 btrfs_tree_lock(eb); 2466 2467 if (!node->eb) { 2468 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2469 slot, &eb, BTRFS_NESTING_COW); 2470 btrfs_tree_unlock(eb); 2471 free_extent_buffer(eb); 2472 if (ret < 0) 2473 goto next; 2474 /* 2475 * We've just COWed this block; it should have updated 2476 * the correct backref node entry. 2477 */ 2478 ASSERT(node->eb == eb); 2479 } else { 2480 btrfs_set_node_blockptr(upper->eb, slot, 2481 node->eb->start); 2482 btrfs_set_node_ptr_generation(upper->eb, slot, 2483 trans->transid); 2484 btrfs_mark_buffer_dirty(upper->eb); 2485 2486 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2487 node->eb->start, blocksize, 2488 upper->eb->start); 2489 btrfs_init_tree_ref(&ref, node->level, 2490 btrfs_header_owner(upper->eb), 2491 root->root_key.objectid, false); 2492 ret = btrfs_inc_extent_ref(trans, &ref); 2493 if (!ret) 2494 ret = btrfs_drop_subtree(trans, root, eb, 2495 upper->eb); 2496 if (ret) 2497 btrfs_abort_transaction(trans, ret); 2498 } 2499 next: 2500 if (!upper->pending) 2501 btrfs_backref_drop_node_buffer(upper); 2502 else 2503 btrfs_backref_unlock_node_buffer(upper); 2504 if (ret) 2505 break; 2506 } 2507 2508 if (!ret && node->pending) { 2509 btrfs_backref_drop_node_buffer(node); 2510 list_move_tail(&node->list, &rc->backref_cache.changed); 2511 node->pending = 0; 2512 } 2513 2514 path->lowest_level = 0; 2515 2516 /* 2517 * We should have allocated all of our space in the block rsv and thus 2518 * shouldn't ENOSPC. 2519 */ 2520 ASSERT(ret != -ENOSPC); 2521 return ret; 2522 } 2523 2524 static int link_to_upper(struct btrfs_trans_handle *trans, 2525 struct reloc_control *rc, 2526 struct btrfs_backref_node *node, 2527 struct btrfs_path *path) 2528 { 2529 struct btrfs_key key; 2530 2531 btrfs_node_key_to_cpu(node->eb, &key, 0); 2532 return do_relocation(trans, rc, node, &key, path, 0); 2533 } 2534 2535 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2536 struct reloc_control *rc, 2537 struct btrfs_path *path, int err) 2538 { 2539 LIST_HEAD(list); 2540 struct btrfs_backref_cache *cache = &rc->backref_cache; 2541 struct btrfs_backref_node *node; 2542 int level; 2543 int ret; 2544 2545 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2546 while (!list_empty(&cache->pending[level])) { 2547 node = list_entry(cache->pending[level].next, 2548 struct btrfs_backref_node, list); 2549 list_move_tail(&node->list, &list); 2550 BUG_ON(!node->pending); 2551 2552 if (!err) { 2553 ret = link_to_upper(trans, rc, node, path); 2554 if (ret < 0) 2555 err = ret; 2556 } 2557 } 2558 list_splice_init(&list, &cache->pending[level]); 2559 } 2560 return err; 2561 } 2562 2563 /* 2564 * mark a block and all blocks that directly/indirectly reference the 2565 * block as processed.
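* In other words, walk every backref path from the block up towards its tree root(s), marking each node along the way; walk_down_backref() backtracks to the next unvisited branch.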
2566 */ 2567 static void update_processed_blocks(struct reloc_control *rc, 2568 struct btrfs_backref_node *node) 2569 { 2570 struct btrfs_backref_node *next = node; 2571 struct btrfs_backref_edge *edge; 2572 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2573 int index = 0; 2574 2575 while (next) { 2576 cond_resched(); 2577 while (1) { 2578 if (next->processed) 2579 break; 2580 2581 mark_block_processed(rc, next); 2582 2583 if (list_empty(&next->upper)) 2584 break; 2585 2586 edge = list_entry(next->upper.next, 2587 struct btrfs_backref_edge, list[LOWER]); 2588 edges[index++] = edge; 2589 next = edge->node[UPPER]; 2590 } 2591 next = walk_down_backref(edges, &index); 2592 } 2593 } 2594 2595 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2596 { 2597 u32 blocksize = rc->extent_root->fs_info->nodesize; 2598 2599 if (test_range_bit(&rc->processed_blocks, bytenr, 2600 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2601 return 1; 2602 return 0; 2603 } 2604 2605 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2606 struct tree_block *block) 2607 { 2608 struct extent_buffer *eb; 2609 2610 eb = read_tree_block(fs_info, block->bytenr, block->owner, 2611 block->key.offset, block->level, NULL); 2612 if (IS_ERR(eb)) 2613 return PTR_ERR(eb); 2614 if (!extent_buffer_uptodate(eb)) { 2615 free_extent_buffer(eb); 2616 return -EIO; 2617 } 2618 if (block->level == 0) 2619 btrfs_item_key_to_cpu(eb, &block->key, 0); 2620 else 2621 btrfs_node_key_to_cpu(eb, &block->key, 0); 2622 free_extent_buffer(eb); 2623 block->key_ready = 1; 2624 return 0; 2625 } 2626 2627 /* 2628 * helper function to relocate a tree block 2629 */ 2630 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2631 struct reloc_control *rc, 2632 struct btrfs_backref_node *node, 2633 struct btrfs_key *key, 2634 struct btrfs_path *path) 2635 { 2636 struct btrfs_root *root; 2637 int ret = 0; 2638 2639 if (!node) 2640 return 0; 2641 2642 /* 2643 * If we fail here we want to drop our backref_node because we are going 2644 * to start over and regenerate the tree for it. 2645 */ 2646 ret = reserve_metadata_space(trans, rc, node); 2647 if (ret) 2648 goto out; 2649 2650 BUG_ON(node->processed); 2651 root = select_one_root(node); 2652 if (IS_ERR(root)) { 2653 ret = PTR_ERR(root); 2654 2655 /* See explanation in select_one_root for the -EUCLEAN case. */ 2656 ASSERT(ret == -ENOENT); 2657 if (ret == -ENOENT) { 2658 ret = 0; 2659 update_processed_blocks(rc, node); 2660 } 2661 goto out; 2662 } 2663 2664 if (root) { 2665 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2666 /* 2667 * This block was the root block of a root, and this is 2668 * the first time we're processing the block and thus it 2669 * should not have had its ->new_bytenr modified and 2670 * should not have been included on the changed list. 2671 * 2672 * However in the case of corruption we could have 2673 * multiple refs pointing to the same block improperly, 2674 * and thus we would trip over these checks. ASSERT() 2675 * for the developer case, because it could indicate a 2676 * bug in the backref code; however, error out for a 2677 * normal user in the case of corruption.
2678 */ 2679 ASSERT(node->new_bytenr == 0); 2680 ASSERT(list_empty(&node->list)); 2681 if (node->new_bytenr || !list_empty(&node->list)) { 2682 btrfs_err(root->fs_info, 2683 "bytenr %llu has improper references to it", 2684 node->bytenr); 2685 ret = -EUCLEAN; 2686 goto out; 2687 } 2688 ret = btrfs_record_root_in_trans(trans, root); 2689 if (ret) 2690 goto out; 2691 /* 2692 * Another thread could have failed; we need to check if we 2693 * actually have reloc_root set. 2694 */ 2695 if (!root->reloc_root) { 2696 ret = -ENOENT; 2697 goto out; 2698 } 2699 root = root->reloc_root; 2700 node->new_bytenr = root->node->start; 2701 btrfs_put_root(node->root); 2702 node->root = btrfs_grab_root(root); 2703 ASSERT(node->root); 2704 list_add_tail(&node->list, &rc->backref_cache.changed); 2705 } else { 2706 path->lowest_level = node->level; 2707 if (root == root->fs_info->chunk_root) 2708 btrfs_reserve_chunk_metadata(trans, false); 2709 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2710 btrfs_release_path(path); 2711 if (root == root->fs_info->chunk_root) 2712 btrfs_trans_release_chunk_metadata(trans); 2713 if (ret > 0) 2714 ret = 0; 2715 } 2716 if (!ret) 2717 update_processed_blocks(rc, node); 2718 } else { 2719 ret = do_relocation(trans, rc, node, key, path, 1); 2720 } 2721 out: 2722 if (ret || node->level == 0 || node->cowonly) 2723 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2724 return ret; 2725 } 2726 2727 /* 2728 * relocate a list of blocks 2729 */ 2730 static noinline_for_stack 2731 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2732 struct reloc_control *rc, struct rb_root *blocks) 2733 { 2734 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2735 struct btrfs_backref_node *node; 2736 struct btrfs_path *path; 2737 struct tree_block *block; 2738 struct tree_block *next; 2739 int ret; 2740 int err = 0; 2741 2742 path = btrfs_alloc_path(); 2743 if (!path) { 2744 err = -ENOMEM; 2745 goto out_free_blocks; 2746 } 2747 2748 /* Kick in readahead for tree blocks with missing keys */ 2749 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2750 if (!block->key_ready) 2751 btrfs_readahead_tree_block(fs_info, block->bytenr, 2752 block->owner, 0, 2753 block->level); 2754 } 2755 2756 /* Get first keys */ 2757 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2758 if (!block->key_ready) { 2759 err = get_tree_block_key(fs_info, block); 2760 if (err) 2761 goto out_free_path; 2762 } 2763 } 2764 2765 /* Do tree relocation */ 2766 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2767 node = build_backref_tree(rc, &block->key, 2768 block->level, block->bytenr); 2769 if (IS_ERR(node)) { 2770 err = PTR_ERR(node); 2771 goto out; 2772 } 2773 2774 ret = relocate_tree_block(trans, rc, node, &block->key, 2775 path); 2776 if (ret < 0) { 2777 err = ret; 2778 break; 2779 } 2780 } 2781 out: 2782 err = finish_pending_nodes(trans, rc, path, err); 2783 2784 out_free_path: 2785 btrfs_free_path(path); 2786 out_free_blocks: 2787 free_block_list(blocks); 2788 return err; 2789 } 2790 2791 static noinline_for_stack int prealloc_file_extent_cluster( 2792 struct btrfs_inode *inode, 2793 struct file_extent_cluster *cluster) 2794 { 2795 u64 alloc_hint = 0; 2796 u64 start; 2797 u64 end; 2798 u64 offset = inode->index_cnt; 2799 u64 num_bytes; 2800 int nr; 2801 int ret = 0; 2802 u64 i_size = i_size_read(&inode->vfs_inode); 2803 u64 prealloc_start = cluster->start - offset; 2804 u64 prealloc_end = cluster->end - offset; 2805 u64 cur_offset = prealloc_start; 2806 2807 /* 2808 * For the subpage case, the previous i_size may not be aligned to PAGE_SIZE. 2809 * This means the range [i_size, PAGE_END + 1) was filled with zeros by 2810 * the btrfs_do_readpage() call for a previously relocated file cluster. 2811 * 2812 * If the current cluster starts in the above range, btrfs_do_readpage() 2813 * will skip the read, and relocate_one_page() will later write back 2814 * the padding zeros as new data, causing data corruption. 2815 * 2816 * Here we have to manually invalidate the range [i_size, PAGE_END + 1). 2817 */ 2818 if (!IS_ALIGNED(i_size, PAGE_SIZE)) { 2819 struct address_space *mapping = inode->vfs_inode.i_mapping; 2820 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2821 const u32 sectorsize = fs_info->sectorsize; 2822 struct page *page; 2823 2824 ASSERT(sectorsize < PAGE_SIZE); 2825 ASSERT(IS_ALIGNED(i_size, sectorsize)); 2826 2827 /* 2828 * Subpage can't handle a page with DIRTY but without UPTODATE 2829 * bit set, as it can lead to the following deadlock: 2830 * 2831 * btrfs_read_folio() 2832 * | Page already *locked* 2833 * |- btrfs_lock_and_flush_ordered_range() 2834 * |- btrfs_start_ordered_extent() 2835 * |- extent_write_cache_pages() 2836 * |- lock_page() 2837 * We try to lock the page we already hold. 2838 * 2839 * Here we just write back the whole data reloc inode, so that 2840 * we are guaranteed to have no dirty range in the page, and 2841 * are safe to clear the uptodate bits. 2842 * 2843 * This shouldn't cause too much overhead, as we need to write 2844 * the data back anyway. 2845 */ 2846 ret = filemap_write_and_wait(mapping); 2847 if (ret < 0) 2848 return ret; 2849 2850 clear_extent_bits(&inode->io_tree, i_size, 2851 round_up(i_size, PAGE_SIZE) - 1, 2852 EXTENT_UPTODATE); 2853 page = find_lock_page(mapping, i_size >> PAGE_SHIFT); 2854 /* 2855 * If the page was freed we don't need to do anything, as we 2856 * will re-read the whole page anyway.
2857 */ 2858 if (page) { 2859 btrfs_subpage_clear_uptodate(fs_info, page, i_size, 2860 round_up(i_size, PAGE_SIZE) - i_size); 2861 unlock_page(page); 2862 put_page(page); 2863 } 2864 } 2865 2866 BUG_ON(cluster->start != cluster->boundary[0]); 2867 ret = btrfs_alloc_data_chunk_ondemand(inode, 2868 prealloc_end + 1 - prealloc_start); 2869 if (ret) 2870 return ret; 2871 2872 btrfs_inode_lock(&inode->vfs_inode, 0); 2873 for (nr = 0; nr < cluster->nr; nr++) { 2874 struct extent_state *cached_state = NULL; 2875 2876 start = cluster->boundary[nr] - offset; 2877 if (nr + 1 < cluster->nr) 2878 end = cluster->boundary[nr + 1] - 1 - offset; 2879 else 2880 end = cluster->end - offset; 2881 2882 lock_extent(&inode->io_tree, start, end, &cached_state); 2883 num_bytes = end + 1 - start; 2884 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2885 num_bytes, num_bytes, 2886 end + 1, &alloc_hint); 2887 cur_offset = end + 1; 2888 unlock_extent(&inode->io_tree, start, end, &cached_state); 2889 if (ret) 2890 break; 2891 } 2892 btrfs_inode_unlock(&inode->vfs_inode, 0); 2893 2894 if (cur_offset < prealloc_end) 2895 btrfs_free_reserved_data_space_noquota(inode->root->fs_info, 2896 prealloc_end + 1 - cur_offset); 2897 return ret; 2898 } 2899 2900 static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, 2901 u64 start, u64 end, u64 block_start) 2902 { 2903 struct extent_map *em; 2904 struct extent_state *cached_state = NULL; 2905 int ret = 0; 2906 2907 em = alloc_extent_map(); 2908 if (!em) 2909 return -ENOMEM; 2910 2911 em->start = start; 2912 em->len = end + 1 - start; 2913 em->block_len = em->len; 2914 em->block_start = block_start; 2915 set_bit(EXTENT_FLAG_PINNED, &em->flags); 2916 2917 lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); 2918 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); 2919 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); 2920 free_extent_map(em); 2921 2922 return ret; 2923 } 2924 2925 /* 2926 * Allow error injection to test balance/relocation cancellation 2927 */ 2928 noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) 2929 { 2930 return atomic_read(&fs_info->balance_cancel_req) || 2931 atomic_read(&fs_info->reloc_cancel_req) || 2932 fatal_signal_pending(current); 2933 } 2934 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2935 2936 static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, 2937 int cluster_nr) 2938 { 2939 /* Last extent, use cluster end directly */ 2940 if (cluster_nr >= cluster->nr - 1) 2941 return cluster->end; 2942 2943 /* Use next boundary start*/ 2944 return cluster->boundary[cluster_nr + 1] - 1; 2945 } 2946 2947 static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, 2948 struct file_extent_cluster *cluster, 2949 int *cluster_nr, unsigned long page_index) 2950 { 2951 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2952 u64 offset = BTRFS_I(inode)->index_cnt; 2953 const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; 2954 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2955 struct page *page; 2956 u64 page_start; 2957 u64 page_end; 2958 u64 cur; 2959 int ret; 2960 2961 ASSERT(page_index <= last_index); 2962 page = find_lock_page(inode->i_mapping, page_index); 2963 if (!page) { 2964 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 2965 page_index, last_index + 1 - page_index); 2966 page = find_or_create_page(inode->i_mapping, page_index, mask); 2967 if (!page) 2968 return -ENOMEM; 
2969 } 2970 ret = set_page_extent_mapped(page); 2971 if (ret < 0) 2972 goto release_page; 2973 2974 if (PageReadahead(page)) 2975 page_cache_async_readahead(inode->i_mapping, ra, NULL, 2976 page_folio(page), page_index, 2977 last_index + 1 - page_index); 2978 2979 if (!PageUptodate(page)) { 2980 btrfs_read_folio(NULL, page_folio(page)); 2981 lock_page(page); 2982 if (!PageUptodate(page)) { 2983 ret = -EIO; 2984 goto release_page; 2985 } 2986 } 2987 2988 page_start = page_offset(page); 2989 page_end = page_start + PAGE_SIZE - 1; 2990 2991 /* 2992 * Start from the cluster, as for subpage case, the cluster can start 2993 * inside the page. 2994 */ 2995 cur = max(page_start, cluster->boundary[*cluster_nr] - offset); 2996 while (cur <= page_end) { 2997 struct extent_state *cached_state = NULL; 2998 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2999 u64 extent_end = get_cluster_boundary_end(cluster, 3000 *cluster_nr) - offset; 3001 u64 clamped_start = max(page_start, extent_start); 3002 u64 clamped_end = min(page_end, extent_end); 3003 u32 clamped_len = clamped_end + 1 - clamped_start; 3004 3005 /* Reserve metadata for this range */ 3006 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3007 clamped_len, clamped_len, 3008 false); 3009 if (ret) 3010 goto release_page; 3011 3012 /* Mark the range delalloc and dirty for later writeback */ 3013 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 3014 &cached_state); 3015 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 3016 clamped_end, 0, &cached_state); 3017 if (ret) { 3018 clear_extent_bit(&BTRFS_I(inode)->io_tree, 3019 clamped_start, clamped_end, 3020 EXTENT_LOCKED | EXTENT_BOUNDARY, 3021 &cached_state); 3022 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3023 clamped_len, true); 3024 btrfs_delalloc_release_extents(BTRFS_I(inode), 3025 clamped_len); 3026 goto release_page; 3027 } 3028 btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); 3029 3030 /* 3031 * Set the boundary if it's inside the page. 3032 * Data relocation requires the destination extents to have the 3033 * same size as the source. 3034 * EXTENT_BOUNDARY bit prevents current extent from being merged 3035 * with previous extent. 3036 */ 3037 if (in_range(cluster->boundary[*cluster_nr] - offset, 3038 page_start, PAGE_SIZE)) { 3039 u64 boundary_start = cluster->boundary[*cluster_nr] - 3040 offset; 3041 u64 boundary_end = boundary_start + 3042 fs_info->sectorsize - 1; 3043 3044 set_extent_bits(&BTRFS_I(inode)->io_tree, 3045 boundary_start, boundary_end, 3046 EXTENT_BOUNDARY); 3047 } 3048 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 3049 &cached_state); 3050 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 3051 cur += clamped_len; 3052 3053 /* Crossed extent end, go to next extent */ 3054 if (cur >= extent_end) { 3055 (*cluster_nr)++; 3056 /* Just finished the last extent of the cluster, exit. 
*/ 3057 if (*cluster_nr >= cluster->nr) 3058 break; 3059 } 3060 } 3061 unlock_page(page); 3062 put_page(page); 3063 3064 balance_dirty_pages_ratelimited(inode->i_mapping); 3065 btrfs_throttle(fs_info); 3066 if (btrfs_should_cancel_balance(fs_info)) 3067 ret = -ECANCELED; 3068 return ret; 3069 3070 release_page: 3071 unlock_page(page); 3072 put_page(page); 3073 return ret; 3074 } 3075 3076 static int relocate_file_extent_cluster(struct inode *inode, 3077 struct file_extent_cluster *cluster) 3078 { 3079 u64 offset = BTRFS_I(inode)->index_cnt; 3080 unsigned long index; 3081 unsigned long last_index; 3082 struct file_ra_state *ra; 3083 int cluster_nr = 0; 3084 int ret = 0; 3085 3086 if (!cluster->nr) 3087 return 0; 3088 3089 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3090 if (!ra) 3091 return -ENOMEM; 3092 3093 ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); 3094 if (ret) 3095 goto out; 3096 3097 file_ra_state_init(ra, inode->i_mapping); 3098 3099 ret = setup_relocation_extent_mapping(inode, cluster->start - offset, 3100 cluster->end - offset, cluster->start); 3101 if (ret) 3102 goto out; 3103 3104 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3105 for (index = (cluster->start - offset) >> PAGE_SHIFT; 3106 index <= last_index && !ret; index++) 3107 ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); 3108 if (ret == 0) 3109 WARN_ON(cluster_nr != cluster->nr); 3110 out: 3111 kfree(ra); 3112 return ret; 3113 } 3114 3115 static noinline_for_stack 3116 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3117 struct file_extent_cluster *cluster) 3118 { 3119 int ret; 3120 3121 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3122 ret = relocate_file_extent_cluster(inode, cluster); 3123 if (ret) 3124 return ret; 3125 cluster->nr = 0; 3126 } 3127 3128 if (!cluster->nr) 3129 cluster->start = extent_key->objectid; 3130 else 3131 BUG_ON(cluster->nr >= MAX_EXTENTS); 3132 cluster->end = extent_key->objectid + extent_key->offset - 1; 3133 cluster->boundary[cluster->nr] = extent_key->objectid; 3134 cluster->nr++; 3135 3136 if (cluster->nr >= MAX_EXTENTS) { 3137 ret = relocate_file_extent_cluster(inode, cluster); 3138 if (ret) 3139 return ret; 3140 cluster->nr = 0; 3141 } 3142 return 0; 3143 } 3144 3145 /* 3146 * helper to add a tree block to the list. 
3147 * the major work is getting the generation and level of the block 3148 */ 3149 static int add_tree_block(struct reloc_control *rc, 3150 struct btrfs_key *extent_key, 3151 struct btrfs_path *path, 3152 struct rb_root *blocks) 3153 { 3154 struct extent_buffer *eb; 3155 struct btrfs_extent_item *ei; 3156 struct btrfs_tree_block_info *bi; 3157 struct tree_block *block; 3158 struct rb_node *rb_node; 3159 u32 item_size; 3160 int level = -1; 3161 u64 generation; 3162 u64 owner = 0; 3163 3164 eb = path->nodes[0]; 3165 item_size = btrfs_item_size(eb, path->slots[0]); 3166 3167 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3168 item_size >= sizeof(*ei) + sizeof(*bi)) { 3169 unsigned long ptr = 0, end; 3170 3171 ei = btrfs_item_ptr(eb, path->slots[0], 3172 struct btrfs_extent_item); 3173 end = (unsigned long)ei + item_size; 3174 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3175 bi = (struct btrfs_tree_block_info *)(ei + 1); 3176 level = btrfs_tree_block_level(eb, bi); 3177 ptr = (unsigned long)(bi + 1); 3178 } else { 3179 level = (int)extent_key->offset; 3180 ptr = (unsigned long)(ei + 1); 3181 } 3182 generation = btrfs_extent_generation(eb, ei); 3183 3184 /* 3185 * We're reading random blocks without knowing their owner ahead 3186 * of time. This is ok most of the time, as all reloc roots and 3187 * fs roots have the same lock type. However normal trees do 3188 * not, and the only way to know ahead of time is to read the 3189 * inline ref offset. We know it's an fs root if 3190 * 3191 * 1. There's more than one ref. 3192 * 2. There's a SHARED_DATA_REF_KEY set. 3193 * 3. FULL_BACKREF is set on the flags. 3194 * 3195 * Otherwise it's safe to assume that the ref offset == the 3196 * owner of this block, so we can use that when calling 3197 * read_tree_block. 
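* * For example (illustrative): a block referenced only by the fs tree with objectid 5 carries a single inline TREE_BLOCK_REF_KEY ref whose offset is 5, which is exactly the owner we want to hand to read_tree_block.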
3198 */ 3199 if (btrfs_extent_refs(eb, ei) == 1 && 3200 !(btrfs_extent_flags(eb, ei) & 3201 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3202 ptr < end) { 3203 struct btrfs_extent_inline_ref *iref; 3204 int type; 3205 3206 iref = (struct btrfs_extent_inline_ref *)ptr; 3207 type = btrfs_get_extent_inline_ref_type(eb, iref, 3208 BTRFS_REF_TYPE_BLOCK); 3209 if (type == BTRFS_REF_TYPE_INVALID) 3210 return -EINVAL; 3211 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3212 owner = btrfs_extent_inline_ref_offset(eb, iref); 3213 } 3214 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3215 btrfs_print_v0_err(eb->fs_info); 3216 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3217 return -EINVAL; 3218 } else { 3219 BUG(); 3220 } 3221 3222 btrfs_release_path(path); 3223 3224 BUG_ON(level == -1); 3225 3226 block = kmalloc(sizeof(*block), GFP_NOFS); 3227 if (!block) 3228 return -ENOMEM; 3229 3230 block->bytenr = extent_key->objectid; 3231 block->key.objectid = rc->extent_root->fs_info->nodesize; 3232 block->key.offset = generation; 3233 block->level = level; 3234 block->key_ready = 0; 3235 block->owner = owner; 3236 3237 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3238 if (rb_node) 3239 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3240 -EEXIST); 3241 3242 return 0; 3243 } 3244 3245 /* 3246 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3247 */ 3248 static int __add_tree_block(struct reloc_control *rc, 3249 u64 bytenr, u32 blocksize, 3250 struct rb_root *blocks) 3251 { 3252 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3253 struct btrfs_path *path; 3254 struct btrfs_key key; 3255 int ret; 3256 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3257 3258 if (tree_block_processed(bytenr, rc)) 3259 return 0; 3260 3261 if (rb_simple_search(blocks, bytenr)) 3262 return 0; 3263 3264 path = btrfs_alloc_path(); 3265 if (!path) 3266 return -ENOMEM; 3267 again: 3268 key.objectid = bytenr; 3269 if (skinny) { 3270 key.type = BTRFS_METADATA_ITEM_KEY; 3271 key.offset = (u64)-1; 3272 } else { 3273 key.type = BTRFS_EXTENT_ITEM_KEY; 3274 key.offset = blocksize; 3275 } 3276 3277 path->search_commit_root = 1; 3278 path->skip_locking = 1; 3279 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3280 if (ret < 0) 3281 goto out; 3282 3283 if (ret > 0 && skinny) { 3284 if (path->slots[0]) { 3285 path->slots[0]--; 3286 btrfs_item_key_to_cpu(path->nodes[0], &key, 3287 path->slots[0]); 3288 if (key.objectid == bytenr && 3289 (key.type == BTRFS_METADATA_ITEM_KEY || 3290 (key.type == BTRFS_EXTENT_ITEM_KEY && 3291 key.offset == blocksize))) 3292 ret = 0; 3293 } 3294 3295 if (ret) { 3296 skinny = false; 3297 btrfs_release_path(path); 3298 goto again; 3299 } 3300 } 3301 if (ret) { 3302 ASSERT(ret == 1); 3303 btrfs_print_leaf(path->nodes[0]); 3304 btrfs_err(fs_info, 3305 "tree block extent item (%llu) is not found in extent tree", 3306 bytenr); 3307 WARN_ON(1); 3308 ret = -EINVAL; 3309 goto out; 3310 } 3311 3312 ret = add_tree_block(rc, &key, path, blocks); 3313 out: 3314 btrfs_free_path(path); 3315 return ret; 3316 } 3317 3318 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3319 struct btrfs_block_group *block_group, 3320 struct inode *inode, 3321 u64 ino) 3322 { 3323 struct btrfs_root *root = fs_info->tree_root; 3324 struct btrfs_trans_handle *trans; 3325 int ret = 0; 3326 3327 if (inode) 3328 goto truncate; 3329 3330 inode = btrfs_iget(fs_info->sb, ino, root); 3331 if (IS_ERR(inode)) 3332 return -ENOENT; 3333 
3334 truncate: 3335 ret = btrfs_check_trunc_cache_free_space(fs_info, 3336 &fs_info->global_block_rsv); 3337 if (ret) 3338 goto out; 3339 3340 trans = btrfs_join_transaction(root); 3341 if (IS_ERR(trans)) { 3342 ret = PTR_ERR(trans); 3343 goto out; 3344 } 3345 3346 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3347 3348 btrfs_end_transaction(trans); 3349 btrfs_btree_balance_dirty(fs_info); 3350 out: 3351 iput(inode); 3352 return ret; 3353 } 3354 3355 /* 3356 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3357 * cache inode, to avoid free space cache data extent blocking data relocation. 3358 */ 3359 static int delete_v1_space_cache(struct extent_buffer *leaf, 3360 struct btrfs_block_group *block_group, 3361 u64 data_bytenr) 3362 { 3363 u64 space_cache_ino; 3364 struct btrfs_file_extent_item *ei; 3365 struct btrfs_key key; 3366 bool found = false; 3367 int i; 3368 int ret; 3369 3370 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3371 return 0; 3372 3373 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3374 u8 type; 3375 3376 btrfs_item_key_to_cpu(leaf, &key, i); 3377 if (key.type != BTRFS_EXTENT_DATA_KEY) 3378 continue; 3379 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3380 type = btrfs_file_extent_type(leaf, ei); 3381 3382 if ((type == BTRFS_FILE_EXTENT_REG || 3383 type == BTRFS_FILE_EXTENT_PREALLOC) && 3384 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3385 found = true; 3386 space_cache_ino = key.objectid; 3387 break; 3388 } 3389 } 3390 if (!found) 3391 return -ENOENT; 3392 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3393 space_cache_ino); 3394 return ret; 3395 } 3396 3397 /* 3398 * helper to find all tree blocks that reference a given data extent 3399 */ 3400 static noinline_for_stack 3401 int add_data_references(struct reloc_control *rc, 3402 struct btrfs_key *extent_key, 3403 struct btrfs_path *path, 3404 struct rb_root *blocks) 3405 { 3406 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3407 struct ulist *leaves = NULL; 3408 struct ulist_iterator leaf_uiter; 3409 struct ulist_node *ref_node = NULL; 3410 const u32 blocksize = fs_info->nodesize; 3411 int ret = 0; 3412 3413 btrfs_release_path(path); 3414 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, 3415 0, &leaves, NULL, true); 3416 if (ret < 0) 3417 return ret; 3418 3419 ULIST_ITER_INIT(&leaf_uiter); 3420 while ((ref_node = ulist_next(leaves, &leaf_uiter))) { 3421 struct extent_buffer *eb; 3422 3423 eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); 3424 if (IS_ERR(eb)) { 3425 ret = PTR_ERR(eb); 3426 break; 3427 } 3428 ret = delete_v1_space_cache(eb, rc->block_group, 3429 extent_key->objectid); 3430 free_extent_buffer(eb); 3431 if (ret < 0) 3432 break; 3433 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3434 if (ret < 0) 3435 break; 3436 } 3437 if (ret < 0) 3438 free_block_list(blocks); 3439 ulist_free(leaves); 3440 return ret; 3441 } 3442 3443 /* 3444 * helper to find next unprocessed extent 3445 */ 3446 static noinline_for_stack 3447 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3448 struct btrfs_key *extent_key) 3449 { 3450 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3451 struct btrfs_key key; 3452 struct extent_buffer *leaf; 3453 u64 start, end, last; 3454 int ret; 3455 3456 last = rc->block_group->start + rc->block_group->length; 3457 while (1) { 3458 cond_resched(); 3459 if (rc->search_start >= last) { 3460 ret = 1; 3461 break; 
3462 } 3463 3464 key.objectid = rc->search_start; 3465 key.type = BTRFS_EXTENT_ITEM_KEY; 3466 key.offset = 0; 3467 3468 path->search_commit_root = 1; 3469 path->skip_locking = 1; 3470 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3471 0, 0); 3472 if (ret < 0) 3473 break; 3474 next: 3475 leaf = path->nodes[0]; 3476 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3477 ret = btrfs_next_leaf(rc->extent_root, path); 3478 if (ret != 0) 3479 break; 3480 leaf = path->nodes[0]; 3481 } 3482 3483 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3484 if (key.objectid >= last) { 3485 ret = 1; 3486 break; 3487 } 3488 3489 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3490 key.type != BTRFS_METADATA_ITEM_KEY) { 3491 path->slots[0]++; 3492 goto next; 3493 } 3494 3495 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3496 key.objectid + key.offset <= rc->search_start) { 3497 path->slots[0]++; 3498 goto next; 3499 } 3500 3501 if (key.type == BTRFS_METADATA_ITEM_KEY && 3502 key.objectid + fs_info->nodesize <= 3503 rc->search_start) { 3504 path->slots[0]++; 3505 goto next; 3506 } 3507 3508 ret = find_first_extent_bit(&rc->processed_blocks, 3509 key.objectid, &start, &end, 3510 EXTENT_DIRTY, NULL); 3511 3512 if (ret == 0 && start <= key.objectid) { 3513 btrfs_release_path(path); 3514 rc->search_start = end + 1; 3515 } else { 3516 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3517 rc->search_start = key.objectid + key.offset; 3518 else 3519 rc->search_start = key.objectid + 3520 fs_info->nodesize; 3521 memcpy(extent_key, &key, sizeof(key)); 3522 return 0; 3523 } 3524 } 3525 btrfs_release_path(path); 3526 return ret; 3527 } 3528 3529 static void set_reloc_control(struct reloc_control *rc) 3530 { 3531 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3532 3533 mutex_lock(&fs_info->reloc_mutex); 3534 fs_info->reloc_ctl = rc; 3535 mutex_unlock(&fs_info->reloc_mutex); 3536 } 3537 3538 static void unset_reloc_control(struct reloc_control *rc) 3539 { 3540 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3541 3542 mutex_lock(&fs_info->reloc_mutex); 3543 fs_info->reloc_ctl = NULL; 3544 mutex_unlock(&fs_info->reloc_mutex); 3545 } 3546 3547 static noinline_for_stack 3548 int prepare_to_relocate(struct reloc_control *rc) 3549 { 3550 struct btrfs_trans_handle *trans; 3551 int ret; 3552 3553 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3554 BTRFS_BLOCK_RSV_TEMP); 3555 if (!rc->block_rsv) 3556 return -ENOMEM; 3557 3558 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3559 rc->search_start = rc->block_group->start; 3560 rc->extents_found = 0; 3561 rc->nodes_relocated = 0; 3562 rc->merging_rsv_size = 0; 3563 rc->reserved_bytes = 0; 3564 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3565 RELOCATION_RESERVED_NODES; 3566 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3567 rc->block_rsv, rc->block_rsv->size, 3568 BTRFS_RESERVE_FLUSH_ALL); 3569 if (ret) 3570 return ret; 3571 3572 rc->create_reloc_tree = 1; 3573 set_reloc_control(rc); 3574 3575 trans = btrfs_join_transaction(rc->extent_root); 3576 if (IS_ERR(trans)) { 3577 unset_reloc_control(rc); 3578 /* 3579 * extent tree is not a ref_cow tree and has no reloc_root to 3580 * cleanup. And callers are responsible to free the above 3581 * block rsv. 
3582 */ 3583 return PTR_ERR(trans); 3584 } 3585 3586 ret = btrfs_commit_transaction(trans); 3587 if (ret) 3588 unset_reloc_control(rc); 3589 3590 return ret; 3591 } 3592 3593 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3594 { 3595 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3596 struct rb_root blocks = RB_ROOT; 3597 struct btrfs_key key; 3598 struct btrfs_trans_handle *trans = NULL; 3599 struct btrfs_path *path; 3600 struct btrfs_extent_item *ei; 3601 u64 flags; 3602 int ret; 3603 int err = 0; 3604 int progress = 0; 3605 3606 path = btrfs_alloc_path(); 3607 if (!path) 3608 return -ENOMEM; 3609 path->reada = READA_FORWARD; 3610 3611 ret = prepare_to_relocate(rc); 3612 if (ret) { 3613 err = ret; 3614 goto out_free; 3615 } 3616 3617 while (1) { 3618 rc->reserved_bytes = 0; 3619 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3620 rc->block_rsv->size, 3621 BTRFS_RESERVE_FLUSH_ALL); 3622 if (ret) { 3623 err = ret; 3624 break; 3625 } 3626 progress++; 3627 trans = btrfs_start_transaction(rc->extent_root, 0); 3628 if (IS_ERR(trans)) { 3629 err = PTR_ERR(trans); 3630 trans = NULL; 3631 break; 3632 } 3633 restart: 3634 if (update_backref_cache(trans, &rc->backref_cache)) { 3635 btrfs_end_transaction(trans); 3636 trans = NULL; 3637 continue; 3638 } 3639 3640 ret = find_next_extent(rc, path, &key); 3641 if (ret < 0) 3642 err = ret; 3643 if (ret != 0) 3644 break; 3645 3646 rc->extents_found++; 3647 3648 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3649 struct btrfs_extent_item); 3650 flags = btrfs_extent_flags(path->nodes[0], ei); 3651 3652 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3653 ret = add_tree_block(rc, &key, path, &blocks); 3654 } else if (rc->stage == UPDATE_DATA_PTRS && 3655 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3656 ret = add_data_references(rc, &key, path, &blocks); 3657 } else { 3658 btrfs_release_path(path); 3659 ret = 0; 3660 } 3661 if (ret < 0) { 3662 err = ret; 3663 break; 3664 } 3665 3666 if (!RB_EMPTY_ROOT(&blocks)) { 3667 ret = relocate_tree_blocks(trans, rc, &blocks); 3668 if (ret < 0) { 3669 if (ret != -EAGAIN) { 3670 err = ret; 3671 break; 3672 } 3673 rc->extents_found--; 3674 rc->search_start = key.objectid; 3675 } 3676 } 3677 3678 btrfs_end_transaction_throttle(trans); 3679 btrfs_btree_balance_dirty(fs_info); 3680 trans = NULL; 3681 3682 if (rc->stage == MOVE_DATA_EXTENTS && 3683 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3684 rc->found_file_extent = 1; 3685 ret = relocate_data_extent(rc->data_inode, 3686 &key, &rc->cluster); 3687 if (ret < 0) { 3688 err = ret; 3689 break; 3690 } 3691 } 3692 if (btrfs_should_cancel_balance(fs_info)) { 3693 err = -ECANCELED; 3694 break; 3695 } 3696 } 3697 if (trans && progress && err == -ENOSPC) { 3698 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3699 if (ret == 1) { 3700 err = 0; 3701 progress = 0; 3702 goto restart; 3703 } 3704 } 3705 3706 btrfs_release_path(path); 3707 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 3708 3709 if (trans) { 3710 btrfs_end_transaction_throttle(trans); 3711 btrfs_btree_balance_dirty(fs_info); 3712 } 3713 3714 if (!err) { 3715 ret = relocate_file_extent_cluster(rc->data_inode, 3716 &rc->cluster); 3717 if (ret < 0) 3718 err = ret; 3719 } 3720 3721 rc->create_reloc_tree = 0; 3722 set_reloc_control(rc); 3723 3724 btrfs_backref_release_cache(&rc->backref_cache); 3725 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3726 3727 /* 3728 * Even in the case when the relocation is cancelled, we should all go 3729 * 
through prepare_to_merge() and merge_reloc_roots(). 3730 * 3731 * For error (including cancelled balance), prepare_to_merge() will 3732 * mark all reloc trees orphan, then queue them for cleanup in 3733 * merge_reloc_roots() 3734 */ 3735 err = prepare_to_merge(rc, err); 3736 3737 merge_reloc_roots(rc); 3738 3739 rc->merge_reloc_tree = 0; 3740 unset_reloc_control(rc); 3741 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3742 3743 /* get rid of pinned extents */ 3744 trans = btrfs_join_transaction(rc->extent_root); 3745 if (IS_ERR(trans)) { 3746 err = PTR_ERR(trans); 3747 goto out_free; 3748 } 3749 ret = btrfs_commit_transaction(trans); 3750 if (ret && !err) 3751 err = ret; 3752 out_free: 3753 ret = clean_dirty_subvols(rc); 3754 if (ret < 0 && !err) 3755 err = ret; 3756 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3757 btrfs_free_path(path); 3758 return err; 3759 } 3760 3761 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3762 struct btrfs_root *root, u64 objectid) 3763 { 3764 struct btrfs_path *path; 3765 struct btrfs_inode_item *item; 3766 struct extent_buffer *leaf; 3767 int ret; 3768 3769 path = btrfs_alloc_path(); 3770 if (!path) 3771 return -ENOMEM; 3772 3773 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3774 if (ret) 3775 goto out; 3776 3777 leaf = path->nodes[0]; 3778 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3779 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3780 btrfs_set_inode_generation(leaf, item, 1); 3781 btrfs_set_inode_size(leaf, item, 0); 3782 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3783 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3784 BTRFS_INODE_PREALLOC); 3785 btrfs_mark_buffer_dirty(leaf); 3786 out: 3787 btrfs_free_path(path); 3788 return ret; 3789 } 3790 3791 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3792 struct btrfs_root *root, u64 objectid) 3793 { 3794 struct btrfs_path *path; 3795 struct btrfs_key key; 3796 int ret = 0; 3797 3798 path = btrfs_alloc_path(); 3799 if (!path) { 3800 ret = -ENOMEM; 3801 goto out; 3802 } 3803 3804 key.objectid = objectid; 3805 key.type = BTRFS_INODE_ITEM_KEY; 3806 key.offset = 0; 3807 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3808 if (ret) { 3809 if (ret > 0) 3810 ret = -ENOENT; 3811 goto out; 3812 } 3813 ret = btrfs_del_item(trans, root, path); 3814 out: 3815 if (ret) 3816 btrfs_abort_transaction(trans, ret); 3817 btrfs_free_path(path); 3818 } 3819 3820 /* 3821 * helper to create inode for data relocation. 
3822 * the inode is in data relocation tree and its link count is 0 3823 */ 3824 static noinline_for_stack 3825 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3826 struct btrfs_block_group *group) 3827 { 3828 struct inode *inode = NULL; 3829 struct btrfs_trans_handle *trans; 3830 struct btrfs_root *root; 3831 u64 objectid; 3832 int err = 0; 3833 3834 root = btrfs_grab_root(fs_info->data_reloc_root); 3835 trans = btrfs_start_transaction(root, 6); 3836 if (IS_ERR(trans)) { 3837 btrfs_put_root(root); 3838 return ERR_CAST(trans); 3839 } 3840 3841 err = btrfs_get_free_objectid(root, &objectid); 3842 if (err) 3843 goto out; 3844 3845 err = __insert_orphan_inode(trans, root, objectid); 3846 if (err) 3847 goto out; 3848 3849 inode = btrfs_iget(fs_info->sb, objectid, root); 3850 if (IS_ERR(inode)) { 3851 delete_orphan_inode(trans, root, objectid); 3852 err = PTR_ERR(inode); 3853 inode = NULL; 3854 goto out; 3855 } 3856 BTRFS_I(inode)->index_cnt = group->start; 3857 3858 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 3859 out: 3860 btrfs_put_root(root); 3861 btrfs_end_transaction(trans); 3862 btrfs_btree_balance_dirty(fs_info); 3863 if (err) { 3864 iput(inode); 3865 inode = ERR_PTR(err); 3866 } 3867 return inode; 3868 } 3869 3870 /* 3871 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3872 * has been requested meanwhile and don't start in that case. 3873 * 3874 * Return: 3875 * 0 success 3876 * -EINPROGRESS operation is already in progress, that's probably a bug 3877 * -ECANCELED cancellation request was set before the operation started 3878 */ 3879 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3880 { 3881 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3882 /* This should not happen */ 3883 btrfs_err(fs_info, "reloc already running, cannot start"); 3884 return -EINPROGRESS; 3885 } 3886 3887 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3888 btrfs_info(fs_info, "chunk relocation canceled on start"); 3889 /* 3890 * On cancel, clear all requests but let the caller mark 3891 * the end after cleanup operations. 3892 */ 3893 atomic_set(&fs_info->reloc_cancel_req, 0); 3894 return -ECANCELED; 3895 } 3896 return 0; 3897 } 3898 3899 /* 3900 * Mark end of chunk relocation that is cancellable and wake any waiters. 
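* * Pairs with reloc_chunk_start(): btrfs_relocate_block_group() calls that before doing any work and calls this one on its way out.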
3901 */ 3902 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3903 { 3904 /* Requested after start, clear bit first so any waiters can continue */ 3905 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3906 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3907 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3908 atomic_set(&fs_info->reloc_cancel_req, 0); 3909 } 3910 3911 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3912 { 3913 struct reloc_control *rc; 3914 3915 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3916 if (!rc) 3917 return NULL; 3918 3919 INIT_LIST_HEAD(&rc->reloc_roots); 3920 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3921 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); 3922 mapping_tree_init(&rc->reloc_root_tree); 3923 extent_io_tree_init(fs_info, &rc->processed_blocks, 3924 IO_TREE_RELOC_BLOCKS, NULL); 3925 return rc; 3926 } 3927 3928 static void free_reloc_control(struct reloc_control *rc) 3929 { 3930 struct mapping_node *node, *tmp; 3931 3932 free_reloc_roots(&rc->reloc_roots); 3933 rbtree_postorder_for_each_entry_safe(node, tmp, 3934 &rc->reloc_root_tree.rb_root, rb_node) 3935 kfree(node); 3936 3937 kfree(rc); 3938 } 3939 3940 /* 3941 * Print the block group being relocated 3942 */ 3943 static void describe_relocation(struct btrfs_fs_info *fs_info, 3944 struct btrfs_block_group *block_group) 3945 { 3946 char buf[128] = {'\0'}; 3947 3948 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3949 3950 btrfs_info(fs_info, 3951 "relocating block group %llu flags %s", 3952 block_group->start, buf); 3953 } 3954 3955 static const char *stage_to_string(int stage) 3956 { 3957 if (stage == MOVE_DATA_EXTENTS) 3958 return "move data extents"; 3959 if (stage == UPDATE_DATA_PTRS) 3960 return "update data pointers"; 3961 return "unknown"; 3962 } 3963 3964 /* 3965 * function to relocate all extents in a block group. 3966 */ 3967 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 3968 { 3969 struct btrfs_block_group *bg; 3970 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3971 struct reloc_control *rc; 3972 struct inode *inode; 3973 struct btrfs_path *path; 3974 int ret; 3975 int rw = 0; 3976 int err = 0; 3977 3978 /* 3979 * This only gets set if we had a half-deleted snapshot on mount. We 3980 * cannot allow relocation to start while we're still trying to clean up 3981 * these pending deletions. 3982 */ 3983 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3984 if (ret) 3985 return ret; 3986 3987 /* We may have been woken up by close_ctree, so bail if we're closing. */ 3988 if (btrfs_fs_closing(fs_info)) 3989 return -EINTR; 3990 3991 bg = btrfs_lookup_block_group(fs_info, group_start); 3992 if (!bg) 3993 return -ENOENT; 3994 3995 /* 3996 * Relocation of a data block group creates ordered extents. Without 3997 * sb_start_write(), we can freeze the filesystem while unfinished 3998 * ordered extents are left. Such ordered extents can cause a deadlock 3999 * e.g. when syncfs() is waiting for their completion but they can't 4000 * finish because they block when joining a transaction, due to the 4001 * fact that the freeze locks are being held in write mode. 
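* * The shape of the deadlock, roughly: freeze_super() holds the freeze locks in write mode, an unfinished ordered extent can't complete because joining a transaction blocks on the frozen sb, and syncfs() in turn waits on the ordered extent. Hence the assertion below that the caller has already done sb_start_write() for data block groups.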

/*
 * Relocate all extents in a block group.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	/*
	 * This only gets set if we had a half-deleted snapshot on mount. We
	 * cannot allow relocation to start while we're still trying to clean
	 * up these pending deletions.
	 */
	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
	if (ret)
		return ret;

	/* We may have been woken up by close_ctree, so bail if we're closing. */
	if (btrfs_fs_closing(fs_info))
		return -EINTR;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	/*
	 * Relocation of a data block group creates ordered extents. Without
	 * sb_start_write(), we can freeze the filesystem while unfinished
	 * ordered extents are left. Such ordered extents can cause a deadlock
	 * e.g. when syncfs() is waiting for their completion but they can't
	 * finish because they block when joining a transaction, due to the
	 * fact that the freeze locks are being held in write mode.
	 */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
		ASSERT(sb_write_started(fs_info->sb));

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_put_bg;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(fs_info, rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX,
				 rc->block_group->start,
				 rc->block_group->length);

	ret = btrfs_zone_finish(rc->block_group);
	WARN_ON(ret && ret != -EAGAIN);
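
	/*
	 * Main relocation loop (summary of the code below, not new
	 * behaviour): rerun relocate_block_group() until a pass finds no
	 * more extents. For a data block group this usually means at least
	 * two passes, switching from MOVE_DATA_EXTENTS to UPDATE_DATA_PTRS
	 * once a pass has moved file extents.
	 */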

	while (1) {
		int finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents. If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block. Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found, stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}

/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with corresponding fs trees.
 * This is important for keeping the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_end;
	}

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		err = __add_reloc_root(reloc_root);
		ASSERT(err != -EEXIST);
		if (err) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_clean;
	}
	err = btrfs_commit_transaction(trans);
out_clean:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* cleanup orphan inode in data relocation tree */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		err = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return err;
}
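
/*
 * Note: the recovery pass above runs before the filesystem goes read-write;
 * the call site lives in disk-io.c (btrfs_start_pre_rw_mount() at the time
 * of writing, an assumption that may shift as the mount code changes).
 */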

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the existing checksums properly handles the nodatasum extents.
 * It also saves the CPU time that would be spent recalculating the checksums.
 */
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *csum_root;
	struct btrfs_ordered_sum *sums;
	struct btrfs_ordered_extent *ordered;
	int ret;
	u64 disk_bytenr;
	u64 new_bytenr;
	LIST_HEAD(list);

	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);

	disk_bytenr = file_pos + inode->index_cnt;
	csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
				       disk_bytenr + len - 1, &list, 0, false);
	if (ret)
		goto out;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * E.g. (illustrative numbers): if this write covered only the
		 * range starting 128K into the old location at disk_bytenr, a
		 * csum found at disk_bytenr + 128K must be attached at
		 * ordered->disk_bytenr + 128K in the new location.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
		sums->bytenr = new_bytenr;

		btrfs_add_ordered_sum(ordered, sums);
	}
out:
	btrfs_put_ordered_extent(ordered);
	return ret;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		btrfs_backref_drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}
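
/*
 * Snapshot-time call flow for the two helpers below (illustrative sketch,
 * assuming the pending snapshot code in transaction.c): the snapshot creation
 * path calls btrfs_reloc_pre_snapshot() to enlarge the reservation, creates
 * the snapshot, then calls btrfs_reloc_post_snapshot() to migrate the
 * reserved space and attach a reloc root to the new snapshot root.
 */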

/*
 * Called before creating a snapshot. It calculates the metadata reservation
 * required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees. The space used by
	 * merging a reloc tree is twice the size of the relocated tree nodes
	 * in the worst case: half for COWing the reloc tree, half for COWing
	 * the fs tree. The space used by COWing the reloc tree will be freed
	 * after the tree is dropped. If we create a snapshot, COWing the fs
	 * tree may use more space than it frees, so we need to reserve extra
	 * space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * Called after the snapshot is created. It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root,
				       new_root->root_key.objectid);
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}