1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2009 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/error-injection.h> 13 #include "ctree.h" 14 #include "disk-io.h" 15 #include "transaction.h" 16 #include "volumes.h" 17 #include "locking.h" 18 #include "btrfs_inode.h" 19 #include "async-thread.h" 20 #include "free-space-cache.h" 21 #include "qgroup.h" 22 #include "print-tree.h" 23 #include "delalloc-space.h" 24 #include "block-group.h" 25 #include "backref.h" 26 #include "misc.h" 27 #include "subpage.h" 28 #include "zoned.h" 29 30 /* 31 * Relocation overview 32 * 33 * [What does relocation do] 34 * 35 * The objective of relocation is to relocate all extents of the target block 36 * group to other block groups. 37 * This is utilized by resize (shrink only), profile converting, compacting 38 * space, or balance routine to spread chunks over devices. 39 * 40 * Before | After 41 * ------------------------------------------------------------------ 42 * BG A: 10 data extents | BG A: deleted 43 * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) 44 * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) 45 * 46 * [How does relocation work] 47 * 48 * 1. Mark the target block group read-only 49 * New extents won't be allocated from the target block group. 50 * 51 * 2.1 Record each extent in the target block group 52 * To build a proper map of extents to be relocated. 53 * 54 * 2.2 Build data reloc tree and reloc trees 55 * Data reloc tree will contain an inode, recording all newly relocated 56 * data extents. 57 * There will be only one data reloc tree for one data block group. 58 * 59 * Reloc tree will be a special snapshot of its source tree, containing 60 * relocated tree blocks. 61 * Each tree referring to a tree block in target block group will get its 62 * reloc tree built. 63 * 64 * 2.3 Swap source tree with its corresponding reloc tree 65 * Each involved tree only refers to new extents after swap. 66 * 67 * 3. Cleanup reloc trees and data reloc tree. 68 * As old extents in the target block group are still referenced by reloc 69 * trees, we need to clean them up before really freeing the target block 70 * group. 71 * 72 * The main complexity is in steps 2.2 and 2.3. 73 * 74 * The entry point of relocation is relocate_block_group() function. 75 */ 76 77 #define RELOCATION_RESERVED_NODES 256 78 /* 79 * map address of tree root to tree 80 */ 81 struct mapping_node { 82 struct { 83 struct rb_node rb_node; 84 u64 bytenr; 85 }; /* Use rb_simle_node for search/insert */ 86 void *data; 87 }; 88 89 struct mapping_tree { 90 struct rb_root rb_root; 91 spinlock_t lock; 92 }; 93 94 /* 95 * present a tree block to process 96 */ 97 struct tree_block { 98 struct { 99 struct rb_node rb_node; 100 u64 bytenr; 101 }; /* Use rb_simple_node for search/insert */ 102 u64 owner; 103 struct btrfs_key key; 104 unsigned int level:8; 105 unsigned int key_ready:1; 106 }; 107 108 #define MAX_EXTENTS 128 109 110 struct file_extent_cluster { 111 u64 start; 112 u64 end; 113 u64 boundary[MAX_EXTENTS]; 114 unsigned int nr; 115 }; 116 117 struct reloc_control { 118 /* block group to relocate */ 119 struct btrfs_block_group *block_group; 120 /* extent tree */ 121 struct btrfs_root *extent_root; 122 /* inode for moving data */ 123 struct inode *data_inode; 124 125 struct btrfs_block_rsv *block_rsv; 126 127 struct btrfs_backref_cache backref_cache; 128 129 struct file_extent_cluster cluster; 130 /* tree blocks have been processed */ 131 struct extent_io_tree processed_blocks; 132 /* map start of tree root to corresponding reloc tree */ 133 struct mapping_tree reloc_root_tree; 134 /* list of reloc trees */ 135 struct list_head reloc_roots; 136 /* list of subvolume trees that get relocated */ 137 struct list_head dirty_subvol_roots; 138 /* size of metadata reservation for merging reloc trees */ 139 u64 merging_rsv_size; 140 /* size of relocated tree nodes */ 141 u64 nodes_relocated; 142 /* reserved size for block group relocation*/ 143 u64 reserved_bytes; 144 145 u64 search_start; 146 u64 extents_found; 147 148 unsigned int stage:8; 149 unsigned int create_reloc_tree:1; 150 unsigned int merge_reloc_tree:1; 151 unsigned int found_file_extent:1; 152 }; 153 154 /* stages of data relocation */ 155 #define MOVE_DATA_EXTENTS 0 156 #define UPDATE_DATA_PTRS 1 157 158 static void mark_block_processed(struct reloc_control *rc, 159 struct btrfs_backref_node *node) 160 { 161 u32 blocksize; 162 163 if (node->level == 0 || 164 in_range(node->bytenr, rc->block_group->start, 165 rc->block_group->length)) { 166 blocksize = rc->extent_root->fs_info->nodesize; 167 set_extent_bits(&rc->processed_blocks, node->bytenr, 168 node->bytenr + blocksize - 1, EXTENT_DIRTY); 169 } 170 node->processed = 1; 171 } 172 173 174 static void mapping_tree_init(struct mapping_tree *tree) 175 { 176 tree->rb_root = RB_ROOT; 177 spin_lock_init(&tree->lock); 178 } 179 180 /* 181 * walk up backref nodes until reach node presents tree root 182 */ 183 static struct btrfs_backref_node *walk_up_backref( 184 struct btrfs_backref_node *node, 185 struct btrfs_backref_edge *edges[], int *index) 186 { 187 struct btrfs_backref_edge *edge; 188 int idx = *index; 189 190 while (!list_empty(&node->upper)) { 191 edge = list_entry(node->upper.next, 192 struct btrfs_backref_edge, list[LOWER]); 193 edges[idx++] = edge; 194 node = edge->node[UPPER]; 195 } 196 BUG_ON(node->detached); 197 *index = idx; 198 return node; 199 } 200 201 /* 202 * walk down backref nodes to find start of next reference path 203 */ 204 static struct btrfs_backref_node *walk_down_backref( 205 struct btrfs_backref_edge *edges[], int *index) 206 { 207 struct btrfs_backref_edge *edge; 208 struct btrfs_backref_node *lower; 209 int idx = *index; 210 211 while (idx > 0) { 212 edge = edges[idx - 1]; 213 lower = edge->node[LOWER]; 214 if (list_is_last(&edge->list[LOWER], &lower->upper)) { 215 idx--; 216 continue; 217 } 218 edge = list_entry(edge->list[LOWER].next, 219 struct btrfs_backref_edge, list[LOWER]); 220 edges[idx - 1] = edge; 221 *index = idx; 222 return edge->node[UPPER]; 223 } 224 *index = 0; 225 return NULL; 226 } 227 228 static void update_backref_node(struct btrfs_backref_cache *cache, 229 struct btrfs_backref_node *node, u64 bytenr) 230 { 231 struct rb_node *rb_node; 232 rb_erase(&node->rb_node, &cache->rb_root); 233 node->bytenr = bytenr; 234 rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node); 235 if (rb_node) 236 btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST); 237 } 238 239 /* 240 * update backref cache after a transaction commit 241 */ 242 static int update_backref_cache(struct btrfs_trans_handle *trans, 243 struct btrfs_backref_cache *cache) 244 { 245 struct btrfs_backref_node *node; 246 int level = 0; 247 248 if (cache->last_trans == 0) { 249 cache->last_trans = trans->transid; 250 return 0; 251 } 252 253 if (cache->last_trans == trans->transid) 254 return 0; 255 256 /* 257 * detached nodes are used to avoid unnecessary backref 258 * lookup. transaction commit changes the extent tree. 259 * so the detached nodes are no longer useful. 260 */ 261 while (!list_empty(&cache->detached)) { 262 node = list_entry(cache->detached.next, 263 struct btrfs_backref_node, list); 264 btrfs_backref_cleanup_node(cache, node); 265 } 266 267 while (!list_empty(&cache->changed)) { 268 node = list_entry(cache->changed.next, 269 struct btrfs_backref_node, list); 270 list_del_init(&node->list); 271 BUG_ON(node->pending); 272 update_backref_node(cache, node, node->new_bytenr); 273 } 274 275 /* 276 * some nodes can be left in the pending list if there were 277 * errors during processing the pending nodes. 278 */ 279 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 280 list_for_each_entry(node, &cache->pending[level], list) { 281 BUG_ON(!node->pending); 282 if (node->bytenr == node->new_bytenr) 283 continue; 284 update_backref_node(cache, node, node->new_bytenr); 285 } 286 } 287 288 cache->last_trans = 0; 289 return 1; 290 } 291 292 static bool reloc_root_is_dead(struct btrfs_root *root) 293 { 294 /* 295 * Pair with set_bit/clear_bit in clean_dirty_subvols and 296 * btrfs_update_reloc_root. We need to see the updated bit before 297 * trying to access reloc_root 298 */ 299 smp_rmb(); 300 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) 301 return true; 302 return false; 303 } 304 305 /* 306 * Check if this subvolume tree has valid reloc tree. 307 * 308 * Reloc tree after swap is considered dead, thus not considered as valid. 309 * This is enough for most callers, as they don't distinguish dead reloc root 310 * from no reloc root. But btrfs_should_ignore_reloc_root() below is a 311 * special case. 312 */ 313 static bool have_reloc_root(struct btrfs_root *root) 314 { 315 if (reloc_root_is_dead(root)) 316 return false; 317 if (!root->reloc_root) 318 return false; 319 return true; 320 } 321 322 int btrfs_should_ignore_reloc_root(struct btrfs_root *root) 323 { 324 struct btrfs_root *reloc_root; 325 326 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 327 return 0; 328 329 /* This root has been merged with its reloc tree, we can ignore it */ 330 if (reloc_root_is_dead(root)) 331 return 1; 332 333 reloc_root = root->reloc_root; 334 if (!reloc_root) 335 return 0; 336 337 if (btrfs_header_generation(reloc_root->commit_root) == 338 root->fs_info->running_transaction->transid) 339 return 0; 340 /* 341 * if there is reloc tree and it was created in previous 342 * transaction backref lookup can find the reloc tree, 343 * so backref node for the fs tree root is useless for 344 * relocation. 345 */ 346 return 1; 347 } 348 349 /* 350 * find reloc tree by address of tree root 351 */ 352 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) 353 { 354 struct reloc_control *rc = fs_info->reloc_ctl; 355 struct rb_node *rb_node; 356 struct mapping_node *node; 357 struct btrfs_root *root = NULL; 358 359 ASSERT(rc); 360 spin_lock(&rc->reloc_root_tree.lock); 361 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); 362 if (rb_node) { 363 node = rb_entry(rb_node, struct mapping_node, rb_node); 364 root = (struct btrfs_root *)node->data; 365 } 366 spin_unlock(&rc->reloc_root_tree.lock); 367 return btrfs_grab_root(root); 368 } 369 370 /* 371 * For useless nodes, do two major clean ups: 372 * 373 * - Cleanup the children edges and nodes 374 * If child node is also orphan (no parent) during cleanup, then the child 375 * node will also be cleaned up. 376 * 377 * - Freeing up leaves (level 0), keeps nodes detached 378 * For nodes, the node is still cached as "detached" 379 * 380 * Return false if @node is not in the @useless_nodes list. 381 * Return true if @node is in the @useless_nodes list. 382 */ 383 static bool handle_useless_nodes(struct reloc_control *rc, 384 struct btrfs_backref_node *node) 385 { 386 struct btrfs_backref_cache *cache = &rc->backref_cache; 387 struct list_head *useless_node = &cache->useless_node; 388 bool ret = false; 389 390 while (!list_empty(useless_node)) { 391 struct btrfs_backref_node *cur; 392 393 cur = list_first_entry(useless_node, struct btrfs_backref_node, 394 list); 395 list_del_init(&cur->list); 396 397 /* Only tree root nodes can be added to @useless_nodes */ 398 ASSERT(list_empty(&cur->upper)); 399 400 if (cur == node) 401 ret = true; 402 403 /* The node is the lowest node */ 404 if (cur->lowest) { 405 list_del_init(&cur->lower); 406 cur->lowest = 0; 407 } 408 409 /* Cleanup the lower edges */ 410 while (!list_empty(&cur->lower)) { 411 struct btrfs_backref_edge *edge; 412 struct btrfs_backref_node *lower; 413 414 edge = list_entry(cur->lower.next, 415 struct btrfs_backref_edge, list[UPPER]); 416 list_del(&edge->list[UPPER]); 417 list_del(&edge->list[LOWER]); 418 lower = edge->node[LOWER]; 419 btrfs_backref_free_edge(cache, edge); 420 421 /* Child node is also orphan, queue for cleanup */ 422 if (list_empty(&lower->upper)) 423 list_add(&lower->list, useless_node); 424 } 425 /* Mark this block processed for relocation */ 426 mark_block_processed(rc, cur); 427 428 /* 429 * Backref nodes for tree leaves are deleted from the cache. 430 * Backref nodes for upper level tree blocks are left in the 431 * cache to avoid unnecessary backref lookup. 432 */ 433 if (cur->level > 0) { 434 list_add(&cur->list, &cache->detached); 435 cur->detached = 1; 436 } else { 437 rb_erase(&cur->rb_node, &cache->rb_root); 438 btrfs_backref_free_node(cache, cur); 439 } 440 } 441 return ret; 442 } 443 444 /* 445 * Build backref tree for a given tree block. Root of the backref tree 446 * corresponds the tree block, leaves of the backref tree correspond roots of 447 * b-trees that reference the tree block. 448 * 449 * The basic idea of this function is check backrefs of a given block to find 450 * upper level blocks that reference the block, and then check backrefs of 451 * these upper level blocks recursively. The recursion stops when tree root is 452 * reached or backrefs for the block is cached. 453 * 454 * NOTE: if we find that backrefs for a block are cached, we know backrefs for 455 * all upper level blocks that directly/indirectly reference the block are also 456 * cached. 457 */ 458 static noinline_for_stack struct btrfs_backref_node *build_backref_tree( 459 struct reloc_control *rc, struct btrfs_key *node_key, 460 int level, u64 bytenr) 461 { 462 struct btrfs_backref_iter *iter; 463 struct btrfs_backref_cache *cache = &rc->backref_cache; 464 /* For searching parent of TREE_BLOCK_REF */ 465 struct btrfs_path *path; 466 struct btrfs_backref_node *cur; 467 struct btrfs_backref_node *node = NULL; 468 struct btrfs_backref_edge *edge; 469 int ret; 470 int err = 0; 471 472 iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); 473 if (!iter) 474 return ERR_PTR(-ENOMEM); 475 path = btrfs_alloc_path(); 476 if (!path) { 477 err = -ENOMEM; 478 goto out; 479 } 480 481 node = btrfs_backref_alloc_node(cache, bytenr, level); 482 if (!node) { 483 err = -ENOMEM; 484 goto out; 485 } 486 487 node->lowest = 1; 488 cur = node; 489 490 /* Breadth-first search to build backref cache */ 491 do { 492 ret = btrfs_backref_add_tree_node(cache, path, iter, node_key, 493 cur); 494 if (ret < 0) { 495 err = ret; 496 goto out; 497 } 498 edge = list_first_entry_or_null(&cache->pending_edge, 499 struct btrfs_backref_edge, list[UPPER]); 500 /* 501 * The pending list isn't empty, take the first block to 502 * process 503 */ 504 if (edge) { 505 list_del_init(&edge->list[UPPER]); 506 cur = edge->node[UPPER]; 507 } 508 } while (edge); 509 510 /* Finish the upper linkage of newly added edges/nodes */ 511 ret = btrfs_backref_finish_upper_links(cache, node); 512 if (ret < 0) { 513 err = ret; 514 goto out; 515 } 516 517 if (handle_useless_nodes(rc, node)) 518 node = NULL; 519 out: 520 btrfs_backref_iter_free(iter); 521 btrfs_free_path(path); 522 if (err) { 523 btrfs_backref_error_cleanup(cache, node); 524 return ERR_PTR(err); 525 } 526 ASSERT(!node || !node->detached); 527 ASSERT(list_empty(&cache->useless_node) && 528 list_empty(&cache->pending_edge)); 529 return node; 530 } 531 532 /* 533 * helper to add backref node for the newly created snapshot. 534 * the backref node is created by cloning backref node that 535 * corresponds to root of source tree 536 */ 537 static int clone_backref_node(struct btrfs_trans_handle *trans, 538 struct reloc_control *rc, 539 struct btrfs_root *src, 540 struct btrfs_root *dest) 541 { 542 struct btrfs_root *reloc_root = src->reloc_root; 543 struct btrfs_backref_cache *cache = &rc->backref_cache; 544 struct btrfs_backref_node *node = NULL; 545 struct btrfs_backref_node *new_node; 546 struct btrfs_backref_edge *edge; 547 struct btrfs_backref_edge *new_edge; 548 struct rb_node *rb_node; 549 550 if (cache->last_trans > 0) 551 update_backref_cache(trans, cache); 552 553 rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); 554 if (rb_node) { 555 node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); 556 if (node->detached) 557 node = NULL; 558 else 559 BUG_ON(node->new_bytenr != reloc_root->node->start); 560 } 561 562 if (!node) { 563 rb_node = rb_simple_search(&cache->rb_root, 564 reloc_root->commit_root->start); 565 if (rb_node) { 566 node = rb_entry(rb_node, struct btrfs_backref_node, 567 rb_node); 568 BUG_ON(node->detached); 569 } 570 } 571 572 if (!node) 573 return 0; 574 575 new_node = btrfs_backref_alloc_node(cache, dest->node->start, 576 node->level); 577 if (!new_node) 578 return -ENOMEM; 579 580 new_node->lowest = node->lowest; 581 new_node->checked = 1; 582 new_node->root = btrfs_grab_root(dest); 583 ASSERT(new_node->root); 584 585 if (!node->lowest) { 586 list_for_each_entry(edge, &node->lower, list[UPPER]) { 587 new_edge = btrfs_backref_alloc_edge(cache); 588 if (!new_edge) 589 goto fail; 590 591 btrfs_backref_link_edge(new_edge, edge->node[LOWER], 592 new_node, LINK_UPPER); 593 } 594 } else { 595 list_add_tail(&new_node->lower, &cache->leaves); 596 } 597 598 rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, 599 &new_node->rb_node); 600 if (rb_node) 601 btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); 602 603 if (!new_node->lowest) { 604 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { 605 list_add_tail(&new_edge->list[LOWER], 606 &new_edge->node[LOWER]->upper); 607 } 608 } 609 return 0; 610 fail: 611 while (!list_empty(&new_node->lower)) { 612 new_edge = list_entry(new_node->lower.next, 613 struct btrfs_backref_edge, list[UPPER]); 614 list_del(&new_edge->list[UPPER]); 615 btrfs_backref_free_edge(cache, new_edge); 616 } 617 btrfs_backref_free_node(cache, new_node); 618 return -ENOMEM; 619 } 620 621 /* 622 * helper to add 'address of tree root -> reloc tree' mapping 623 */ 624 static int __must_check __add_reloc_root(struct btrfs_root *root) 625 { 626 struct btrfs_fs_info *fs_info = root->fs_info; 627 struct rb_node *rb_node; 628 struct mapping_node *node; 629 struct reloc_control *rc = fs_info->reloc_ctl; 630 631 node = kmalloc(sizeof(*node), GFP_NOFS); 632 if (!node) 633 return -ENOMEM; 634 635 node->bytenr = root->commit_root->start; 636 node->data = root; 637 638 spin_lock(&rc->reloc_root_tree.lock); 639 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, 640 node->bytenr, &node->rb_node); 641 spin_unlock(&rc->reloc_root_tree.lock); 642 if (rb_node) { 643 btrfs_err(fs_info, 644 "Duplicate root found for start=%llu while inserting into relocation tree", 645 node->bytenr); 646 return -EEXIST; 647 } 648 649 list_add_tail(&root->root_list, &rc->reloc_roots); 650 return 0; 651 } 652 653 /* 654 * helper to delete the 'address of tree root -> reloc tree' 655 * mapping 656 */ 657 static void __del_reloc_root(struct btrfs_root *root) 658 { 659 struct btrfs_fs_info *fs_info = root->fs_info; 660 struct rb_node *rb_node; 661 struct mapping_node *node = NULL; 662 struct reloc_control *rc = fs_info->reloc_ctl; 663 bool put_ref = false; 664 665 if (rc && root->node) { 666 spin_lock(&rc->reloc_root_tree.lock); 667 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, 668 root->commit_root->start); 669 if (rb_node) { 670 node = rb_entry(rb_node, struct mapping_node, rb_node); 671 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 672 RB_CLEAR_NODE(&node->rb_node); 673 } 674 spin_unlock(&rc->reloc_root_tree.lock); 675 ASSERT(!node || (struct btrfs_root *)node->data == root); 676 } 677 678 /* 679 * We only put the reloc root here if it's on the list. There's a lot 680 * of places where the pattern is to splice the rc->reloc_roots, process 681 * the reloc roots, and then add the reloc root back onto 682 * rc->reloc_roots. If we call __del_reloc_root while it's off of the 683 * list we don't want the reference being dropped, because the guy 684 * messing with the list is in charge of the reference. 685 */ 686 spin_lock(&fs_info->trans_lock); 687 if (!list_empty(&root->root_list)) { 688 put_ref = true; 689 list_del_init(&root->root_list); 690 } 691 spin_unlock(&fs_info->trans_lock); 692 if (put_ref) 693 btrfs_put_root(root); 694 kfree(node); 695 } 696 697 /* 698 * helper to update the 'address of tree root -> reloc tree' 699 * mapping 700 */ 701 static int __update_reloc_root(struct btrfs_root *root) 702 { 703 struct btrfs_fs_info *fs_info = root->fs_info; 704 struct rb_node *rb_node; 705 struct mapping_node *node = NULL; 706 struct reloc_control *rc = fs_info->reloc_ctl; 707 708 spin_lock(&rc->reloc_root_tree.lock); 709 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, 710 root->commit_root->start); 711 if (rb_node) { 712 node = rb_entry(rb_node, struct mapping_node, rb_node); 713 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 714 } 715 spin_unlock(&rc->reloc_root_tree.lock); 716 717 if (!node) 718 return 0; 719 BUG_ON((struct btrfs_root *)node->data != root); 720 721 spin_lock(&rc->reloc_root_tree.lock); 722 node->bytenr = root->node->start; 723 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, 724 node->bytenr, &node->rb_node); 725 spin_unlock(&rc->reloc_root_tree.lock); 726 if (rb_node) 727 btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); 728 return 0; 729 } 730 731 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, 732 struct btrfs_root *root, u64 objectid) 733 { 734 struct btrfs_fs_info *fs_info = root->fs_info; 735 struct btrfs_root *reloc_root; 736 struct extent_buffer *eb; 737 struct btrfs_root_item *root_item; 738 struct btrfs_key root_key; 739 int ret = 0; 740 bool must_abort = false; 741 742 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 743 if (!root_item) 744 return ERR_PTR(-ENOMEM); 745 746 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 747 root_key.type = BTRFS_ROOT_ITEM_KEY; 748 root_key.offset = objectid; 749 750 if (root->root_key.objectid == objectid) { 751 u64 commit_root_gen; 752 753 /* called by btrfs_init_reloc_root */ 754 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 755 BTRFS_TREE_RELOC_OBJECTID); 756 if (ret) 757 goto fail; 758 759 /* 760 * Set the last_snapshot field to the generation of the commit 761 * root - like this ctree.c:btrfs_block_can_be_shared() behaves 762 * correctly (returns true) when the relocation root is created 763 * either inside the critical section of a transaction commit 764 * (through transaction.c:qgroup_account_snapshot()) and when 765 * it's created before the transaction commit is started. 766 */ 767 commit_root_gen = btrfs_header_generation(root->commit_root); 768 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); 769 } else { 770 /* 771 * called by btrfs_reloc_post_snapshot_hook. 772 * the source tree is a reloc tree, all tree blocks 773 * modified after it was created have RELOC flag 774 * set in their headers. so it's OK to not update 775 * the 'last_snapshot'. 776 */ 777 ret = btrfs_copy_root(trans, root, root->node, &eb, 778 BTRFS_TREE_RELOC_OBJECTID); 779 if (ret) 780 goto fail; 781 } 782 783 /* 784 * We have changed references at this point, we must abort the 785 * transaction if anything fails. 786 */ 787 must_abort = true; 788 789 memcpy(root_item, &root->root_item, sizeof(*root_item)); 790 btrfs_set_root_bytenr(root_item, eb->start); 791 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 792 btrfs_set_root_generation(root_item, trans->transid); 793 794 if (root->root_key.objectid == objectid) { 795 btrfs_set_root_refs(root_item, 0); 796 memset(&root_item->drop_progress, 0, 797 sizeof(struct btrfs_disk_key)); 798 btrfs_set_root_drop_level(root_item, 0); 799 } 800 801 btrfs_tree_unlock(eb); 802 free_extent_buffer(eb); 803 804 ret = btrfs_insert_root(trans, fs_info->tree_root, 805 &root_key, root_item); 806 if (ret) 807 goto fail; 808 809 kfree(root_item); 810 811 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); 812 if (IS_ERR(reloc_root)) { 813 ret = PTR_ERR(reloc_root); 814 goto abort; 815 } 816 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); 817 reloc_root->last_trans = trans->transid; 818 return reloc_root; 819 fail: 820 kfree(root_item); 821 abort: 822 if (must_abort) 823 btrfs_abort_transaction(trans, ret); 824 return ERR_PTR(ret); 825 } 826 827 /* 828 * create reloc tree for a given fs tree. reloc tree is just a 829 * snapshot of the fs tree with special root objectid. 830 * 831 * The reloc_root comes out of here with two references, one for 832 * root->reloc_root, and another for being on the rc->reloc_roots list. 833 */ 834 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, 835 struct btrfs_root *root) 836 { 837 struct btrfs_fs_info *fs_info = root->fs_info; 838 struct btrfs_root *reloc_root; 839 struct reloc_control *rc = fs_info->reloc_ctl; 840 struct btrfs_block_rsv *rsv; 841 int clear_rsv = 0; 842 int ret; 843 844 if (!rc) 845 return 0; 846 847 /* 848 * The subvolume has reloc tree but the swap is finished, no need to 849 * create/update the dead reloc tree 850 */ 851 if (reloc_root_is_dead(root)) 852 return 0; 853 854 /* 855 * This is subtle but important. We do not do 856 * record_root_in_transaction for reloc roots, instead we record their 857 * corresponding fs root, and then here we update the last trans for the 858 * reloc root. This means that we have to do this for the entire life 859 * of the reloc root, regardless of which stage of the relocation we are 860 * in. 861 */ 862 if (root->reloc_root) { 863 reloc_root = root->reloc_root; 864 reloc_root->last_trans = trans->transid; 865 return 0; 866 } 867 868 /* 869 * We are merging reloc roots, we do not need new reloc trees. Also 870 * reloc trees never need their own reloc tree. 871 */ 872 if (!rc->create_reloc_tree || 873 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 874 return 0; 875 876 if (!trans->reloc_reserved) { 877 rsv = trans->block_rsv; 878 trans->block_rsv = rc->block_rsv; 879 clear_rsv = 1; 880 } 881 reloc_root = create_reloc_root(trans, root, root->root_key.objectid); 882 if (clear_rsv) 883 trans->block_rsv = rsv; 884 if (IS_ERR(reloc_root)) 885 return PTR_ERR(reloc_root); 886 887 ret = __add_reloc_root(reloc_root); 888 ASSERT(ret != -EEXIST); 889 if (ret) { 890 /* Pairs with create_reloc_root */ 891 btrfs_put_root(reloc_root); 892 return ret; 893 } 894 root->reloc_root = btrfs_grab_root(reloc_root); 895 return 0; 896 } 897 898 /* 899 * update root item of reloc tree 900 */ 901 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, 902 struct btrfs_root *root) 903 { 904 struct btrfs_fs_info *fs_info = root->fs_info; 905 struct btrfs_root *reloc_root; 906 struct btrfs_root_item *root_item; 907 int ret; 908 909 if (!have_reloc_root(root)) 910 return 0; 911 912 reloc_root = root->reloc_root; 913 root_item = &reloc_root->root_item; 914 915 /* 916 * We are probably ok here, but __del_reloc_root() will drop its ref of 917 * the root. We have the ref for root->reloc_root, but just in case 918 * hold it while we update the reloc root. 919 */ 920 btrfs_grab_root(reloc_root); 921 922 /* root->reloc_root will stay until current relocation finished */ 923 if (fs_info->reloc_ctl->merge_reloc_tree && 924 btrfs_root_refs(root_item) == 0) { 925 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 926 /* 927 * Mark the tree as dead before we change reloc_root so 928 * have_reloc_root will not touch it from now on. 929 */ 930 smp_wmb(); 931 __del_reloc_root(reloc_root); 932 } 933 934 if (reloc_root->commit_root != reloc_root->node) { 935 __update_reloc_root(reloc_root); 936 btrfs_set_root_node(root_item, reloc_root->node); 937 free_extent_buffer(reloc_root->commit_root); 938 reloc_root->commit_root = btrfs_root_node(reloc_root); 939 } 940 941 ret = btrfs_update_root(trans, fs_info->tree_root, 942 &reloc_root->root_key, root_item); 943 btrfs_put_root(reloc_root); 944 return ret; 945 } 946 947 /* 948 * helper to find first cached inode with inode number >= objectid 949 * in a subvolume 950 */ 951 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) 952 { 953 struct rb_node *node; 954 struct rb_node *prev; 955 struct btrfs_inode *entry; 956 struct inode *inode; 957 958 spin_lock(&root->inode_lock); 959 again: 960 node = root->inode_tree.rb_node; 961 prev = NULL; 962 while (node) { 963 prev = node; 964 entry = rb_entry(node, struct btrfs_inode, rb_node); 965 966 if (objectid < btrfs_ino(entry)) 967 node = node->rb_left; 968 else if (objectid > btrfs_ino(entry)) 969 node = node->rb_right; 970 else 971 break; 972 } 973 if (!node) { 974 while (prev) { 975 entry = rb_entry(prev, struct btrfs_inode, rb_node); 976 if (objectid <= btrfs_ino(entry)) { 977 node = prev; 978 break; 979 } 980 prev = rb_next(prev); 981 } 982 } 983 while (node) { 984 entry = rb_entry(node, struct btrfs_inode, rb_node); 985 inode = igrab(&entry->vfs_inode); 986 if (inode) { 987 spin_unlock(&root->inode_lock); 988 return inode; 989 } 990 991 objectid = btrfs_ino(entry) + 1; 992 if (cond_resched_lock(&root->inode_lock)) 993 goto again; 994 995 node = rb_next(node); 996 } 997 spin_unlock(&root->inode_lock); 998 return NULL; 999 } 1000 1001 /* 1002 * get new location of data 1003 */ 1004 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, 1005 u64 bytenr, u64 num_bytes) 1006 { 1007 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 1008 struct btrfs_path *path; 1009 struct btrfs_file_extent_item *fi; 1010 struct extent_buffer *leaf; 1011 int ret; 1012 1013 path = btrfs_alloc_path(); 1014 if (!path) 1015 return -ENOMEM; 1016 1017 bytenr -= BTRFS_I(reloc_inode)->index_cnt; 1018 ret = btrfs_lookup_file_extent(NULL, root, path, 1019 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); 1020 if (ret < 0) 1021 goto out; 1022 if (ret > 0) { 1023 ret = -ENOENT; 1024 goto out; 1025 } 1026 1027 leaf = path->nodes[0]; 1028 fi = btrfs_item_ptr(leaf, path->slots[0], 1029 struct btrfs_file_extent_item); 1030 1031 BUG_ON(btrfs_file_extent_offset(leaf, fi) || 1032 btrfs_file_extent_compression(leaf, fi) || 1033 btrfs_file_extent_encryption(leaf, fi) || 1034 btrfs_file_extent_other_encoding(leaf, fi)); 1035 1036 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { 1037 ret = -EINVAL; 1038 goto out; 1039 } 1040 1041 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1042 ret = 0; 1043 out: 1044 btrfs_free_path(path); 1045 return ret; 1046 } 1047 1048 /* 1049 * update file extent items in the tree leaf to point to 1050 * the new locations. 1051 */ 1052 static noinline_for_stack 1053 int replace_file_extents(struct btrfs_trans_handle *trans, 1054 struct reloc_control *rc, 1055 struct btrfs_root *root, 1056 struct extent_buffer *leaf) 1057 { 1058 struct btrfs_fs_info *fs_info = root->fs_info; 1059 struct btrfs_key key; 1060 struct btrfs_file_extent_item *fi; 1061 struct inode *inode = NULL; 1062 u64 parent; 1063 u64 bytenr; 1064 u64 new_bytenr = 0; 1065 u64 num_bytes; 1066 u64 end; 1067 u32 nritems; 1068 u32 i; 1069 int ret = 0; 1070 int first = 1; 1071 int dirty = 0; 1072 1073 if (rc->stage != UPDATE_DATA_PTRS) 1074 return 0; 1075 1076 /* reloc trees always use full backref */ 1077 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 1078 parent = leaf->start; 1079 else 1080 parent = 0; 1081 1082 nritems = btrfs_header_nritems(leaf); 1083 for (i = 0; i < nritems; i++) { 1084 struct btrfs_ref ref = { 0 }; 1085 1086 cond_resched(); 1087 btrfs_item_key_to_cpu(leaf, &key, i); 1088 if (key.type != BTRFS_EXTENT_DATA_KEY) 1089 continue; 1090 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 1091 if (btrfs_file_extent_type(leaf, fi) == 1092 BTRFS_FILE_EXTENT_INLINE) 1093 continue; 1094 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1095 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1096 if (bytenr == 0) 1097 continue; 1098 if (!in_range(bytenr, rc->block_group->start, 1099 rc->block_group->length)) 1100 continue; 1101 1102 /* 1103 * if we are modifying block in fs tree, wait for readpage 1104 * to complete and drop the extent cache 1105 */ 1106 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1107 if (first) { 1108 inode = find_next_inode(root, key.objectid); 1109 first = 0; 1110 } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { 1111 btrfs_add_delayed_iput(inode); 1112 inode = find_next_inode(root, key.objectid); 1113 } 1114 if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { 1115 end = key.offset + 1116 btrfs_file_extent_num_bytes(leaf, fi); 1117 WARN_ON(!IS_ALIGNED(key.offset, 1118 fs_info->sectorsize)); 1119 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1120 end--; 1121 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 1122 key.offset, end); 1123 if (!ret) 1124 continue; 1125 1126 btrfs_drop_extent_cache(BTRFS_I(inode), 1127 key.offset, end, 1); 1128 unlock_extent(&BTRFS_I(inode)->io_tree, 1129 key.offset, end); 1130 } 1131 } 1132 1133 ret = get_new_location(rc->data_inode, &new_bytenr, 1134 bytenr, num_bytes); 1135 if (ret) { 1136 /* 1137 * Don't have to abort since we've not changed anything 1138 * in the file extent yet. 1139 */ 1140 break; 1141 } 1142 1143 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1144 dirty = 1; 1145 1146 key.offset -= btrfs_file_extent_offset(leaf, fi); 1147 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, 1148 num_bytes, parent); 1149 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 1150 key.objectid, key.offset, 1151 root->root_key.objectid, false); 1152 ret = btrfs_inc_extent_ref(trans, &ref); 1153 if (ret) { 1154 btrfs_abort_transaction(trans, ret); 1155 break; 1156 } 1157 1158 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 1159 num_bytes, parent); 1160 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 1161 key.objectid, key.offset, 1162 root->root_key.objectid, false); 1163 ret = btrfs_free_extent(trans, &ref); 1164 if (ret) { 1165 btrfs_abort_transaction(trans, ret); 1166 break; 1167 } 1168 } 1169 if (dirty) 1170 btrfs_mark_buffer_dirty(leaf); 1171 if (inode) 1172 btrfs_add_delayed_iput(inode); 1173 return ret; 1174 } 1175 1176 static noinline_for_stack 1177 int memcmp_node_keys(struct extent_buffer *eb, int slot, 1178 struct btrfs_path *path, int level) 1179 { 1180 struct btrfs_disk_key key1; 1181 struct btrfs_disk_key key2; 1182 btrfs_node_key(eb, &key1, slot); 1183 btrfs_node_key(path->nodes[level], &key2, path->slots[level]); 1184 return memcmp(&key1, &key2, sizeof(key1)); 1185 } 1186 1187 /* 1188 * try to replace tree blocks in fs tree with the new blocks 1189 * in reloc tree. tree blocks haven't been modified since the 1190 * reloc tree was create can be replaced. 1191 * 1192 * if a block was replaced, level of the block + 1 is returned. 1193 * if no block got replaced, 0 is returned. if there are other 1194 * errors, a negative error number is returned. 1195 */ 1196 static noinline_for_stack 1197 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, 1198 struct btrfs_root *dest, struct btrfs_root *src, 1199 struct btrfs_path *path, struct btrfs_key *next_key, 1200 int lowest_level, int max_level) 1201 { 1202 struct btrfs_fs_info *fs_info = dest->fs_info; 1203 struct extent_buffer *eb; 1204 struct extent_buffer *parent; 1205 struct btrfs_ref ref = { 0 }; 1206 struct btrfs_key key; 1207 u64 old_bytenr; 1208 u64 new_bytenr; 1209 u64 old_ptr_gen; 1210 u64 new_ptr_gen; 1211 u64 last_snapshot; 1212 u32 blocksize; 1213 int cow = 0; 1214 int level; 1215 int ret; 1216 int slot; 1217 1218 ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1219 ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1220 1221 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1222 again: 1223 slot = path->slots[lowest_level]; 1224 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1225 1226 eb = btrfs_lock_root_node(dest); 1227 level = btrfs_header_level(eb); 1228 1229 if (level < lowest_level) { 1230 btrfs_tree_unlock(eb); 1231 free_extent_buffer(eb); 1232 return 0; 1233 } 1234 1235 if (cow) { 1236 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, 1237 BTRFS_NESTING_COW); 1238 if (ret) { 1239 btrfs_tree_unlock(eb); 1240 free_extent_buffer(eb); 1241 return ret; 1242 } 1243 } 1244 1245 if (next_key) { 1246 next_key->objectid = (u64)-1; 1247 next_key->type = (u8)-1; 1248 next_key->offset = (u64)-1; 1249 } 1250 1251 parent = eb; 1252 while (1) { 1253 level = btrfs_header_level(parent); 1254 ASSERT(level >= lowest_level); 1255 1256 ret = btrfs_bin_search(parent, &key, &slot); 1257 if (ret < 0) 1258 break; 1259 if (ret && slot > 0) 1260 slot--; 1261 1262 if (next_key && slot + 1 < btrfs_header_nritems(parent)) 1263 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1264 1265 old_bytenr = btrfs_node_blockptr(parent, slot); 1266 blocksize = fs_info->nodesize; 1267 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1268 1269 if (level <= max_level) { 1270 eb = path->nodes[level]; 1271 new_bytenr = btrfs_node_blockptr(eb, 1272 path->slots[level]); 1273 new_ptr_gen = btrfs_node_ptr_generation(eb, 1274 path->slots[level]); 1275 } else { 1276 new_bytenr = 0; 1277 new_ptr_gen = 0; 1278 } 1279 1280 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { 1281 ret = level; 1282 break; 1283 } 1284 1285 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1286 memcmp_node_keys(parent, slot, path, level)) { 1287 if (level <= lowest_level) { 1288 ret = 0; 1289 break; 1290 } 1291 1292 eb = btrfs_read_node_slot(parent, slot); 1293 if (IS_ERR(eb)) { 1294 ret = PTR_ERR(eb); 1295 break; 1296 } 1297 btrfs_tree_lock(eb); 1298 if (cow) { 1299 ret = btrfs_cow_block(trans, dest, eb, parent, 1300 slot, &eb, 1301 BTRFS_NESTING_COW); 1302 if (ret) { 1303 btrfs_tree_unlock(eb); 1304 free_extent_buffer(eb); 1305 break; 1306 } 1307 } 1308 1309 btrfs_tree_unlock(parent); 1310 free_extent_buffer(parent); 1311 1312 parent = eb; 1313 continue; 1314 } 1315 1316 if (!cow) { 1317 btrfs_tree_unlock(parent); 1318 free_extent_buffer(parent); 1319 cow = 1; 1320 goto again; 1321 } 1322 1323 btrfs_node_key_to_cpu(path->nodes[level], &key, 1324 path->slots[level]); 1325 btrfs_release_path(path); 1326 1327 path->lowest_level = level; 1328 ret = btrfs_search_slot(trans, src, &key, path, 0, 1); 1329 path->lowest_level = 0; 1330 if (ret) { 1331 if (ret > 0) 1332 ret = -ENOENT; 1333 break; 1334 } 1335 1336 /* 1337 * Info qgroup to trace both subtrees. 1338 * 1339 * We must trace both trees. 1340 * 1) Tree reloc subtree 1341 * If not traced, we will leak data numbers 1342 * 2) Fs subtree 1343 * If not traced, we will double count old data 1344 * 1345 * We don't scan the subtree right now, but only record 1346 * the swapped tree blocks. 1347 * The real subtree rescan is delayed until we have new 1348 * CoW on the subtree root node before transaction commit. 1349 */ 1350 ret = btrfs_qgroup_add_swapped_blocks(trans, dest, 1351 rc->block_group, parent, slot, 1352 path->nodes[level], path->slots[level], 1353 last_snapshot); 1354 if (ret < 0) 1355 break; 1356 /* 1357 * swap blocks in fs tree and reloc tree. 1358 */ 1359 btrfs_set_node_blockptr(parent, slot, new_bytenr); 1360 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); 1361 btrfs_mark_buffer_dirty(parent); 1362 1363 btrfs_set_node_blockptr(path->nodes[level], 1364 path->slots[level], old_bytenr); 1365 btrfs_set_node_ptr_generation(path->nodes[level], 1366 path->slots[level], old_ptr_gen); 1367 btrfs_mark_buffer_dirty(path->nodes[level]); 1368 1369 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, 1370 blocksize, path->nodes[level]->start); 1371 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 1372 0, true); 1373 ret = btrfs_inc_extent_ref(trans, &ref); 1374 if (ret) { 1375 btrfs_abort_transaction(trans, ret); 1376 break; 1377 } 1378 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, 1379 blocksize, 0); 1380 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, 1381 true); 1382 ret = btrfs_inc_extent_ref(trans, &ref); 1383 if (ret) { 1384 btrfs_abort_transaction(trans, ret); 1385 break; 1386 } 1387 1388 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, 1389 blocksize, path->nodes[level]->start); 1390 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 1391 0, true); 1392 ret = btrfs_free_extent(trans, &ref); 1393 if (ret) { 1394 btrfs_abort_transaction(trans, ret); 1395 break; 1396 } 1397 1398 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, 1399 blocksize, 0); 1400 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 1401 0, true); 1402 ret = btrfs_free_extent(trans, &ref); 1403 if (ret) { 1404 btrfs_abort_transaction(trans, ret); 1405 break; 1406 } 1407 1408 btrfs_unlock_up_safe(path, 0); 1409 1410 ret = level; 1411 break; 1412 } 1413 btrfs_tree_unlock(parent); 1414 free_extent_buffer(parent); 1415 return ret; 1416 } 1417 1418 /* 1419 * helper to find next relocated block in reloc tree 1420 */ 1421 static noinline_for_stack 1422 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1423 int *level) 1424 { 1425 struct extent_buffer *eb; 1426 int i; 1427 u64 last_snapshot; 1428 u32 nritems; 1429 1430 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1431 1432 for (i = 0; i < *level; i++) { 1433 free_extent_buffer(path->nodes[i]); 1434 path->nodes[i] = NULL; 1435 } 1436 1437 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 1438 eb = path->nodes[i]; 1439 nritems = btrfs_header_nritems(eb); 1440 while (path->slots[i] + 1 < nritems) { 1441 path->slots[i]++; 1442 if (btrfs_node_ptr_generation(eb, path->slots[i]) <= 1443 last_snapshot) 1444 continue; 1445 1446 *level = i; 1447 return 0; 1448 } 1449 free_extent_buffer(path->nodes[i]); 1450 path->nodes[i] = NULL; 1451 } 1452 return 1; 1453 } 1454 1455 /* 1456 * walk down reloc tree to find relocated block of lowest level 1457 */ 1458 static noinline_for_stack 1459 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1460 int *level) 1461 { 1462 struct extent_buffer *eb = NULL; 1463 int i; 1464 u64 ptr_gen = 0; 1465 u64 last_snapshot; 1466 u32 nritems; 1467 1468 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1469 1470 for (i = *level; i > 0; i--) { 1471 eb = path->nodes[i]; 1472 nritems = btrfs_header_nritems(eb); 1473 while (path->slots[i] < nritems) { 1474 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); 1475 if (ptr_gen > last_snapshot) 1476 break; 1477 path->slots[i]++; 1478 } 1479 if (path->slots[i] >= nritems) { 1480 if (i == *level) 1481 break; 1482 *level = i + 1; 1483 return 0; 1484 } 1485 if (i == 1) { 1486 *level = i; 1487 return 0; 1488 } 1489 1490 eb = btrfs_read_node_slot(eb, path->slots[i]); 1491 if (IS_ERR(eb)) 1492 return PTR_ERR(eb); 1493 BUG_ON(btrfs_header_level(eb) != i - 1); 1494 path->nodes[i - 1] = eb; 1495 path->slots[i - 1] = 0; 1496 } 1497 return 1; 1498 } 1499 1500 /* 1501 * invalidate extent cache for file extents whose key in range of 1502 * [min_key, max_key) 1503 */ 1504 static int invalidate_extent_cache(struct btrfs_root *root, 1505 struct btrfs_key *min_key, 1506 struct btrfs_key *max_key) 1507 { 1508 struct btrfs_fs_info *fs_info = root->fs_info; 1509 struct inode *inode = NULL; 1510 u64 objectid; 1511 u64 start, end; 1512 u64 ino; 1513 1514 objectid = min_key->objectid; 1515 while (1) { 1516 cond_resched(); 1517 iput(inode); 1518 1519 if (objectid > max_key->objectid) 1520 break; 1521 1522 inode = find_next_inode(root, objectid); 1523 if (!inode) 1524 break; 1525 ino = btrfs_ino(BTRFS_I(inode)); 1526 1527 if (ino > max_key->objectid) { 1528 iput(inode); 1529 break; 1530 } 1531 1532 objectid = ino + 1; 1533 if (!S_ISREG(inode->i_mode)) 1534 continue; 1535 1536 if (unlikely(min_key->objectid == ino)) { 1537 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 1538 continue; 1539 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 1540 start = 0; 1541 else { 1542 start = min_key->offset; 1543 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); 1544 } 1545 } else { 1546 start = 0; 1547 } 1548 1549 if (unlikely(max_key->objectid == ino)) { 1550 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 1551 continue; 1552 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 1553 end = (u64)-1; 1554 } else { 1555 if (max_key->offset == 0) 1556 continue; 1557 end = max_key->offset; 1558 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1559 end--; 1560 } 1561 } else { 1562 end = (u64)-1; 1563 } 1564 1565 /* the lock_extent waits for readpage to complete */ 1566 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 1567 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); 1568 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 1569 } 1570 return 0; 1571 } 1572 1573 static int find_next_key(struct btrfs_path *path, int level, 1574 struct btrfs_key *key) 1575 1576 { 1577 while (level < BTRFS_MAX_LEVEL) { 1578 if (!path->nodes[level]) 1579 break; 1580 if (path->slots[level] + 1 < 1581 btrfs_header_nritems(path->nodes[level])) { 1582 btrfs_node_key_to_cpu(path->nodes[level], key, 1583 path->slots[level] + 1); 1584 return 0; 1585 } 1586 level++; 1587 } 1588 return 1; 1589 } 1590 1591 /* 1592 * Insert current subvolume into reloc_control::dirty_subvol_roots 1593 */ 1594 static int insert_dirty_subvol(struct btrfs_trans_handle *trans, 1595 struct reloc_control *rc, 1596 struct btrfs_root *root) 1597 { 1598 struct btrfs_root *reloc_root = root->reloc_root; 1599 struct btrfs_root_item *reloc_root_item; 1600 int ret; 1601 1602 /* @root must be a subvolume tree root with a valid reloc tree */ 1603 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1604 ASSERT(reloc_root); 1605 1606 reloc_root_item = &reloc_root->root_item; 1607 memset(&reloc_root_item->drop_progress, 0, 1608 sizeof(reloc_root_item->drop_progress)); 1609 btrfs_set_root_drop_level(reloc_root_item, 0); 1610 btrfs_set_root_refs(reloc_root_item, 0); 1611 ret = btrfs_update_reloc_root(trans, root); 1612 if (ret) 1613 return ret; 1614 1615 if (list_empty(&root->reloc_dirty_list)) { 1616 btrfs_grab_root(root); 1617 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 1618 } 1619 1620 return 0; 1621 } 1622 1623 static int clean_dirty_subvols(struct reloc_control *rc) 1624 { 1625 struct btrfs_root *root; 1626 struct btrfs_root *next; 1627 int ret = 0; 1628 int ret2; 1629 1630 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 1631 reloc_dirty_list) { 1632 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1633 /* Merged subvolume, cleanup its reloc root */ 1634 struct btrfs_root *reloc_root = root->reloc_root; 1635 1636 list_del_init(&root->reloc_dirty_list); 1637 root->reloc_root = NULL; 1638 /* 1639 * Need barrier to ensure clear_bit() only happens after 1640 * root->reloc_root = NULL. Pairs with have_reloc_root. 1641 */ 1642 smp_wmb(); 1643 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 1644 if (reloc_root) { 1645 /* 1646 * btrfs_drop_snapshot drops our ref we hold for 1647 * ->reloc_root. If it fails however we must 1648 * drop the ref ourselves. 1649 */ 1650 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); 1651 if (ret2 < 0) { 1652 btrfs_put_root(reloc_root); 1653 if (!ret) 1654 ret = ret2; 1655 } 1656 } 1657 btrfs_put_root(root); 1658 } else { 1659 /* Orphan reloc tree, just clean it up */ 1660 ret2 = btrfs_drop_snapshot(root, 0, 1); 1661 if (ret2 < 0) { 1662 btrfs_put_root(root); 1663 if (!ret) 1664 ret = ret2; 1665 } 1666 } 1667 } 1668 return ret; 1669 } 1670 1671 /* 1672 * merge the relocated tree blocks in reloc tree with corresponding 1673 * fs tree. 1674 */ 1675 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 1676 struct btrfs_root *root) 1677 { 1678 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 1679 struct btrfs_key key; 1680 struct btrfs_key next_key; 1681 struct btrfs_trans_handle *trans = NULL; 1682 struct btrfs_root *reloc_root; 1683 struct btrfs_root_item *root_item; 1684 struct btrfs_path *path; 1685 struct extent_buffer *leaf; 1686 int reserve_level; 1687 int level; 1688 int max_level; 1689 int replaced = 0; 1690 int ret = 0; 1691 u32 min_reserved; 1692 1693 path = btrfs_alloc_path(); 1694 if (!path) 1695 return -ENOMEM; 1696 path->reada = READA_FORWARD; 1697 1698 reloc_root = root->reloc_root; 1699 root_item = &reloc_root->root_item; 1700 1701 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 1702 level = btrfs_root_level(root_item); 1703 atomic_inc(&reloc_root->node->refs); 1704 path->nodes[level] = reloc_root->node; 1705 path->slots[level] = 0; 1706 } else { 1707 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 1708 1709 level = btrfs_root_drop_level(root_item); 1710 BUG_ON(level == 0); 1711 path->lowest_level = level; 1712 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 1713 path->lowest_level = 0; 1714 if (ret < 0) { 1715 btrfs_free_path(path); 1716 return ret; 1717 } 1718 1719 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 1720 path->slots[level]); 1721 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 1722 1723 btrfs_unlock_up_safe(path, 0); 1724 } 1725 1726 /* 1727 * In merge_reloc_root(), we modify the upper level pointer to swap the 1728 * tree blocks between reloc tree and subvolume tree. Thus for tree 1729 * block COW, we COW at most from level 1 to root level for each tree. 1730 * 1731 * Thus the needed metadata size is at most root_level * nodesize, 1732 * and * 2 since we have two trees to COW. 1733 */ 1734 reserve_level = max_t(int, 1, btrfs_root_level(root_item)); 1735 min_reserved = fs_info->nodesize * reserve_level * 2; 1736 memset(&next_key, 0, sizeof(next_key)); 1737 1738 while (1) { 1739 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 1740 min_reserved, 1741 BTRFS_RESERVE_FLUSH_LIMIT); 1742 if (ret) 1743 goto out; 1744 trans = btrfs_start_transaction(root, 0); 1745 if (IS_ERR(trans)) { 1746 ret = PTR_ERR(trans); 1747 trans = NULL; 1748 goto out; 1749 } 1750 1751 /* 1752 * At this point we no longer have a reloc_control, so we can't 1753 * depend on btrfs_init_reloc_root to update our last_trans. 1754 * 1755 * But that's ok, we started the trans handle on our 1756 * corresponding fs_root, which means it's been added to the 1757 * dirty list. At commit time we'll still call 1758 * btrfs_update_reloc_root() and update our root item 1759 * appropriately. 1760 */ 1761 reloc_root->last_trans = trans->transid; 1762 trans->block_rsv = rc->block_rsv; 1763 1764 replaced = 0; 1765 max_level = level; 1766 1767 ret = walk_down_reloc_tree(reloc_root, path, &level); 1768 if (ret < 0) 1769 goto out; 1770 if (ret > 0) 1771 break; 1772 1773 if (!find_next_key(path, level, &key) && 1774 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 1775 ret = 0; 1776 } else { 1777 ret = replace_path(trans, rc, root, reloc_root, path, 1778 &next_key, level, max_level); 1779 } 1780 if (ret < 0) 1781 goto out; 1782 if (ret > 0) { 1783 level = ret; 1784 btrfs_node_key_to_cpu(path->nodes[level], &key, 1785 path->slots[level]); 1786 replaced = 1; 1787 } 1788 1789 ret = walk_up_reloc_tree(reloc_root, path, &level); 1790 if (ret > 0) 1791 break; 1792 1793 BUG_ON(level == 0); 1794 /* 1795 * save the merging progress in the drop_progress. 1796 * this is OK since root refs == 1 in this case. 1797 */ 1798 btrfs_node_key(path->nodes[level], &root_item->drop_progress, 1799 path->slots[level]); 1800 btrfs_set_root_drop_level(root_item, level); 1801 1802 btrfs_end_transaction_throttle(trans); 1803 trans = NULL; 1804 1805 btrfs_btree_balance_dirty(fs_info); 1806 1807 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1808 invalidate_extent_cache(root, &key, &next_key); 1809 } 1810 1811 /* 1812 * handle the case only one block in the fs tree need to be 1813 * relocated and the block is tree root. 1814 */ 1815 leaf = btrfs_lock_root_node(root); 1816 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, 1817 BTRFS_NESTING_COW); 1818 btrfs_tree_unlock(leaf); 1819 free_extent_buffer(leaf); 1820 out: 1821 btrfs_free_path(path); 1822 1823 if (ret == 0) { 1824 ret = insert_dirty_subvol(trans, rc, root); 1825 if (ret) 1826 btrfs_abort_transaction(trans, ret); 1827 } 1828 1829 if (trans) 1830 btrfs_end_transaction_throttle(trans); 1831 1832 btrfs_btree_balance_dirty(fs_info); 1833 1834 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1835 invalidate_extent_cache(root, &key, &next_key); 1836 1837 return ret; 1838 } 1839 1840 static noinline_for_stack 1841 int prepare_to_merge(struct reloc_control *rc, int err) 1842 { 1843 struct btrfs_root *root = rc->extent_root; 1844 struct btrfs_fs_info *fs_info = root->fs_info; 1845 struct btrfs_root *reloc_root; 1846 struct btrfs_trans_handle *trans; 1847 LIST_HEAD(reloc_roots); 1848 u64 num_bytes = 0; 1849 int ret; 1850 1851 mutex_lock(&fs_info->reloc_mutex); 1852 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 1853 rc->merging_rsv_size += rc->nodes_relocated * 2; 1854 mutex_unlock(&fs_info->reloc_mutex); 1855 1856 again: 1857 if (!err) { 1858 num_bytes = rc->merging_rsv_size; 1859 ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes, 1860 BTRFS_RESERVE_FLUSH_ALL); 1861 if (ret) 1862 err = ret; 1863 } 1864 1865 trans = btrfs_join_transaction(rc->extent_root); 1866 if (IS_ERR(trans)) { 1867 if (!err) 1868 btrfs_block_rsv_release(fs_info, rc->block_rsv, 1869 num_bytes, NULL); 1870 return PTR_ERR(trans); 1871 } 1872 1873 if (!err) { 1874 if (num_bytes != rc->merging_rsv_size) { 1875 btrfs_end_transaction(trans); 1876 btrfs_block_rsv_release(fs_info, rc->block_rsv, 1877 num_bytes, NULL); 1878 goto again; 1879 } 1880 } 1881 1882 rc->merge_reloc_tree = 1; 1883 1884 while (!list_empty(&rc->reloc_roots)) { 1885 reloc_root = list_entry(rc->reloc_roots.next, 1886 struct btrfs_root, root_list); 1887 list_del_init(&reloc_root->root_list); 1888 1889 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 1890 false); 1891 if (IS_ERR(root)) { 1892 /* 1893 * Even if we have an error we need this reloc root 1894 * back on our list so we can clean up properly. 1895 */ 1896 list_add(&reloc_root->root_list, &reloc_roots); 1897 btrfs_abort_transaction(trans, (int)PTR_ERR(root)); 1898 if (!err) 1899 err = PTR_ERR(root); 1900 break; 1901 } 1902 ASSERT(root->reloc_root == reloc_root); 1903 1904 /* 1905 * set reference count to 1, so btrfs_recover_relocation 1906 * knows it should resumes merging 1907 */ 1908 if (!err) 1909 btrfs_set_root_refs(&reloc_root->root_item, 1); 1910 ret = btrfs_update_reloc_root(trans, root); 1911 1912 /* 1913 * Even if we have an error we need this reloc root back on our 1914 * list so we can clean up properly. 1915 */ 1916 list_add(&reloc_root->root_list, &reloc_roots); 1917 btrfs_put_root(root); 1918 1919 if (ret) { 1920 btrfs_abort_transaction(trans, ret); 1921 if (!err) 1922 err = ret; 1923 break; 1924 } 1925 } 1926 1927 list_splice(&reloc_roots, &rc->reloc_roots); 1928 1929 if (!err) 1930 err = btrfs_commit_transaction(trans); 1931 else 1932 btrfs_end_transaction(trans); 1933 return err; 1934 } 1935 1936 static noinline_for_stack 1937 void free_reloc_roots(struct list_head *list) 1938 { 1939 struct btrfs_root *reloc_root, *tmp; 1940 1941 list_for_each_entry_safe(reloc_root, tmp, list, root_list) 1942 __del_reloc_root(reloc_root); 1943 } 1944 1945 static noinline_for_stack 1946 void merge_reloc_roots(struct reloc_control *rc) 1947 { 1948 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 1949 struct btrfs_root *root; 1950 struct btrfs_root *reloc_root; 1951 LIST_HEAD(reloc_roots); 1952 int found = 0; 1953 int ret = 0; 1954 again: 1955 root = rc->extent_root; 1956 1957 /* 1958 * this serializes us with btrfs_record_root_in_transaction, 1959 * we have to make sure nobody is in the middle of 1960 * adding their roots to the list while we are 1961 * doing this splice 1962 */ 1963 mutex_lock(&fs_info->reloc_mutex); 1964 list_splice_init(&rc->reloc_roots, &reloc_roots); 1965 mutex_unlock(&fs_info->reloc_mutex); 1966 1967 while (!list_empty(&reloc_roots)) { 1968 found = 1; 1969 reloc_root = list_entry(reloc_roots.next, 1970 struct btrfs_root, root_list); 1971 1972 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 1973 false); 1974 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 1975 if (IS_ERR(root)) { 1976 /* 1977 * For recovery we read the fs roots on mount, 1978 * and if we didn't find the root then we marked 1979 * the reloc root as a garbage root. For normal 1980 * relocation obviously the root should exist in 1981 * memory. However there's no reason we can't 1982 * handle the error properly here just in case. 1983 */ 1984 ASSERT(0); 1985 ret = PTR_ERR(root); 1986 goto out; 1987 } 1988 if (root->reloc_root != reloc_root) { 1989 /* 1990 * This is actually impossible without something 1991 * going really wrong (like weird race condition 1992 * or cosmic rays). 1993 */ 1994 ASSERT(0); 1995 ret = -EINVAL; 1996 goto out; 1997 } 1998 ret = merge_reloc_root(rc, root); 1999 btrfs_put_root(root); 2000 if (ret) { 2001 if (list_empty(&reloc_root->root_list)) 2002 list_add_tail(&reloc_root->root_list, 2003 &reloc_roots); 2004 goto out; 2005 } 2006 } else { 2007 if (!IS_ERR(root)) { 2008 if (root->reloc_root == reloc_root) { 2009 root->reloc_root = NULL; 2010 btrfs_put_root(reloc_root); 2011 } 2012 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, 2013 &root->state); 2014 btrfs_put_root(root); 2015 } 2016 2017 list_del_init(&reloc_root->root_list); 2018 /* Don't forget to queue this reloc root for cleanup */ 2019 list_add_tail(&reloc_root->reloc_dirty_list, 2020 &rc->dirty_subvol_roots); 2021 } 2022 } 2023 2024 if (found) { 2025 found = 0; 2026 goto again; 2027 } 2028 out: 2029 if (ret) { 2030 btrfs_handle_fs_error(fs_info, ret, NULL); 2031 free_reloc_roots(&reloc_roots); 2032 2033 /* new reloc root may be added */ 2034 mutex_lock(&fs_info->reloc_mutex); 2035 list_splice_init(&rc->reloc_roots, &reloc_roots); 2036 mutex_unlock(&fs_info->reloc_mutex); 2037 free_reloc_roots(&reloc_roots); 2038 } 2039 2040 /* 2041 * We used to have 2042 * 2043 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2044 * 2045 * here, but it's wrong. If we fail to start the transaction in 2046 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 2047 * have actually been removed from the reloc_root_tree rb tree. This is 2048 * fine because we're bailing here, and we hold a reference on the root 2049 * for the list that holds it, so these roots will be cleaned up when we 2050 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 2051 * will be cleaned up on unmount. 2052 * 2053 * The remaining nodes will be cleaned up by free_reloc_control. 2054 */ 2055 } 2056 2057 static void free_block_list(struct rb_root *blocks) 2058 { 2059 struct tree_block *block; 2060 struct rb_node *rb_node; 2061 while ((rb_node = rb_first(blocks))) { 2062 block = rb_entry(rb_node, struct tree_block, rb_node); 2063 rb_erase(rb_node, blocks); 2064 kfree(block); 2065 } 2066 } 2067 2068 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2069 struct btrfs_root *reloc_root) 2070 { 2071 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2072 struct btrfs_root *root; 2073 int ret; 2074 2075 if (reloc_root->last_trans == trans->transid) 2076 return 0; 2077 2078 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); 2079 2080 /* 2081 * This should succeed, since we can't have a reloc root without having 2082 * already looked up the actual root and created the reloc root for this 2083 * root. 2084 * 2085 * However if there's some sort of corruption where we have a ref to a 2086 * reloc root without a corresponding root this could return ENOENT. 2087 */ 2088 if (IS_ERR(root)) { 2089 ASSERT(0); 2090 return PTR_ERR(root); 2091 } 2092 if (root->reloc_root != reloc_root) { 2093 ASSERT(0); 2094 btrfs_err(fs_info, 2095 "root %llu has two reloc roots associated with it", 2096 reloc_root->root_key.offset); 2097 btrfs_put_root(root); 2098 return -EUCLEAN; 2099 } 2100 ret = btrfs_record_root_in_trans(trans, root); 2101 btrfs_put_root(root); 2102 2103 return ret; 2104 } 2105 2106 static noinline_for_stack 2107 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2108 struct reloc_control *rc, 2109 struct btrfs_backref_node *node, 2110 struct btrfs_backref_edge *edges[]) 2111 { 2112 struct btrfs_backref_node *next; 2113 struct btrfs_root *root; 2114 int index = 0; 2115 int ret; 2116 2117 next = node; 2118 while (1) { 2119 cond_resched(); 2120 next = walk_up_backref(next, edges, &index); 2121 root = next->root; 2122 2123 /* 2124 * If there is no root, then our references for this block are 2125 * incomplete, as we should be able to walk all the way up to a 2126 * block that is owned by a root. 2127 * 2128 * This path is only for SHAREABLE roots, so if we come upon a 2129 * non-SHAREABLE root then we have backrefs that resolve 2130 * improperly. 2131 * 2132 * Both of these cases indicate file system corruption, or a bug 2133 * in the backref walking code. 2134 */ 2135 if (!root) { 2136 ASSERT(0); 2137 btrfs_err(trans->fs_info, 2138 "bytenr %llu doesn't have a backref path ending in a root", 2139 node->bytenr); 2140 return ERR_PTR(-EUCLEAN); 2141 } 2142 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2143 ASSERT(0); 2144 btrfs_err(trans->fs_info, 2145 "bytenr %llu has multiple refs with one ending in a non-shareable root", 2146 node->bytenr); 2147 return ERR_PTR(-EUCLEAN); 2148 } 2149 2150 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2151 ret = record_reloc_root_in_trans(trans, root); 2152 if (ret) 2153 return ERR_PTR(ret); 2154 break; 2155 } 2156 2157 ret = btrfs_record_root_in_trans(trans, root); 2158 if (ret) 2159 return ERR_PTR(ret); 2160 root = root->reloc_root; 2161 2162 /* 2163 * We could have raced with another thread which failed, so 2164 * root->reloc_root may not be set, return ENOENT in this case. 2165 */ 2166 if (!root) 2167 return ERR_PTR(-ENOENT); 2168 2169 if (next->new_bytenr != root->node->start) { 2170 /* 2171 * We just created the reloc root, so we shouldn't have 2172 * ->new_bytenr set and this shouldn't be in the changed 2173 * list. If it is then we have multiple roots pointing 2174 * at the same bytenr which indicates corruption, or 2175 * we've made a mistake in the backref walking code. 2176 */ 2177 ASSERT(next->new_bytenr == 0); 2178 ASSERT(list_empty(&next->list)); 2179 if (next->new_bytenr || !list_empty(&next->list)) { 2180 btrfs_err(trans->fs_info, 2181 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2182 node->bytenr, next->bytenr); 2183 return ERR_PTR(-EUCLEAN); 2184 } 2185 2186 next->new_bytenr = root->node->start; 2187 btrfs_put_root(next->root); 2188 next->root = btrfs_grab_root(root); 2189 ASSERT(next->root); 2190 list_add_tail(&next->list, 2191 &rc->backref_cache.changed); 2192 mark_block_processed(rc, next); 2193 break; 2194 } 2195 2196 WARN_ON(1); 2197 root = NULL; 2198 next = walk_down_backref(edges, &index); 2199 if (!next || next->level <= node->level) 2200 break; 2201 } 2202 if (!root) { 2203 /* 2204 * This can happen if there's fs corruption or if there's a bug 2205 * in the backref lookup code. 2206 */ 2207 ASSERT(0); 2208 return ERR_PTR(-ENOENT); 2209 } 2210 2211 next = node; 2212 /* setup backref node path for btrfs_reloc_cow_block */ 2213 while (1) { 2214 rc->backref_cache.path[next->level] = next; 2215 if (--index < 0) 2216 break; 2217 next = edges[index]->node[UPPER]; 2218 } 2219 return root; 2220 } 2221 2222 /* 2223 * Select a tree root for relocation. 2224 * 2225 * Return NULL if the block is not shareable. We should use do_relocation() in 2226 * this case. 2227 * 2228 * Return a tree root pointer if the block is shareable. 2229 * Return -ENOENT if the block is root of reloc tree. 2230 */ 2231 static noinline_for_stack 2232 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2233 { 2234 struct btrfs_backref_node *next; 2235 struct btrfs_root *root; 2236 struct btrfs_root *fs_root = NULL; 2237 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2238 int index = 0; 2239 2240 next = node; 2241 while (1) { 2242 cond_resched(); 2243 next = walk_up_backref(next, edges, &index); 2244 root = next->root; 2245 2246 /* 2247 * This can occur if we have incomplete extent refs leading all 2248 * the way up a particular path, in this case return -EUCLEAN. 2249 */ 2250 if (!root) 2251 return ERR_PTR(-EUCLEAN); 2252 2253 /* No other choice for non-shareable tree */ 2254 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 2255 return root; 2256 2257 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2258 fs_root = root; 2259 2260 if (next != node) 2261 return NULL; 2262 2263 next = walk_down_backref(edges, &index); 2264 if (!next || next->level <= node->level) 2265 break; 2266 } 2267 2268 if (!fs_root) 2269 return ERR_PTR(-ENOENT); 2270 return fs_root; 2271 } 2272 2273 static noinline_for_stack 2274 u64 calcu_metadata_size(struct reloc_control *rc, 2275 struct btrfs_backref_node *node, int reserve) 2276 { 2277 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2278 struct btrfs_backref_node *next = node; 2279 struct btrfs_backref_edge *edge; 2280 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2281 u64 num_bytes = 0; 2282 int index = 0; 2283 2284 BUG_ON(reserve && node->processed); 2285 2286 while (next) { 2287 cond_resched(); 2288 while (1) { 2289 if (next->processed && (reserve || next != node)) 2290 break; 2291 2292 num_bytes += fs_info->nodesize; 2293 2294 if (list_empty(&next->upper)) 2295 break; 2296 2297 edge = list_entry(next->upper.next, 2298 struct btrfs_backref_edge, list[LOWER]); 2299 edges[index++] = edge; 2300 next = edge->node[UPPER]; 2301 } 2302 next = walk_down_backref(edges, &index); 2303 } 2304 return num_bytes; 2305 } 2306 2307 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2308 struct reloc_control *rc, 2309 struct btrfs_backref_node *node) 2310 { 2311 struct btrfs_root *root = rc->extent_root; 2312 struct btrfs_fs_info *fs_info = root->fs_info; 2313 u64 num_bytes; 2314 int ret; 2315 u64 tmp; 2316 2317 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2318 2319 trans->block_rsv = rc->block_rsv; 2320 rc->reserved_bytes += num_bytes; 2321 2322 /* 2323 * We are under a transaction here so we can only do limited flushing. 2324 * If we get an enospc just kick back -EAGAIN so we know to drop the 2325 * transaction and try to refill when we can flush all the things. 2326 */ 2327 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, 2328 BTRFS_RESERVE_FLUSH_LIMIT); 2329 if (ret) { 2330 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2331 while (tmp <= rc->reserved_bytes) 2332 tmp <<= 1; 2333 /* 2334 * only one thread can access block_rsv at this point, 2335 * so we don't need hold lock to protect block_rsv. 2336 * we expand more reservation size here to allow enough 2337 * space for relocation and we will return earlier in 2338 * enospc case. 2339 */ 2340 rc->block_rsv->size = tmp + fs_info->nodesize * 2341 RELOCATION_RESERVED_NODES; 2342 return -EAGAIN; 2343 } 2344 2345 return 0; 2346 } 2347 2348 /* 2349 * relocate a block tree, and then update pointers in upper level 2350 * blocks that reference the block to point to the new location. 2351 * 2352 * if called by link_to_upper, the block has already been relocated. 2353 * in that case this function just updates pointers. 2354 */ 2355 static int do_relocation(struct btrfs_trans_handle *trans, 2356 struct reloc_control *rc, 2357 struct btrfs_backref_node *node, 2358 struct btrfs_key *key, 2359 struct btrfs_path *path, int lowest) 2360 { 2361 struct btrfs_backref_node *upper; 2362 struct btrfs_backref_edge *edge; 2363 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2364 struct btrfs_root *root; 2365 struct extent_buffer *eb; 2366 u32 blocksize; 2367 u64 bytenr; 2368 int slot; 2369 int ret = 0; 2370 2371 /* 2372 * If we are lowest then this is the first time we're processing this 2373 * block, and thus shouldn't have an eb associated with it yet. 2374 */ 2375 ASSERT(!lowest || !node->eb); 2376 2377 path->lowest_level = node->level + 1; 2378 rc->backref_cache.path[node->level] = node; 2379 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2380 struct btrfs_ref ref = { 0 }; 2381 2382 cond_resched(); 2383 2384 upper = edge->node[UPPER]; 2385 root = select_reloc_root(trans, rc, upper, edges); 2386 if (IS_ERR(root)) { 2387 ret = PTR_ERR(root); 2388 goto next; 2389 } 2390 2391 if (upper->eb && !upper->locked) { 2392 if (!lowest) { 2393 ret = btrfs_bin_search(upper->eb, key, &slot); 2394 if (ret < 0) 2395 goto next; 2396 BUG_ON(ret); 2397 bytenr = btrfs_node_blockptr(upper->eb, slot); 2398 if (node->eb->start == bytenr) 2399 goto next; 2400 } 2401 btrfs_backref_drop_node_buffer(upper); 2402 } 2403 2404 if (!upper->eb) { 2405 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2406 if (ret) { 2407 if (ret > 0) 2408 ret = -ENOENT; 2409 2410 btrfs_release_path(path); 2411 break; 2412 } 2413 2414 if (!upper->eb) { 2415 upper->eb = path->nodes[upper->level]; 2416 path->nodes[upper->level] = NULL; 2417 } else { 2418 BUG_ON(upper->eb != path->nodes[upper->level]); 2419 } 2420 2421 upper->locked = 1; 2422 path->locks[upper->level] = 0; 2423 2424 slot = path->slots[upper->level]; 2425 btrfs_release_path(path); 2426 } else { 2427 ret = btrfs_bin_search(upper->eb, key, &slot); 2428 if (ret < 0) 2429 goto next; 2430 BUG_ON(ret); 2431 } 2432 2433 bytenr = btrfs_node_blockptr(upper->eb, slot); 2434 if (lowest) { 2435 if (bytenr != node->bytenr) { 2436 btrfs_err(root->fs_info, 2437 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2438 bytenr, node->bytenr, slot, 2439 upper->eb->start); 2440 ret = -EIO; 2441 goto next; 2442 } 2443 } else { 2444 if (node->eb->start == bytenr) 2445 goto next; 2446 } 2447 2448 blocksize = root->fs_info->nodesize; 2449 eb = btrfs_read_node_slot(upper->eb, slot); 2450 if (IS_ERR(eb)) { 2451 ret = PTR_ERR(eb); 2452 goto next; 2453 } 2454 btrfs_tree_lock(eb); 2455 2456 if (!node->eb) { 2457 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2458 slot, &eb, BTRFS_NESTING_COW); 2459 btrfs_tree_unlock(eb); 2460 free_extent_buffer(eb); 2461 if (ret < 0) 2462 goto next; 2463 /* 2464 * We've just COWed this block, it should have updated 2465 * the correct backref node entry. 2466 */ 2467 ASSERT(node->eb == eb); 2468 } else { 2469 btrfs_set_node_blockptr(upper->eb, slot, 2470 node->eb->start); 2471 btrfs_set_node_ptr_generation(upper->eb, slot, 2472 trans->transid); 2473 btrfs_mark_buffer_dirty(upper->eb); 2474 2475 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2476 node->eb->start, blocksize, 2477 upper->eb->start); 2478 btrfs_init_tree_ref(&ref, node->level, 2479 btrfs_header_owner(upper->eb), 2480 root->root_key.objectid, false); 2481 ret = btrfs_inc_extent_ref(trans, &ref); 2482 if (!ret) 2483 ret = btrfs_drop_subtree(trans, root, eb, 2484 upper->eb); 2485 if (ret) 2486 btrfs_abort_transaction(trans, ret); 2487 } 2488 next: 2489 if (!upper->pending) 2490 btrfs_backref_drop_node_buffer(upper); 2491 else 2492 btrfs_backref_unlock_node_buffer(upper); 2493 if (ret) 2494 break; 2495 } 2496 2497 if (!ret && node->pending) { 2498 btrfs_backref_drop_node_buffer(node); 2499 list_move_tail(&node->list, &rc->backref_cache.changed); 2500 node->pending = 0; 2501 } 2502 2503 path->lowest_level = 0; 2504 2505 /* 2506 * We should have allocated all of our space in the block rsv and thus 2507 * shouldn't ENOSPC. 2508 */ 2509 ASSERT(ret != -ENOSPC); 2510 return ret; 2511 } 2512 2513 static int link_to_upper(struct btrfs_trans_handle *trans, 2514 struct reloc_control *rc, 2515 struct btrfs_backref_node *node, 2516 struct btrfs_path *path) 2517 { 2518 struct btrfs_key key; 2519 2520 btrfs_node_key_to_cpu(node->eb, &key, 0); 2521 return do_relocation(trans, rc, node, &key, path, 0); 2522 } 2523 2524 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2525 struct reloc_control *rc, 2526 struct btrfs_path *path, int err) 2527 { 2528 LIST_HEAD(list); 2529 struct btrfs_backref_cache *cache = &rc->backref_cache; 2530 struct btrfs_backref_node *node; 2531 int level; 2532 int ret; 2533 2534 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2535 while (!list_empty(&cache->pending[level])) { 2536 node = list_entry(cache->pending[level].next, 2537 struct btrfs_backref_node, list); 2538 list_move_tail(&node->list, &list); 2539 BUG_ON(!node->pending); 2540 2541 if (!err) { 2542 ret = link_to_upper(trans, rc, node, path); 2543 if (ret < 0) 2544 err = ret; 2545 } 2546 } 2547 list_splice_init(&list, &cache->pending[level]); 2548 } 2549 return err; 2550 } 2551 2552 /* 2553 * mark a block and all blocks directly/indirectly reference the block 2554 * as processed. 2555 */ 2556 static void update_processed_blocks(struct reloc_control *rc, 2557 struct btrfs_backref_node *node) 2558 { 2559 struct btrfs_backref_node *next = node; 2560 struct btrfs_backref_edge *edge; 2561 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2562 int index = 0; 2563 2564 while (next) { 2565 cond_resched(); 2566 while (1) { 2567 if (next->processed) 2568 break; 2569 2570 mark_block_processed(rc, next); 2571 2572 if (list_empty(&next->upper)) 2573 break; 2574 2575 edge = list_entry(next->upper.next, 2576 struct btrfs_backref_edge, list[LOWER]); 2577 edges[index++] = edge; 2578 next = edge->node[UPPER]; 2579 } 2580 next = walk_down_backref(edges, &index); 2581 } 2582 } 2583 2584 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2585 { 2586 u32 blocksize = rc->extent_root->fs_info->nodesize; 2587 2588 if (test_range_bit(&rc->processed_blocks, bytenr, 2589 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2590 return 1; 2591 return 0; 2592 } 2593 2594 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2595 struct tree_block *block) 2596 { 2597 struct extent_buffer *eb; 2598 2599 eb = read_tree_block(fs_info, block->bytenr, block->owner, 2600 block->key.offset, block->level, NULL); 2601 if (IS_ERR(eb)) { 2602 return PTR_ERR(eb); 2603 } else if (!extent_buffer_uptodate(eb)) { 2604 free_extent_buffer(eb); 2605 return -EIO; 2606 } 2607 if (block->level == 0) 2608 btrfs_item_key_to_cpu(eb, &block->key, 0); 2609 else 2610 btrfs_node_key_to_cpu(eb, &block->key, 0); 2611 free_extent_buffer(eb); 2612 block->key_ready = 1; 2613 return 0; 2614 } 2615 2616 /* 2617 * helper function to relocate a tree block 2618 */ 2619 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2620 struct reloc_control *rc, 2621 struct btrfs_backref_node *node, 2622 struct btrfs_key *key, 2623 struct btrfs_path *path) 2624 { 2625 struct btrfs_root *root; 2626 int ret = 0; 2627 2628 if (!node) 2629 return 0; 2630 2631 /* 2632 * If we fail here we want to drop our backref_node because we are going 2633 * to start over and regenerate the tree for it. 2634 */ 2635 ret = reserve_metadata_space(trans, rc, node); 2636 if (ret) 2637 goto out; 2638 2639 BUG_ON(node->processed); 2640 root = select_one_root(node); 2641 if (IS_ERR(root)) { 2642 ret = PTR_ERR(root); 2643 2644 /* See explanation in select_one_root for the -EUCLEAN case. */ 2645 ASSERT(ret == -ENOENT); 2646 if (ret == -ENOENT) { 2647 ret = 0; 2648 update_processed_blocks(rc, node); 2649 } 2650 goto out; 2651 } 2652 2653 if (root) { 2654 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2655 /* 2656 * This block was the root block of a root, and this is 2657 * the first time we're processing the block and thus it 2658 * should not have had the ->new_bytenr modified and 2659 * should have not been included on the changed list. 2660 * 2661 * However in the case of corruption we could have 2662 * multiple refs pointing to the same block improperly, 2663 * and thus we would trip over these checks. ASSERT() 2664 * for the developer case, because it could indicate a 2665 * bug in the backref code, however error out for a 2666 * normal user in the case of corruption. 2667 */ 2668 ASSERT(node->new_bytenr == 0); 2669 ASSERT(list_empty(&node->list)); 2670 if (node->new_bytenr || !list_empty(&node->list)) { 2671 btrfs_err(root->fs_info, 2672 "bytenr %llu has improper references to it", 2673 node->bytenr); 2674 ret = -EUCLEAN; 2675 goto out; 2676 } 2677 ret = btrfs_record_root_in_trans(trans, root); 2678 if (ret) 2679 goto out; 2680 /* 2681 * Another thread could have failed, need to check if we 2682 * have reloc_root actually set. 2683 */ 2684 if (!root->reloc_root) { 2685 ret = -ENOENT; 2686 goto out; 2687 } 2688 root = root->reloc_root; 2689 node->new_bytenr = root->node->start; 2690 btrfs_put_root(node->root); 2691 node->root = btrfs_grab_root(root); 2692 ASSERT(node->root); 2693 list_add_tail(&node->list, &rc->backref_cache.changed); 2694 } else { 2695 path->lowest_level = node->level; 2696 if (root == root->fs_info->chunk_root) 2697 btrfs_reserve_chunk_metadata(trans, false); 2698 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2699 btrfs_release_path(path); 2700 if (root == root->fs_info->chunk_root) 2701 btrfs_trans_release_chunk_metadata(trans); 2702 if (ret > 0) 2703 ret = 0; 2704 } 2705 if (!ret) 2706 update_processed_blocks(rc, node); 2707 } else { 2708 ret = do_relocation(trans, rc, node, key, path, 1); 2709 } 2710 out: 2711 if (ret || node->level == 0 || node->cowonly) 2712 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2713 return ret; 2714 } 2715 2716 /* 2717 * relocate a list of blocks 2718 */ 2719 static noinline_for_stack 2720 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2721 struct reloc_control *rc, struct rb_root *blocks) 2722 { 2723 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2724 struct btrfs_backref_node *node; 2725 struct btrfs_path *path; 2726 struct tree_block *block; 2727 struct tree_block *next; 2728 int ret; 2729 int err = 0; 2730 2731 path = btrfs_alloc_path(); 2732 if (!path) { 2733 err = -ENOMEM; 2734 goto out_free_blocks; 2735 } 2736 2737 /* Kick in readahead for tree blocks with missing keys */ 2738 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2739 if (!block->key_ready) 2740 btrfs_readahead_tree_block(fs_info, block->bytenr, 2741 block->owner, 0, 2742 block->level); 2743 } 2744 2745 /* Get first keys */ 2746 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2747 if (!block->key_ready) { 2748 err = get_tree_block_key(fs_info, block); 2749 if (err) 2750 goto out_free_path; 2751 } 2752 } 2753 2754 /* Do tree relocation */ 2755 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2756 node = build_backref_tree(rc, &block->key, 2757 block->level, block->bytenr); 2758 if (IS_ERR(node)) { 2759 err = PTR_ERR(node); 2760 goto out; 2761 } 2762 2763 ret = relocate_tree_block(trans, rc, node, &block->key, 2764 path); 2765 if (ret < 0) { 2766 err = ret; 2767 break; 2768 } 2769 } 2770 out: 2771 err = finish_pending_nodes(trans, rc, path, err); 2772 2773 out_free_path: 2774 btrfs_free_path(path); 2775 out_free_blocks: 2776 free_block_list(blocks); 2777 return err; 2778 } 2779 2780 static noinline_for_stack int prealloc_file_extent_cluster( 2781 struct btrfs_inode *inode, 2782 struct file_extent_cluster *cluster) 2783 { 2784 u64 alloc_hint = 0; 2785 u64 start; 2786 u64 end; 2787 u64 offset = inode->index_cnt; 2788 u64 num_bytes; 2789 int nr; 2790 int ret = 0; 2791 u64 i_size = i_size_read(&inode->vfs_inode); 2792 u64 prealloc_start = cluster->start - offset; 2793 u64 prealloc_end = cluster->end - offset; 2794 u64 cur_offset = prealloc_start; 2795 2796 /* 2797 * For subpage case, previous i_size may not be aligned to PAGE_SIZE. 2798 * This means the range [i_size, PAGE_END + 1) is filled with zeros by 2799 * btrfs_do_readpage() call of previously relocated file cluster. 2800 * 2801 * If the current cluster starts in the above range, btrfs_do_readpage() 2802 * will skip the read, and relocate_one_page() will later writeback 2803 * the padding zeros as new data, causing data corruption. 2804 * 2805 * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 2806 */ 2807 if (!IS_ALIGNED(i_size, PAGE_SIZE)) { 2808 struct address_space *mapping = inode->vfs_inode.i_mapping; 2809 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2810 const u32 sectorsize = fs_info->sectorsize; 2811 struct page *page; 2812 2813 ASSERT(sectorsize < PAGE_SIZE); 2814 ASSERT(IS_ALIGNED(i_size, sectorsize)); 2815 2816 /* 2817 * Subpage can't handle page with DIRTY but without UPTODATE 2818 * bit as it can lead to the following deadlock: 2819 * 2820 * btrfs_readpage() 2821 * | Page already *locked* 2822 * |- btrfs_lock_and_flush_ordered_range() 2823 * |- btrfs_start_ordered_extent() 2824 * |- extent_write_cache_pages() 2825 * |- lock_page() 2826 * We try to lock the page we already hold. 2827 * 2828 * Here we just writeback the whole data reloc inode, so that 2829 * we will be ensured to have no dirty range in the page, and 2830 * are safe to clear the uptodate bits. 2831 * 2832 * This shouldn't cause too much overhead, as we need to write 2833 * the data back anyway. 2834 */ 2835 ret = filemap_write_and_wait(mapping); 2836 if (ret < 0) 2837 return ret; 2838 2839 clear_extent_bits(&inode->io_tree, i_size, 2840 round_up(i_size, PAGE_SIZE) - 1, 2841 EXTENT_UPTODATE); 2842 page = find_lock_page(mapping, i_size >> PAGE_SHIFT); 2843 /* 2844 * If page is freed we don't need to do anything then, as we 2845 * will re-read the whole page anyway. 2846 */ 2847 if (page) { 2848 btrfs_subpage_clear_uptodate(fs_info, page, i_size, 2849 round_up(i_size, PAGE_SIZE) - i_size); 2850 unlock_page(page); 2851 put_page(page); 2852 } 2853 } 2854 2855 BUG_ON(cluster->start != cluster->boundary[0]); 2856 ret = btrfs_alloc_data_chunk_ondemand(inode, 2857 prealloc_end + 1 - prealloc_start); 2858 if (ret) 2859 return ret; 2860 2861 btrfs_inode_lock(&inode->vfs_inode, 0); 2862 for (nr = 0; nr < cluster->nr; nr++) { 2863 start = cluster->boundary[nr] - offset; 2864 if (nr + 1 < cluster->nr) 2865 end = cluster->boundary[nr + 1] - 1 - offset; 2866 else 2867 end = cluster->end - offset; 2868 2869 lock_extent(&inode->io_tree, start, end); 2870 num_bytes = end + 1 - start; 2871 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2872 num_bytes, num_bytes, 2873 end + 1, &alloc_hint); 2874 cur_offset = end + 1; 2875 unlock_extent(&inode->io_tree, start, end); 2876 if (ret) 2877 break; 2878 } 2879 btrfs_inode_unlock(&inode->vfs_inode, 0); 2880 2881 if (cur_offset < prealloc_end) 2882 btrfs_free_reserved_data_space_noquota(inode->root->fs_info, 2883 prealloc_end + 1 - cur_offset); 2884 return ret; 2885 } 2886 2887 static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, 2888 u64 start, u64 end, u64 block_start) 2889 { 2890 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2891 struct extent_map *em; 2892 int ret = 0; 2893 2894 em = alloc_extent_map(); 2895 if (!em) 2896 return -ENOMEM; 2897 2898 em->start = start; 2899 em->len = end + 1 - start; 2900 em->block_len = em->len; 2901 em->block_start = block_start; 2902 set_bit(EXTENT_FLAG_PINNED, &em->flags); 2903 2904 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2905 while (1) { 2906 write_lock(&em_tree->lock); 2907 ret = add_extent_mapping(em_tree, em, 0); 2908 write_unlock(&em_tree->lock); 2909 if (ret != -EEXIST) { 2910 free_extent_map(em); 2911 break; 2912 } 2913 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 2914 } 2915 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2916 return ret; 2917 } 2918 2919 /* 2920 * Allow error injection to test balance/relocation cancellation 2921 */ 2922 noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) 2923 { 2924 return atomic_read(&fs_info->balance_cancel_req) || 2925 atomic_read(&fs_info->reloc_cancel_req) || 2926 fatal_signal_pending(current); 2927 } 2928 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2929 2930 static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, 2931 int cluster_nr) 2932 { 2933 /* Last extent, use cluster end directly */ 2934 if (cluster_nr >= cluster->nr - 1) 2935 return cluster->end; 2936 2937 /* Use next boundary start*/ 2938 return cluster->boundary[cluster_nr + 1] - 1; 2939 } 2940 2941 static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, 2942 struct file_extent_cluster *cluster, 2943 int *cluster_nr, unsigned long page_index) 2944 { 2945 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2946 u64 offset = BTRFS_I(inode)->index_cnt; 2947 const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; 2948 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2949 struct page *page; 2950 u64 page_start; 2951 u64 page_end; 2952 u64 cur; 2953 int ret; 2954 2955 ASSERT(page_index <= last_index); 2956 page = find_lock_page(inode->i_mapping, page_index); 2957 if (!page) { 2958 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 2959 page_index, last_index + 1 - page_index); 2960 page = find_or_create_page(inode->i_mapping, page_index, mask); 2961 if (!page) 2962 return -ENOMEM; 2963 } 2964 ret = set_page_extent_mapped(page); 2965 if (ret < 0) 2966 goto release_page; 2967 2968 if (PageReadahead(page)) 2969 page_cache_async_readahead(inode->i_mapping, ra, NULL, page, 2970 page_index, last_index + 1 - page_index); 2971 2972 if (!PageUptodate(page)) { 2973 btrfs_readpage(NULL, page); 2974 lock_page(page); 2975 if (!PageUptodate(page)) { 2976 ret = -EIO; 2977 goto release_page; 2978 } 2979 } 2980 2981 page_start = page_offset(page); 2982 page_end = page_start + PAGE_SIZE - 1; 2983 2984 /* 2985 * Start from the cluster, as for subpage case, the cluster can start 2986 * inside the page. 2987 */ 2988 cur = max(page_start, cluster->boundary[*cluster_nr] - offset); 2989 while (cur <= page_end) { 2990 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2991 u64 extent_end = get_cluster_boundary_end(cluster, 2992 *cluster_nr) - offset; 2993 u64 clamped_start = max(page_start, extent_start); 2994 u64 clamped_end = min(page_end, extent_end); 2995 u32 clamped_len = clamped_end + 1 - clamped_start; 2996 2997 /* Reserve metadata for this range */ 2998 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2999 clamped_len); 3000 if (ret) 3001 goto release_page; 3002 3003 /* Mark the range delalloc and dirty for later writeback */ 3004 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); 3005 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 3006 clamped_end, 0, NULL); 3007 if (ret) { 3008 clear_extent_bits(&BTRFS_I(inode)->io_tree, 3009 clamped_start, clamped_end, 3010 EXTENT_LOCKED | EXTENT_BOUNDARY); 3011 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3012 clamped_len, true); 3013 btrfs_delalloc_release_extents(BTRFS_I(inode), 3014 clamped_len); 3015 goto release_page; 3016 } 3017 btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); 3018 3019 /* 3020 * Set the boundary if it's inside the page. 3021 * Data relocation requires the destination extents to have the 3022 * same size as the source. 3023 * EXTENT_BOUNDARY bit prevents current extent from being merged 3024 * with previous extent. 3025 */ 3026 if (in_range(cluster->boundary[*cluster_nr] - offset, 3027 page_start, PAGE_SIZE)) { 3028 u64 boundary_start = cluster->boundary[*cluster_nr] - 3029 offset; 3030 u64 boundary_end = boundary_start + 3031 fs_info->sectorsize - 1; 3032 3033 set_extent_bits(&BTRFS_I(inode)->io_tree, 3034 boundary_start, boundary_end, 3035 EXTENT_BOUNDARY); 3036 } 3037 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); 3038 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 3039 cur += clamped_len; 3040 3041 /* Crossed extent end, go to next extent */ 3042 if (cur >= extent_end) { 3043 (*cluster_nr)++; 3044 /* Just finished the last extent of the cluster, exit. */ 3045 if (*cluster_nr >= cluster->nr) 3046 break; 3047 } 3048 } 3049 unlock_page(page); 3050 put_page(page); 3051 3052 balance_dirty_pages_ratelimited(inode->i_mapping); 3053 btrfs_throttle(fs_info); 3054 if (btrfs_should_cancel_balance(fs_info)) 3055 ret = -ECANCELED; 3056 return ret; 3057 3058 release_page: 3059 unlock_page(page); 3060 put_page(page); 3061 return ret; 3062 } 3063 3064 static int relocate_file_extent_cluster(struct inode *inode, 3065 struct file_extent_cluster *cluster) 3066 { 3067 u64 offset = BTRFS_I(inode)->index_cnt; 3068 unsigned long index; 3069 unsigned long last_index; 3070 struct file_ra_state *ra; 3071 int cluster_nr = 0; 3072 int ret = 0; 3073 3074 if (!cluster->nr) 3075 return 0; 3076 3077 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3078 if (!ra) 3079 return -ENOMEM; 3080 3081 ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); 3082 if (ret) 3083 goto out; 3084 3085 file_ra_state_init(ra, inode->i_mapping); 3086 3087 ret = setup_relocation_extent_mapping(inode, cluster->start - offset, 3088 cluster->end - offset, cluster->start); 3089 if (ret) 3090 goto out; 3091 3092 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3093 for (index = (cluster->start - offset) >> PAGE_SHIFT; 3094 index <= last_index && !ret; index++) 3095 ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); 3096 if (ret == 0) 3097 WARN_ON(cluster_nr != cluster->nr); 3098 out: 3099 kfree(ra); 3100 return ret; 3101 } 3102 3103 static noinline_for_stack 3104 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3105 struct file_extent_cluster *cluster) 3106 { 3107 int ret; 3108 3109 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3110 ret = relocate_file_extent_cluster(inode, cluster); 3111 if (ret) 3112 return ret; 3113 cluster->nr = 0; 3114 } 3115 3116 if (!cluster->nr) 3117 cluster->start = extent_key->objectid; 3118 else 3119 BUG_ON(cluster->nr >= MAX_EXTENTS); 3120 cluster->end = extent_key->objectid + extent_key->offset - 1; 3121 cluster->boundary[cluster->nr] = extent_key->objectid; 3122 cluster->nr++; 3123 3124 if (cluster->nr >= MAX_EXTENTS) { 3125 ret = relocate_file_extent_cluster(inode, cluster); 3126 if (ret) 3127 return ret; 3128 cluster->nr = 0; 3129 } 3130 return 0; 3131 } 3132 3133 /* 3134 * helper to add a tree block to the list. 3135 * the major work is getting the generation and level of the block 3136 */ 3137 static int add_tree_block(struct reloc_control *rc, 3138 struct btrfs_key *extent_key, 3139 struct btrfs_path *path, 3140 struct rb_root *blocks) 3141 { 3142 struct extent_buffer *eb; 3143 struct btrfs_extent_item *ei; 3144 struct btrfs_tree_block_info *bi; 3145 struct tree_block *block; 3146 struct rb_node *rb_node; 3147 u32 item_size; 3148 int level = -1; 3149 u64 generation; 3150 u64 owner = 0; 3151 3152 eb = path->nodes[0]; 3153 item_size = btrfs_item_size(eb, path->slots[0]); 3154 3155 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3156 item_size >= sizeof(*ei) + sizeof(*bi)) { 3157 unsigned long ptr = 0, end; 3158 3159 ei = btrfs_item_ptr(eb, path->slots[0], 3160 struct btrfs_extent_item); 3161 end = (unsigned long)ei + item_size; 3162 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3163 bi = (struct btrfs_tree_block_info *)(ei + 1); 3164 level = btrfs_tree_block_level(eb, bi); 3165 ptr = (unsigned long)(bi + 1); 3166 } else { 3167 level = (int)extent_key->offset; 3168 ptr = (unsigned long)(ei + 1); 3169 } 3170 generation = btrfs_extent_generation(eb, ei); 3171 3172 /* 3173 * We're reading random blocks without knowing their owner ahead 3174 * of time. This is ok most of the time, as all reloc roots and 3175 * fs roots have the same lock type. However normal trees do 3176 * not, and the only way to know ahead of time is to read the 3177 * inline ref offset. We know it's an fs root if 3178 * 3179 * 1. There's more than one ref. 3180 * 2. There's a SHARED_DATA_REF_KEY set. 3181 * 3. FULL_BACKREF is set on the flags. 3182 * 3183 * Otherwise it's safe to assume that the ref offset == the 3184 * owner of this block, so we can use that when calling 3185 * read_tree_block. 3186 */ 3187 if (btrfs_extent_refs(eb, ei) == 1 && 3188 !(btrfs_extent_flags(eb, ei) & 3189 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3190 ptr < end) { 3191 struct btrfs_extent_inline_ref *iref; 3192 int type; 3193 3194 iref = (struct btrfs_extent_inline_ref *)ptr; 3195 type = btrfs_get_extent_inline_ref_type(eb, iref, 3196 BTRFS_REF_TYPE_BLOCK); 3197 if (type == BTRFS_REF_TYPE_INVALID) 3198 return -EINVAL; 3199 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3200 owner = btrfs_extent_inline_ref_offset(eb, iref); 3201 } 3202 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3203 btrfs_print_v0_err(eb->fs_info); 3204 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3205 return -EINVAL; 3206 } else { 3207 BUG(); 3208 } 3209 3210 btrfs_release_path(path); 3211 3212 BUG_ON(level == -1); 3213 3214 block = kmalloc(sizeof(*block), GFP_NOFS); 3215 if (!block) 3216 return -ENOMEM; 3217 3218 block->bytenr = extent_key->objectid; 3219 block->key.objectid = rc->extent_root->fs_info->nodesize; 3220 block->key.offset = generation; 3221 block->level = level; 3222 block->key_ready = 0; 3223 block->owner = owner; 3224 3225 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3226 if (rb_node) 3227 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3228 -EEXIST); 3229 3230 return 0; 3231 } 3232 3233 /* 3234 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3235 */ 3236 static int __add_tree_block(struct reloc_control *rc, 3237 u64 bytenr, u32 blocksize, 3238 struct rb_root *blocks) 3239 { 3240 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3241 struct btrfs_path *path; 3242 struct btrfs_key key; 3243 int ret; 3244 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3245 3246 if (tree_block_processed(bytenr, rc)) 3247 return 0; 3248 3249 if (rb_simple_search(blocks, bytenr)) 3250 return 0; 3251 3252 path = btrfs_alloc_path(); 3253 if (!path) 3254 return -ENOMEM; 3255 again: 3256 key.objectid = bytenr; 3257 if (skinny) { 3258 key.type = BTRFS_METADATA_ITEM_KEY; 3259 key.offset = (u64)-1; 3260 } else { 3261 key.type = BTRFS_EXTENT_ITEM_KEY; 3262 key.offset = blocksize; 3263 } 3264 3265 path->search_commit_root = 1; 3266 path->skip_locking = 1; 3267 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3268 if (ret < 0) 3269 goto out; 3270 3271 if (ret > 0 && skinny) { 3272 if (path->slots[0]) { 3273 path->slots[0]--; 3274 btrfs_item_key_to_cpu(path->nodes[0], &key, 3275 path->slots[0]); 3276 if (key.objectid == bytenr && 3277 (key.type == BTRFS_METADATA_ITEM_KEY || 3278 (key.type == BTRFS_EXTENT_ITEM_KEY && 3279 key.offset == blocksize))) 3280 ret = 0; 3281 } 3282 3283 if (ret) { 3284 skinny = false; 3285 btrfs_release_path(path); 3286 goto again; 3287 } 3288 } 3289 if (ret) { 3290 ASSERT(ret == 1); 3291 btrfs_print_leaf(path->nodes[0]); 3292 btrfs_err(fs_info, 3293 "tree block extent item (%llu) is not found in extent tree", 3294 bytenr); 3295 WARN_ON(1); 3296 ret = -EINVAL; 3297 goto out; 3298 } 3299 3300 ret = add_tree_block(rc, &key, path, blocks); 3301 out: 3302 btrfs_free_path(path); 3303 return ret; 3304 } 3305 3306 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3307 struct btrfs_block_group *block_group, 3308 struct inode *inode, 3309 u64 ino) 3310 { 3311 struct btrfs_root *root = fs_info->tree_root; 3312 struct btrfs_trans_handle *trans; 3313 int ret = 0; 3314 3315 if (inode) 3316 goto truncate; 3317 3318 inode = btrfs_iget(fs_info->sb, ino, root); 3319 if (IS_ERR(inode)) 3320 return -ENOENT; 3321 3322 truncate: 3323 ret = btrfs_check_trunc_cache_free_space(fs_info, 3324 &fs_info->global_block_rsv); 3325 if (ret) 3326 goto out; 3327 3328 trans = btrfs_join_transaction(root); 3329 if (IS_ERR(trans)) { 3330 ret = PTR_ERR(trans); 3331 goto out; 3332 } 3333 3334 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3335 3336 btrfs_end_transaction(trans); 3337 btrfs_btree_balance_dirty(fs_info); 3338 out: 3339 iput(inode); 3340 return ret; 3341 } 3342 3343 /* 3344 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3345 * cache inode, to avoid free space cache data extent blocking data relocation. 3346 */ 3347 static int delete_v1_space_cache(struct extent_buffer *leaf, 3348 struct btrfs_block_group *block_group, 3349 u64 data_bytenr) 3350 { 3351 u64 space_cache_ino; 3352 struct btrfs_file_extent_item *ei; 3353 struct btrfs_key key; 3354 bool found = false; 3355 int i; 3356 int ret; 3357 3358 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3359 return 0; 3360 3361 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3362 u8 type; 3363 3364 btrfs_item_key_to_cpu(leaf, &key, i); 3365 if (key.type != BTRFS_EXTENT_DATA_KEY) 3366 continue; 3367 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3368 type = btrfs_file_extent_type(leaf, ei); 3369 3370 if ((type == BTRFS_FILE_EXTENT_REG || 3371 type == BTRFS_FILE_EXTENT_PREALLOC) && 3372 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3373 found = true; 3374 space_cache_ino = key.objectid; 3375 break; 3376 } 3377 } 3378 if (!found) 3379 return -ENOENT; 3380 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3381 space_cache_ino); 3382 return ret; 3383 } 3384 3385 /* 3386 * helper to find all tree blocks that reference a given data extent 3387 */ 3388 static noinline_for_stack 3389 int add_data_references(struct reloc_control *rc, 3390 struct btrfs_key *extent_key, 3391 struct btrfs_path *path, 3392 struct rb_root *blocks) 3393 { 3394 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3395 struct ulist *leaves = NULL; 3396 struct ulist_iterator leaf_uiter; 3397 struct ulist_node *ref_node = NULL; 3398 const u32 blocksize = fs_info->nodesize; 3399 int ret = 0; 3400 3401 btrfs_release_path(path); 3402 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, 3403 0, &leaves, NULL, true); 3404 if (ret < 0) 3405 return ret; 3406 3407 ULIST_ITER_INIT(&leaf_uiter); 3408 while ((ref_node = ulist_next(leaves, &leaf_uiter))) { 3409 struct extent_buffer *eb; 3410 3411 eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); 3412 if (IS_ERR(eb)) { 3413 ret = PTR_ERR(eb); 3414 break; 3415 } 3416 ret = delete_v1_space_cache(eb, rc->block_group, 3417 extent_key->objectid); 3418 free_extent_buffer(eb); 3419 if (ret < 0) 3420 break; 3421 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3422 if (ret < 0) 3423 break; 3424 } 3425 if (ret < 0) 3426 free_block_list(blocks); 3427 ulist_free(leaves); 3428 return ret; 3429 } 3430 3431 /* 3432 * helper to find next unprocessed extent 3433 */ 3434 static noinline_for_stack 3435 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3436 struct btrfs_key *extent_key) 3437 { 3438 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3439 struct btrfs_key key; 3440 struct extent_buffer *leaf; 3441 u64 start, end, last; 3442 int ret; 3443 3444 last = rc->block_group->start + rc->block_group->length; 3445 while (1) { 3446 cond_resched(); 3447 if (rc->search_start >= last) { 3448 ret = 1; 3449 break; 3450 } 3451 3452 key.objectid = rc->search_start; 3453 key.type = BTRFS_EXTENT_ITEM_KEY; 3454 key.offset = 0; 3455 3456 path->search_commit_root = 1; 3457 path->skip_locking = 1; 3458 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3459 0, 0); 3460 if (ret < 0) 3461 break; 3462 next: 3463 leaf = path->nodes[0]; 3464 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3465 ret = btrfs_next_leaf(rc->extent_root, path); 3466 if (ret != 0) 3467 break; 3468 leaf = path->nodes[0]; 3469 } 3470 3471 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3472 if (key.objectid >= last) { 3473 ret = 1; 3474 break; 3475 } 3476 3477 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3478 key.type != BTRFS_METADATA_ITEM_KEY) { 3479 path->slots[0]++; 3480 goto next; 3481 } 3482 3483 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3484 key.objectid + key.offset <= rc->search_start) { 3485 path->slots[0]++; 3486 goto next; 3487 } 3488 3489 if (key.type == BTRFS_METADATA_ITEM_KEY && 3490 key.objectid + fs_info->nodesize <= 3491 rc->search_start) { 3492 path->slots[0]++; 3493 goto next; 3494 } 3495 3496 ret = find_first_extent_bit(&rc->processed_blocks, 3497 key.objectid, &start, &end, 3498 EXTENT_DIRTY, NULL); 3499 3500 if (ret == 0 && start <= key.objectid) { 3501 btrfs_release_path(path); 3502 rc->search_start = end + 1; 3503 } else { 3504 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3505 rc->search_start = key.objectid + key.offset; 3506 else 3507 rc->search_start = key.objectid + 3508 fs_info->nodesize; 3509 memcpy(extent_key, &key, sizeof(key)); 3510 return 0; 3511 } 3512 } 3513 btrfs_release_path(path); 3514 return ret; 3515 } 3516 3517 static void set_reloc_control(struct reloc_control *rc) 3518 { 3519 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3520 3521 mutex_lock(&fs_info->reloc_mutex); 3522 fs_info->reloc_ctl = rc; 3523 mutex_unlock(&fs_info->reloc_mutex); 3524 } 3525 3526 static void unset_reloc_control(struct reloc_control *rc) 3527 { 3528 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3529 3530 mutex_lock(&fs_info->reloc_mutex); 3531 fs_info->reloc_ctl = NULL; 3532 mutex_unlock(&fs_info->reloc_mutex); 3533 } 3534 3535 static noinline_for_stack 3536 int prepare_to_relocate(struct reloc_control *rc) 3537 { 3538 struct btrfs_trans_handle *trans; 3539 int ret; 3540 3541 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3542 BTRFS_BLOCK_RSV_TEMP); 3543 if (!rc->block_rsv) 3544 return -ENOMEM; 3545 3546 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3547 rc->search_start = rc->block_group->start; 3548 rc->extents_found = 0; 3549 rc->nodes_relocated = 0; 3550 rc->merging_rsv_size = 0; 3551 rc->reserved_bytes = 0; 3552 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3553 RELOCATION_RESERVED_NODES; 3554 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3555 rc->block_rsv, rc->block_rsv->size, 3556 BTRFS_RESERVE_FLUSH_ALL); 3557 if (ret) 3558 return ret; 3559 3560 rc->create_reloc_tree = 1; 3561 set_reloc_control(rc); 3562 3563 trans = btrfs_join_transaction(rc->extent_root); 3564 if (IS_ERR(trans)) { 3565 unset_reloc_control(rc); 3566 /* 3567 * extent tree is not a ref_cow tree and has no reloc_root to 3568 * cleanup. And callers are responsible to free the above 3569 * block rsv. 3570 */ 3571 return PTR_ERR(trans); 3572 } 3573 return btrfs_commit_transaction(trans); 3574 } 3575 3576 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3577 { 3578 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3579 struct rb_root blocks = RB_ROOT; 3580 struct btrfs_key key; 3581 struct btrfs_trans_handle *trans = NULL; 3582 struct btrfs_path *path; 3583 struct btrfs_extent_item *ei; 3584 u64 flags; 3585 int ret; 3586 int err = 0; 3587 int progress = 0; 3588 3589 path = btrfs_alloc_path(); 3590 if (!path) 3591 return -ENOMEM; 3592 path->reada = READA_FORWARD; 3593 3594 ret = prepare_to_relocate(rc); 3595 if (ret) { 3596 err = ret; 3597 goto out_free; 3598 } 3599 3600 while (1) { 3601 rc->reserved_bytes = 0; 3602 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3603 rc->block_rsv->size, 3604 BTRFS_RESERVE_FLUSH_ALL); 3605 if (ret) { 3606 err = ret; 3607 break; 3608 } 3609 progress++; 3610 trans = btrfs_start_transaction(rc->extent_root, 0); 3611 if (IS_ERR(trans)) { 3612 err = PTR_ERR(trans); 3613 trans = NULL; 3614 break; 3615 } 3616 restart: 3617 if (update_backref_cache(trans, &rc->backref_cache)) { 3618 btrfs_end_transaction(trans); 3619 trans = NULL; 3620 continue; 3621 } 3622 3623 ret = find_next_extent(rc, path, &key); 3624 if (ret < 0) 3625 err = ret; 3626 if (ret != 0) 3627 break; 3628 3629 rc->extents_found++; 3630 3631 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3632 struct btrfs_extent_item); 3633 flags = btrfs_extent_flags(path->nodes[0], ei); 3634 3635 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3636 ret = add_tree_block(rc, &key, path, &blocks); 3637 } else if (rc->stage == UPDATE_DATA_PTRS && 3638 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3639 ret = add_data_references(rc, &key, path, &blocks); 3640 } else { 3641 btrfs_release_path(path); 3642 ret = 0; 3643 } 3644 if (ret < 0) { 3645 err = ret; 3646 break; 3647 } 3648 3649 if (!RB_EMPTY_ROOT(&blocks)) { 3650 ret = relocate_tree_blocks(trans, rc, &blocks); 3651 if (ret < 0) { 3652 if (ret != -EAGAIN) { 3653 err = ret; 3654 break; 3655 } 3656 rc->extents_found--; 3657 rc->search_start = key.objectid; 3658 } 3659 } 3660 3661 btrfs_end_transaction_throttle(trans); 3662 btrfs_btree_balance_dirty(fs_info); 3663 trans = NULL; 3664 3665 if (rc->stage == MOVE_DATA_EXTENTS && 3666 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3667 rc->found_file_extent = 1; 3668 ret = relocate_data_extent(rc->data_inode, 3669 &key, &rc->cluster); 3670 if (ret < 0) { 3671 err = ret; 3672 break; 3673 } 3674 } 3675 if (btrfs_should_cancel_balance(fs_info)) { 3676 err = -ECANCELED; 3677 break; 3678 } 3679 } 3680 if (trans && progress && err == -ENOSPC) { 3681 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3682 if (ret == 1) { 3683 err = 0; 3684 progress = 0; 3685 goto restart; 3686 } 3687 } 3688 3689 btrfs_release_path(path); 3690 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 3691 3692 if (trans) { 3693 btrfs_end_transaction_throttle(trans); 3694 btrfs_btree_balance_dirty(fs_info); 3695 } 3696 3697 if (!err) { 3698 ret = relocate_file_extent_cluster(rc->data_inode, 3699 &rc->cluster); 3700 if (ret < 0) 3701 err = ret; 3702 } 3703 3704 rc->create_reloc_tree = 0; 3705 set_reloc_control(rc); 3706 3707 btrfs_backref_release_cache(&rc->backref_cache); 3708 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3709 3710 /* 3711 * Even in the case when the relocation is cancelled, we should all go 3712 * through prepare_to_merge() and merge_reloc_roots(). 3713 * 3714 * For error (including cancelled balance), prepare_to_merge() will 3715 * mark all reloc trees orphan, then queue them for cleanup in 3716 * merge_reloc_roots() 3717 */ 3718 err = prepare_to_merge(rc, err); 3719 3720 merge_reloc_roots(rc); 3721 3722 rc->merge_reloc_tree = 0; 3723 unset_reloc_control(rc); 3724 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3725 3726 /* get rid of pinned extents */ 3727 trans = btrfs_join_transaction(rc->extent_root); 3728 if (IS_ERR(trans)) { 3729 err = PTR_ERR(trans); 3730 goto out_free; 3731 } 3732 ret = btrfs_commit_transaction(trans); 3733 if (ret && !err) 3734 err = ret; 3735 out_free: 3736 ret = clean_dirty_subvols(rc); 3737 if (ret < 0 && !err) 3738 err = ret; 3739 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3740 btrfs_free_path(path); 3741 return err; 3742 } 3743 3744 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3745 struct btrfs_root *root, u64 objectid) 3746 { 3747 struct btrfs_path *path; 3748 struct btrfs_inode_item *item; 3749 struct extent_buffer *leaf; 3750 int ret; 3751 3752 path = btrfs_alloc_path(); 3753 if (!path) 3754 return -ENOMEM; 3755 3756 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3757 if (ret) 3758 goto out; 3759 3760 leaf = path->nodes[0]; 3761 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3762 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3763 btrfs_set_inode_generation(leaf, item, 1); 3764 btrfs_set_inode_size(leaf, item, 0); 3765 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3766 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3767 BTRFS_INODE_PREALLOC); 3768 btrfs_mark_buffer_dirty(leaf); 3769 out: 3770 btrfs_free_path(path); 3771 return ret; 3772 } 3773 3774 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3775 struct btrfs_root *root, u64 objectid) 3776 { 3777 struct btrfs_path *path; 3778 struct btrfs_key key; 3779 int ret = 0; 3780 3781 path = btrfs_alloc_path(); 3782 if (!path) { 3783 ret = -ENOMEM; 3784 goto out; 3785 } 3786 3787 key.objectid = objectid; 3788 key.type = BTRFS_INODE_ITEM_KEY; 3789 key.offset = 0; 3790 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3791 if (ret) { 3792 if (ret > 0) 3793 ret = -ENOENT; 3794 goto out; 3795 } 3796 ret = btrfs_del_item(trans, root, path); 3797 out: 3798 if (ret) 3799 btrfs_abort_transaction(trans, ret); 3800 btrfs_free_path(path); 3801 } 3802 3803 /* 3804 * helper to create inode for data relocation. 3805 * the inode is in data relocation tree and its link count is 0 3806 */ 3807 static noinline_for_stack 3808 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3809 struct btrfs_block_group *group) 3810 { 3811 struct inode *inode = NULL; 3812 struct btrfs_trans_handle *trans; 3813 struct btrfs_root *root; 3814 u64 objectid; 3815 int err = 0; 3816 3817 root = btrfs_grab_root(fs_info->data_reloc_root); 3818 trans = btrfs_start_transaction(root, 6); 3819 if (IS_ERR(trans)) { 3820 btrfs_put_root(root); 3821 return ERR_CAST(trans); 3822 } 3823 3824 err = btrfs_get_free_objectid(root, &objectid); 3825 if (err) 3826 goto out; 3827 3828 err = __insert_orphan_inode(trans, root, objectid); 3829 if (err) 3830 goto out; 3831 3832 inode = btrfs_iget(fs_info->sb, objectid, root); 3833 if (IS_ERR(inode)) { 3834 delete_orphan_inode(trans, root, objectid); 3835 err = PTR_ERR(inode); 3836 inode = NULL; 3837 goto out; 3838 } 3839 BTRFS_I(inode)->index_cnt = group->start; 3840 3841 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 3842 out: 3843 btrfs_put_root(root); 3844 btrfs_end_transaction(trans); 3845 btrfs_btree_balance_dirty(fs_info); 3846 if (err) { 3847 if (inode) 3848 iput(inode); 3849 inode = ERR_PTR(err); 3850 } 3851 return inode; 3852 } 3853 3854 /* 3855 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3856 * has been requested meanwhile and don't start in that case. 3857 * 3858 * Return: 3859 * 0 success 3860 * -EINPROGRESS operation is already in progress, that's probably a bug 3861 * -ECANCELED cancellation request was set before the operation started 3862 * -EAGAIN can not start because there are ongoing send operations 3863 */ 3864 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3865 { 3866 spin_lock(&fs_info->send_reloc_lock); 3867 if (fs_info->send_in_progress) { 3868 btrfs_warn_rl(fs_info, 3869 "cannot run relocation while send operations are in progress (%d in progress)", 3870 fs_info->send_in_progress); 3871 spin_unlock(&fs_info->send_reloc_lock); 3872 return -EAGAIN; 3873 } 3874 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3875 /* This should not happen */ 3876 spin_unlock(&fs_info->send_reloc_lock); 3877 btrfs_err(fs_info, "reloc already running, cannot start"); 3878 return -EINPROGRESS; 3879 } 3880 spin_unlock(&fs_info->send_reloc_lock); 3881 3882 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3883 btrfs_info(fs_info, "chunk relocation canceled on start"); 3884 /* 3885 * On cancel, clear all requests but let the caller mark 3886 * the end after cleanup operations. 3887 */ 3888 atomic_set(&fs_info->reloc_cancel_req, 0); 3889 return -ECANCELED; 3890 } 3891 return 0; 3892 } 3893 3894 /* 3895 * Mark end of chunk relocation that is cancellable and wake any waiters. 3896 */ 3897 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3898 { 3899 /* Requested after start, clear bit first so any waiters can continue */ 3900 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3901 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3902 spin_lock(&fs_info->send_reloc_lock); 3903 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3904 spin_unlock(&fs_info->send_reloc_lock); 3905 atomic_set(&fs_info->reloc_cancel_req, 0); 3906 } 3907 3908 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3909 { 3910 struct reloc_control *rc; 3911 3912 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3913 if (!rc) 3914 return NULL; 3915 3916 INIT_LIST_HEAD(&rc->reloc_roots); 3917 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3918 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); 3919 mapping_tree_init(&rc->reloc_root_tree); 3920 extent_io_tree_init(fs_info, &rc->processed_blocks, 3921 IO_TREE_RELOC_BLOCKS, NULL); 3922 return rc; 3923 } 3924 3925 static void free_reloc_control(struct reloc_control *rc) 3926 { 3927 struct mapping_node *node, *tmp; 3928 3929 free_reloc_roots(&rc->reloc_roots); 3930 rbtree_postorder_for_each_entry_safe(node, tmp, 3931 &rc->reloc_root_tree.rb_root, rb_node) 3932 kfree(node); 3933 3934 kfree(rc); 3935 } 3936 3937 /* 3938 * Print the block group being relocated 3939 */ 3940 static void describe_relocation(struct btrfs_fs_info *fs_info, 3941 struct btrfs_block_group *block_group) 3942 { 3943 char buf[128] = {'\0'}; 3944 3945 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3946 3947 btrfs_info(fs_info, 3948 "relocating block group %llu flags %s", 3949 block_group->start, buf); 3950 } 3951 3952 static const char *stage_to_string(int stage) 3953 { 3954 if (stage == MOVE_DATA_EXTENTS) 3955 return "move data extents"; 3956 if (stage == UPDATE_DATA_PTRS) 3957 return "update data pointers"; 3958 return "unknown"; 3959 } 3960 3961 /* 3962 * function to relocate all extents in a block group. 3963 */ 3964 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 3965 { 3966 struct btrfs_block_group *bg; 3967 struct btrfs_root *extent_root = fs_info->extent_root; 3968 struct reloc_control *rc; 3969 struct inode *inode; 3970 struct btrfs_path *path; 3971 int ret; 3972 int rw = 0; 3973 int err = 0; 3974 3975 bg = btrfs_lookup_block_group(fs_info, group_start); 3976 if (!bg) 3977 return -ENOENT; 3978 3979 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3980 btrfs_put_block_group(bg); 3981 return -ETXTBSY; 3982 } 3983 3984 rc = alloc_reloc_control(fs_info); 3985 if (!rc) { 3986 btrfs_put_block_group(bg); 3987 return -ENOMEM; 3988 } 3989 3990 ret = reloc_chunk_start(fs_info); 3991 if (ret < 0) { 3992 err = ret; 3993 goto out_put_bg; 3994 } 3995 3996 rc->extent_root = extent_root; 3997 rc->block_group = bg; 3998 3999 ret = btrfs_inc_block_group_ro(rc->block_group, true); 4000 if (ret) { 4001 err = ret; 4002 goto out; 4003 } 4004 rw = 1; 4005 4006 path = btrfs_alloc_path(); 4007 if (!path) { 4008 err = -ENOMEM; 4009 goto out; 4010 } 4011 4012 inode = lookup_free_space_inode(rc->block_group, path); 4013 btrfs_free_path(path); 4014 4015 if (!IS_ERR(inode)) 4016 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4017 else 4018 ret = PTR_ERR(inode); 4019 4020 if (ret && ret != -ENOENT) { 4021 err = ret; 4022 goto out; 4023 } 4024 4025 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4026 if (IS_ERR(rc->data_inode)) { 4027 err = PTR_ERR(rc->data_inode); 4028 rc->data_inode = NULL; 4029 goto out; 4030 } 4031 4032 describe_relocation(fs_info, rc->block_group); 4033 4034 btrfs_wait_block_group_reservations(rc->block_group); 4035 btrfs_wait_nocow_writers(rc->block_group); 4036 btrfs_wait_ordered_roots(fs_info, U64_MAX, 4037 rc->block_group->start, 4038 rc->block_group->length); 4039 4040 ret = btrfs_zone_finish(rc->block_group); 4041 WARN_ON(ret && ret != -EAGAIN); 4042 4043 while (1) { 4044 int finishes_stage; 4045 4046 mutex_lock(&fs_info->cleaner_mutex); 4047 ret = relocate_block_group(rc); 4048 mutex_unlock(&fs_info->cleaner_mutex); 4049 if (ret < 0) 4050 err = ret; 4051 4052 finishes_stage = rc->stage; 4053 /* 4054 * We may have gotten ENOSPC after we already dirtied some 4055 * extents. If writeout happens while we're relocating a 4056 * different block group we could end up hitting the 4057 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 4058 * btrfs_reloc_cow_block. Make sure we write everything out 4059 * properly so we don't trip over this problem, and then break 4060 * out of the loop if we hit an error. 4061 */ 4062 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 4063 ret = btrfs_wait_ordered_range(rc->data_inode, 0, 4064 (u64)-1); 4065 if (ret) 4066 err = ret; 4067 invalidate_mapping_pages(rc->data_inode->i_mapping, 4068 0, -1); 4069 rc->stage = UPDATE_DATA_PTRS; 4070 } 4071 4072 if (err < 0) 4073 goto out; 4074 4075 if (rc->extents_found == 0) 4076 break; 4077 4078 btrfs_info(fs_info, "found %llu extents, stage: %s", 4079 rc->extents_found, stage_to_string(finishes_stage)); 4080 } 4081 4082 WARN_ON(rc->block_group->pinned > 0); 4083 WARN_ON(rc->block_group->reserved > 0); 4084 WARN_ON(rc->block_group->used > 0); 4085 out: 4086 if (err && rw) 4087 btrfs_dec_block_group_ro(rc->block_group); 4088 iput(rc->data_inode); 4089 out_put_bg: 4090 btrfs_put_block_group(bg); 4091 reloc_chunk_end(fs_info); 4092 free_reloc_control(rc); 4093 return err; 4094 } 4095 4096 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) 4097 { 4098 struct btrfs_fs_info *fs_info = root->fs_info; 4099 struct btrfs_trans_handle *trans; 4100 int ret, err; 4101 4102 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4103 if (IS_ERR(trans)) 4104 return PTR_ERR(trans); 4105 4106 memset(&root->root_item.drop_progress, 0, 4107 sizeof(root->root_item.drop_progress)); 4108 btrfs_set_root_drop_level(&root->root_item, 0); 4109 btrfs_set_root_refs(&root->root_item, 0); 4110 ret = btrfs_update_root(trans, fs_info->tree_root, 4111 &root->root_key, &root->root_item); 4112 4113 err = btrfs_end_transaction(trans); 4114 if (err) 4115 return err; 4116 return ret; 4117 } 4118 4119 /* 4120 * recover relocation interrupted by system crash. 4121 * 4122 * this function resumes merging reloc trees with corresponding fs trees. 4123 * this is important for keeping the sharing of tree blocks 4124 */ 4125 int btrfs_recover_relocation(struct btrfs_root *root) 4126 { 4127 struct btrfs_fs_info *fs_info = root->fs_info; 4128 LIST_HEAD(reloc_roots); 4129 struct btrfs_key key; 4130 struct btrfs_root *fs_root; 4131 struct btrfs_root *reloc_root; 4132 struct btrfs_path *path; 4133 struct extent_buffer *leaf; 4134 struct reloc_control *rc = NULL; 4135 struct btrfs_trans_handle *trans; 4136 int ret; 4137 int err = 0; 4138 4139 path = btrfs_alloc_path(); 4140 if (!path) 4141 return -ENOMEM; 4142 path->reada = READA_BACK; 4143 4144 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 4145 key.type = BTRFS_ROOT_ITEM_KEY; 4146 key.offset = (u64)-1; 4147 4148 while (1) { 4149 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, 4150 path, 0, 0); 4151 if (ret < 0) { 4152 err = ret; 4153 goto out; 4154 } 4155 if (ret > 0) { 4156 if (path->slots[0] == 0) 4157 break; 4158 path->slots[0]--; 4159 } 4160 leaf = path->nodes[0]; 4161 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4162 btrfs_release_path(path); 4163 4164 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || 4165 key.type != BTRFS_ROOT_ITEM_KEY) 4166 break; 4167 4168 reloc_root = btrfs_read_tree_root(root, &key); 4169 if (IS_ERR(reloc_root)) { 4170 err = PTR_ERR(reloc_root); 4171 goto out; 4172 } 4173 4174 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); 4175 list_add(&reloc_root->root_list, &reloc_roots); 4176 4177 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 4178 fs_root = btrfs_get_fs_root(fs_info, 4179 reloc_root->root_key.offset, false); 4180 if (IS_ERR(fs_root)) { 4181 ret = PTR_ERR(fs_root); 4182 if (ret != -ENOENT) { 4183 err = ret; 4184 goto out; 4185 } 4186 ret = mark_garbage_root(reloc_root); 4187 if (ret < 0) { 4188 err = ret; 4189 goto out; 4190 } 4191 } else { 4192 btrfs_put_root(fs_root); 4193 } 4194 } 4195 4196 if (key.offset == 0) 4197 break; 4198 4199 key.offset--; 4200 } 4201 btrfs_release_path(path); 4202 4203 if (list_empty(&reloc_roots)) 4204 goto out; 4205 4206 rc = alloc_reloc_control(fs_info); 4207 if (!rc) { 4208 err = -ENOMEM; 4209 goto out; 4210 } 4211 4212 ret = reloc_chunk_start(fs_info); 4213 if (ret < 0) { 4214 err = ret; 4215 goto out_end; 4216 } 4217 4218 rc->extent_root = fs_info->extent_root; 4219 4220 set_reloc_control(rc); 4221 4222 trans = btrfs_join_transaction(rc->extent_root); 4223 if (IS_ERR(trans)) { 4224 err = PTR_ERR(trans); 4225 goto out_unset; 4226 } 4227 4228 rc->merge_reloc_tree = 1; 4229 4230 while (!list_empty(&reloc_roots)) { 4231 reloc_root = list_entry(reloc_roots.next, 4232 struct btrfs_root, root_list); 4233 list_del(&reloc_root->root_list); 4234 4235 if (btrfs_root_refs(&reloc_root->root_item) == 0) { 4236 list_add_tail(&reloc_root->root_list, 4237 &rc->reloc_roots); 4238 continue; 4239 } 4240 4241 fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 4242 false); 4243 if (IS_ERR(fs_root)) { 4244 err = PTR_ERR(fs_root); 4245 list_add_tail(&reloc_root->root_list, &reloc_roots); 4246 btrfs_end_transaction(trans); 4247 goto out_unset; 4248 } 4249 4250 err = __add_reloc_root(reloc_root); 4251 ASSERT(err != -EEXIST); 4252 if (err) { 4253 list_add_tail(&reloc_root->root_list, &reloc_roots); 4254 btrfs_put_root(fs_root); 4255 btrfs_end_transaction(trans); 4256 goto out_unset; 4257 } 4258 fs_root->reloc_root = btrfs_grab_root(reloc_root); 4259 btrfs_put_root(fs_root); 4260 } 4261 4262 err = btrfs_commit_transaction(trans); 4263 if (err) 4264 goto out_unset; 4265 4266 merge_reloc_roots(rc); 4267 4268 unset_reloc_control(rc); 4269 4270 trans = btrfs_join_transaction(rc->extent_root); 4271 if (IS_ERR(trans)) { 4272 err = PTR_ERR(trans); 4273 goto out_clean; 4274 } 4275 err = btrfs_commit_transaction(trans); 4276 out_clean: 4277 ret = clean_dirty_subvols(rc); 4278 if (ret < 0 && !err) 4279 err = ret; 4280 out_unset: 4281 unset_reloc_control(rc); 4282 out_end: 4283 reloc_chunk_end(fs_info); 4284 free_reloc_control(rc); 4285 out: 4286 free_reloc_roots(&reloc_roots); 4287 4288 btrfs_free_path(path); 4289 4290 if (err == 0) { 4291 /* cleanup orphan inode in data relocation tree */ 4292 fs_root = btrfs_grab_root(fs_info->data_reloc_root); 4293 ASSERT(fs_root); 4294 err = btrfs_orphan_cleanup(fs_root); 4295 btrfs_put_root(fs_root); 4296 } 4297 return err; 4298 } 4299 4300 /* 4301 * helper to add ordered checksum for data relocation. 4302 * 4303 * cloning checksum properly handles the nodatasum extents. 4304 * it also saves CPU time to re-calculate the checksum. 4305 */ 4306 int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) 4307 { 4308 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4309 struct btrfs_ordered_sum *sums; 4310 struct btrfs_ordered_extent *ordered; 4311 int ret; 4312 u64 disk_bytenr; 4313 u64 new_bytenr; 4314 LIST_HEAD(list); 4315 4316 ordered = btrfs_lookup_ordered_extent(inode, file_pos); 4317 BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); 4318 4319 disk_bytenr = file_pos + inode->index_cnt; 4320 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, 4321 disk_bytenr + len - 1, &list, 0); 4322 if (ret) 4323 goto out; 4324 4325 while (!list_empty(&list)) { 4326 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4327 list_del_init(&sums->list); 4328 4329 /* 4330 * We need to offset the new_bytenr based on where the csum is. 4331 * We need to do this because we will read in entire prealloc 4332 * extents but we may have written to say the middle of the 4333 * prealloc extent, so we need to make sure the csum goes with 4334 * the right disk offset. 4335 * 4336 * We can do this because the data reloc inode refers strictly 4337 * to the on disk bytes, so we don't have to worry about 4338 * disk_len vs real len like with real inodes since it's all 4339 * disk length. 4340 */ 4341 new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; 4342 sums->bytenr = new_bytenr; 4343 4344 btrfs_add_ordered_sum(ordered, sums); 4345 } 4346 out: 4347 btrfs_put_ordered_extent(ordered); 4348 return ret; 4349 } 4350 4351 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4352 struct btrfs_root *root, struct extent_buffer *buf, 4353 struct extent_buffer *cow) 4354 { 4355 struct btrfs_fs_info *fs_info = root->fs_info; 4356 struct reloc_control *rc; 4357 struct btrfs_backref_node *node; 4358 int first_cow = 0; 4359 int level; 4360 int ret = 0; 4361 4362 rc = fs_info->reloc_ctl; 4363 if (!rc) 4364 return 0; 4365 4366 BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); 4367 4368 level = btrfs_header_level(buf); 4369 if (btrfs_header_generation(buf) <= 4370 btrfs_root_last_snapshot(&root->root_item)) 4371 first_cow = 1; 4372 4373 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && 4374 rc->create_reloc_tree) { 4375 WARN_ON(!first_cow && level == 0); 4376 4377 node = rc->backref_cache.path[level]; 4378 BUG_ON(node->bytenr != buf->start && 4379 node->new_bytenr != buf->start); 4380 4381 btrfs_backref_drop_node_buffer(node); 4382 atomic_inc(&cow->refs); 4383 node->eb = cow; 4384 node->new_bytenr = cow->start; 4385 4386 if (!node->pending) { 4387 list_move_tail(&node->list, 4388 &rc->backref_cache.pending[level]); 4389 node->pending = 1; 4390 } 4391 4392 if (first_cow) 4393 mark_block_processed(rc, node); 4394 4395 if (first_cow && level > 0) 4396 rc->nodes_relocated += buf->len; 4397 } 4398 4399 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) 4400 ret = replace_file_extents(trans, rc, root, cow); 4401 return ret; 4402 } 4403 4404 /* 4405 * called before creating snapshot. it calculates metadata reservation 4406 * required for relocating tree blocks in the snapshot 4407 */ 4408 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, 4409 u64 *bytes_to_reserve) 4410 { 4411 struct btrfs_root *root = pending->root; 4412 struct reloc_control *rc = root->fs_info->reloc_ctl; 4413 4414 if (!rc || !have_reloc_root(root)) 4415 return; 4416 4417 if (!rc->merge_reloc_tree) 4418 return; 4419 4420 root = root->reloc_root; 4421 BUG_ON(btrfs_root_refs(&root->root_item) == 0); 4422 /* 4423 * relocation is in the stage of merging trees. the space 4424 * used by merging a reloc tree is twice the size of 4425 * relocated tree nodes in the worst case. half for cowing 4426 * the reloc tree, half for cowing the fs tree. the space 4427 * used by cowing the reloc tree will be freed after the 4428 * tree is dropped. if we create snapshot, cowing the fs 4429 * tree may use more space than it frees. so we need 4430 * reserve extra space. 4431 */ 4432 *bytes_to_reserve += rc->nodes_relocated; 4433 } 4434 4435 /* 4436 * called after snapshot is created. migrate block reservation 4437 * and create reloc root for the newly created snapshot 4438 * 4439 * This is similar to btrfs_init_reloc_root(), we come out of here with two 4440 * references held on the reloc_root, one for root->reloc_root and one for 4441 * rc->reloc_roots. 4442 */ 4443 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, 4444 struct btrfs_pending_snapshot *pending) 4445 { 4446 struct btrfs_root *root = pending->root; 4447 struct btrfs_root *reloc_root; 4448 struct btrfs_root *new_root; 4449 struct reloc_control *rc = root->fs_info->reloc_ctl; 4450 int ret; 4451 4452 if (!rc || !have_reloc_root(root)) 4453 return 0; 4454 4455 rc = root->fs_info->reloc_ctl; 4456 rc->merging_rsv_size += rc->nodes_relocated; 4457 4458 if (rc->merge_reloc_tree) { 4459 ret = btrfs_block_rsv_migrate(&pending->block_rsv, 4460 rc->block_rsv, 4461 rc->nodes_relocated, true); 4462 if (ret) 4463 return ret; 4464 } 4465 4466 new_root = pending->snap; 4467 reloc_root = create_reloc_root(trans, root->reloc_root, 4468 new_root->root_key.objectid); 4469 if (IS_ERR(reloc_root)) 4470 return PTR_ERR(reloc_root); 4471 4472 ret = __add_reloc_root(reloc_root); 4473 ASSERT(ret != -EEXIST); 4474 if (ret) { 4475 /* Pairs with create_reloc_root */ 4476 btrfs_put_root(reloc_root); 4477 return ret; 4478 } 4479 new_root->reloc_root = btrfs_grab_root(reloc_root); 4480 4481 if (rc->create_reloc_tree) 4482 ret = clone_backref_node(trans, rc, root, reloc_root); 4483 return ret; 4484 } 4485