1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2009 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/error-injection.h> 13 #include "ctree.h" 14 #include "disk-io.h" 15 #include "transaction.h" 16 #include "volumes.h" 17 #include "locking.h" 18 #include "btrfs_inode.h" 19 #include "async-thread.h" 20 #include "free-space-cache.h" 21 #include "qgroup.h" 22 #include "print-tree.h" 23 #include "delalloc-space.h" 24 #include "block-group.h" 25 #include "backref.h" 26 #include "misc.h" 27 #include "subpage.h" 28 #include "zoned.h" 29 30 /* 31 * Relocation overview 32 * 33 * [What does relocation do] 34 * 35 * The objective of relocation is to relocate all extents of the target block 36 * group to other block groups. 37 * This is utilized by resize (shrink only), profile converting, compacting 38 * space, or balance routine to spread chunks over devices. 39 * 40 * Before | After 41 * ------------------------------------------------------------------ 42 * BG A: 10 data extents | BG A: deleted 43 * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) 44 * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) 45 * 46 * [How does relocation work] 47 * 48 * 1. Mark the target block group read-only 49 * New extents won't be allocated from the target block group. 50 * 51 * 2.1 Record each extent in the target block group 52 * To build a proper map of extents to be relocated. 53 * 54 * 2.2 Build data reloc tree and reloc trees 55 * Data reloc tree will contain an inode, recording all newly relocated 56 * data extents. 57 * There will be only one data reloc tree for one data block group. 58 * 59 * Reloc tree will be a special snapshot of its source tree, containing 60 * relocated tree blocks. 61 * Each tree referring to a tree block in target block group will get its 62 * reloc tree built. 63 * 64 * 2.3 Swap source tree with its corresponding reloc tree 65 * Each involved tree only refers to new extents after swap. 66 * 67 * 3. Cleanup reloc trees and data reloc tree. 68 * As old extents in the target block group are still referenced by reloc 69 * trees, we need to clean them up before really freeing the target block 70 * group. 71 * 72 * The main complexity is in steps 2.2 and 2.3. 73 * 74 * The entry point of relocation is relocate_block_group() function. 75 */ 76 77 #define RELOCATION_RESERVED_NODES 256 78 /* 79 * map address of tree root to tree 80 */ 81 struct mapping_node { 82 struct { 83 struct rb_node rb_node; 84 u64 bytenr; 85 }; /* Use rb_simle_node for search/insert */ 86 void *data; 87 }; 88 89 struct mapping_tree { 90 struct rb_root rb_root; 91 spinlock_t lock; 92 }; 93 94 /* 95 * present a tree block to process 96 */ 97 struct tree_block { 98 struct { 99 struct rb_node rb_node; 100 u64 bytenr; 101 }; /* Use rb_simple_node for search/insert */ 102 u64 owner; 103 struct btrfs_key key; 104 unsigned int level:8; 105 unsigned int key_ready:1; 106 }; 107 108 #define MAX_EXTENTS 128 109 110 struct file_extent_cluster { 111 u64 start; 112 u64 end; 113 u64 boundary[MAX_EXTENTS]; 114 unsigned int nr; 115 }; 116 117 struct reloc_control { 118 /* block group to relocate */ 119 struct btrfs_block_group *block_group; 120 /* extent tree */ 121 struct btrfs_root *extent_root; 122 /* inode for moving data */ 123 struct inode *data_inode; 124 125 struct btrfs_block_rsv *block_rsv; 126 127 struct btrfs_backref_cache backref_cache; 128 129 struct file_extent_cluster cluster; 130 /* tree blocks have been processed */ 131 struct extent_io_tree processed_blocks; 132 /* map start of tree root to corresponding reloc tree */ 133 struct mapping_tree reloc_root_tree; 134 /* list of reloc trees */ 135 struct list_head reloc_roots; 136 /* list of subvolume trees that get relocated */ 137 struct list_head dirty_subvol_roots; 138 /* size of metadata reservation for merging reloc trees */ 139 u64 merging_rsv_size; 140 /* size of relocated tree nodes */ 141 u64 nodes_relocated; 142 /* reserved size for block group relocation*/ 143 u64 reserved_bytes; 144 145 u64 search_start; 146 u64 extents_found; 147 148 unsigned int stage:8; 149 unsigned int create_reloc_tree:1; 150 unsigned int merge_reloc_tree:1; 151 unsigned int found_file_extent:1; 152 }; 153 154 /* stages of data relocation */ 155 #define MOVE_DATA_EXTENTS 0 156 #define UPDATE_DATA_PTRS 1 157 158 static void mark_block_processed(struct reloc_control *rc, 159 struct btrfs_backref_node *node) 160 { 161 u32 blocksize; 162 163 if (node->level == 0 || 164 in_range(node->bytenr, rc->block_group->start, 165 rc->block_group->length)) { 166 blocksize = rc->extent_root->fs_info->nodesize; 167 set_extent_bits(&rc->processed_blocks, node->bytenr, 168 node->bytenr + blocksize - 1, EXTENT_DIRTY); 169 } 170 node->processed = 1; 171 } 172 173 174 static void mapping_tree_init(struct mapping_tree *tree) 175 { 176 tree->rb_root = RB_ROOT; 177 spin_lock_init(&tree->lock); 178 } 179 180 /* 181 * walk up backref nodes until reach node presents tree root 182 */ 183 static struct btrfs_backref_node *walk_up_backref( 184 struct btrfs_backref_node *node, 185 struct btrfs_backref_edge *edges[], int *index) 186 { 187 struct btrfs_backref_edge *edge; 188 int idx = *index; 189 190 while (!list_empty(&node->upper)) { 191 edge = list_entry(node->upper.next, 192 struct btrfs_backref_edge, list[LOWER]); 193 edges[idx++] = edge; 194 node = edge->node[UPPER]; 195 } 196 BUG_ON(node->detached); 197 *index = idx; 198 return node; 199 } 200 201 /* 202 * walk down backref nodes to find start of next reference path 203 */ 204 static struct btrfs_backref_node *walk_down_backref( 205 struct btrfs_backref_edge *edges[], int *index) 206 { 207 struct btrfs_backref_edge *edge; 208 struct btrfs_backref_node *lower; 209 int idx = *index; 210 211 while (idx > 0) { 212 edge = edges[idx - 1]; 213 lower = edge->node[LOWER]; 214 if (list_is_last(&edge->list[LOWER], &lower->upper)) { 215 idx--; 216 continue; 217 } 218 edge = list_entry(edge->list[LOWER].next, 219 struct btrfs_backref_edge, list[LOWER]); 220 edges[idx - 1] = edge; 221 *index = idx; 222 return edge->node[UPPER]; 223 } 224 *index = 0; 225 return NULL; 226 } 227 228 static void update_backref_node(struct btrfs_backref_cache *cache, 229 struct btrfs_backref_node *node, u64 bytenr) 230 { 231 struct rb_node *rb_node; 232 rb_erase(&node->rb_node, &cache->rb_root); 233 node->bytenr = bytenr; 234 rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node); 235 if (rb_node) 236 btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST); 237 } 238 239 /* 240 * update backref cache after a transaction commit 241 */ 242 static int update_backref_cache(struct btrfs_trans_handle *trans, 243 struct btrfs_backref_cache *cache) 244 { 245 struct btrfs_backref_node *node; 246 int level = 0; 247 248 if (cache->last_trans == 0) { 249 cache->last_trans = trans->transid; 250 return 0; 251 } 252 253 if (cache->last_trans == trans->transid) 254 return 0; 255 256 /* 257 * detached nodes are used to avoid unnecessary backref 258 * lookup. transaction commit changes the extent tree. 259 * so the detached nodes are no longer useful. 260 */ 261 while (!list_empty(&cache->detached)) { 262 node = list_entry(cache->detached.next, 263 struct btrfs_backref_node, list); 264 btrfs_backref_cleanup_node(cache, node); 265 } 266 267 while (!list_empty(&cache->changed)) { 268 node = list_entry(cache->changed.next, 269 struct btrfs_backref_node, list); 270 list_del_init(&node->list); 271 BUG_ON(node->pending); 272 update_backref_node(cache, node, node->new_bytenr); 273 } 274 275 /* 276 * some nodes can be left in the pending list if there were 277 * errors during processing the pending nodes. 278 */ 279 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 280 list_for_each_entry(node, &cache->pending[level], list) { 281 BUG_ON(!node->pending); 282 if (node->bytenr == node->new_bytenr) 283 continue; 284 update_backref_node(cache, node, node->new_bytenr); 285 } 286 } 287 288 cache->last_trans = 0; 289 return 1; 290 } 291 292 static bool reloc_root_is_dead(struct btrfs_root *root) 293 { 294 /* 295 * Pair with set_bit/clear_bit in clean_dirty_subvols and 296 * btrfs_update_reloc_root. We need to see the updated bit before 297 * trying to access reloc_root 298 */ 299 smp_rmb(); 300 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) 301 return true; 302 return false; 303 } 304 305 /* 306 * Check if this subvolume tree has valid reloc tree. 307 * 308 * Reloc tree after swap is considered dead, thus not considered as valid. 309 * This is enough for most callers, as they don't distinguish dead reloc root 310 * from no reloc root. But btrfs_should_ignore_reloc_root() below is a 311 * special case. 312 */ 313 static bool have_reloc_root(struct btrfs_root *root) 314 { 315 if (reloc_root_is_dead(root)) 316 return false; 317 if (!root->reloc_root) 318 return false; 319 return true; 320 } 321 322 int btrfs_should_ignore_reloc_root(struct btrfs_root *root) 323 { 324 struct btrfs_root *reloc_root; 325 326 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 327 return 0; 328 329 /* This root has been merged with its reloc tree, we can ignore it */ 330 if (reloc_root_is_dead(root)) 331 return 1; 332 333 reloc_root = root->reloc_root; 334 if (!reloc_root) 335 return 0; 336 337 if (btrfs_header_generation(reloc_root->commit_root) == 338 root->fs_info->running_transaction->transid) 339 return 0; 340 /* 341 * if there is reloc tree and it was created in previous 342 * transaction backref lookup can find the reloc tree, 343 * so backref node for the fs tree root is useless for 344 * relocation. 345 */ 346 return 1; 347 } 348 349 /* 350 * find reloc tree by address of tree root 351 */ 352 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) 353 { 354 struct reloc_control *rc = fs_info->reloc_ctl; 355 struct rb_node *rb_node; 356 struct mapping_node *node; 357 struct btrfs_root *root = NULL; 358 359 ASSERT(rc); 360 spin_lock(&rc->reloc_root_tree.lock); 361 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); 362 if (rb_node) { 363 node = rb_entry(rb_node, struct mapping_node, rb_node); 364 root = (struct btrfs_root *)node->data; 365 } 366 spin_unlock(&rc->reloc_root_tree.lock); 367 return btrfs_grab_root(root); 368 } 369 370 /* 371 * For useless nodes, do two major clean ups: 372 * 373 * - Cleanup the children edges and nodes 374 * If child node is also orphan (no parent) during cleanup, then the child 375 * node will also be cleaned up. 376 * 377 * - Freeing up leaves (level 0), keeps nodes detached 378 * For nodes, the node is still cached as "detached" 379 * 380 * Return false if @node is not in the @useless_nodes list. 381 * Return true if @node is in the @useless_nodes list. 382 */ 383 static bool handle_useless_nodes(struct reloc_control *rc, 384 struct btrfs_backref_node *node) 385 { 386 struct btrfs_backref_cache *cache = &rc->backref_cache; 387 struct list_head *useless_node = &cache->useless_node; 388 bool ret = false; 389 390 while (!list_empty(useless_node)) { 391 struct btrfs_backref_node *cur; 392 393 cur = list_first_entry(useless_node, struct btrfs_backref_node, 394 list); 395 list_del_init(&cur->list); 396 397 /* Only tree root nodes can be added to @useless_nodes */ 398 ASSERT(list_empty(&cur->upper)); 399 400 if (cur == node) 401 ret = true; 402 403 /* The node is the lowest node */ 404 if (cur->lowest) { 405 list_del_init(&cur->lower); 406 cur->lowest = 0; 407 } 408 409 /* Cleanup the lower edges */ 410 while (!list_empty(&cur->lower)) { 411 struct btrfs_backref_edge *edge; 412 struct btrfs_backref_node *lower; 413 414 edge = list_entry(cur->lower.next, 415 struct btrfs_backref_edge, list[UPPER]); 416 list_del(&edge->list[UPPER]); 417 list_del(&edge->list[LOWER]); 418 lower = edge->node[LOWER]; 419 btrfs_backref_free_edge(cache, edge); 420 421 /* Child node is also orphan, queue for cleanup */ 422 if (list_empty(&lower->upper)) 423 list_add(&lower->list, useless_node); 424 } 425 /* Mark this block processed for relocation */ 426 mark_block_processed(rc, cur); 427 428 /* 429 * Backref nodes for tree leaves are deleted from the cache. 430 * Backref nodes for upper level tree blocks are left in the 431 * cache to avoid unnecessary backref lookup. 432 */ 433 if (cur->level > 0) { 434 list_add(&cur->list, &cache->detached); 435 cur->detached = 1; 436 } else { 437 rb_erase(&cur->rb_node, &cache->rb_root); 438 btrfs_backref_free_node(cache, cur); 439 } 440 } 441 return ret; 442 } 443 444 /* 445 * Build backref tree for a given tree block. Root of the backref tree 446 * corresponds the tree block, leaves of the backref tree correspond roots of 447 * b-trees that reference the tree block. 448 * 449 * The basic idea of this function is check backrefs of a given block to find 450 * upper level blocks that reference the block, and then check backrefs of 451 * these upper level blocks recursively. The recursion stops when tree root is 452 * reached or backrefs for the block is cached. 453 * 454 * NOTE: if we find that backrefs for a block are cached, we know backrefs for 455 * all upper level blocks that directly/indirectly reference the block are also 456 * cached. 457 */ 458 static noinline_for_stack struct btrfs_backref_node *build_backref_tree( 459 struct reloc_control *rc, struct btrfs_key *node_key, 460 int level, u64 bytenr) 461 { 462 struct btrfs_backref_iter *iter; 463 struct btrfs_backref_cache *cache = &rc->backref_cache; 464 /* For searching parent of TREE_BLOCK_REF */ 465 struct btrfs_path *path; 466 struct btrfs_backref_node *cur; 467 struct btrfs_backref_node *node = NULL; 468 struct btrfs_backref_edge *edge; 469 int ret; 470 int err = 0; 471 472 iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); 473 if (!iter) 474 return ERR_PTR(-ENOMEM); 475 path = btrfs_alloc_path(); 476 if (!path) { 477 err = -ENOMEM; 478 goto out; 479 } 480 481 node = btrfs_backref_alloc_node(cache, bytenr, level); 482 if (!node) { 483 err = -ENOMEM; 484 goto out; 485 } 486 487 node->lowest = 1; 488 cur = node; 489 490 /* Breadth-first search to build backref cache */ 491 do { 492 ret = btrfs_backref_add_tree_node(cache, path, iter, node_key, 493 cur); 494 if (ret < 0) { 495 err = ret; 496 goto out; 497 } 498 edge = list_first_entry_or_null(&cache->pending_edge, 499 struct btrfs_backref_edge, list[UPPER]); 500 /* 501 * The pending list isn't empty, take the first block to 502 * process 503 */ 504 if (edge) { 505 list_del_init(&edge->list[UPPER]); 506 cur = edge->node[UPPER]; 507 } 508 } while (edge); 509 510 /* Finish the upper linkage of newly added edges/nodes */ 511 ret = btrfs_backref_finish_upper_links(cache, node); 512 if (ret < 0) { 513 err = ret; 514 goto out; 515 } 516 517 if (handle_useless_nodes(rc, node)) 518 node = NULL; 519 out: 520 btrfs_backref_iter_free(iter); 521 btrfs_free_path(path); 522 if (err) { 523 btrfs_backref_error_cleanup(cache, node); 524 return ERR_PTR(err); 525 } 526 ASSERT(!node || !node->detached); 527 ASSERT(list_empty(&cache->useless_node) && 528 list_empty(&cache->pending_edge)); 529 return node; 530 } 531 532 /* 533 * helper to add backref node for the newly created snapshot. 534 * the backref node is created by cloning backref node that 535 * corresponds to root of source tree 536 */ 537 static int clone_backref_node(struct btrfs_trans_handle *trans, 538 struct reloc_control *rc, 539 struct btrfs_root *src, 540 struct btrfs_root *dest) 541 { 542 struct btrfs_root *reloc_root = src->reloc_root; 543 struct btrfs_backref_cache *cache = &rc->backref_cache; 544 struct btrfs_backref_node *node = NULL; 545 struct btrfs_backref_node *new_node; 546 struct btrfs_backref_edge *edge; 547 struct btrfs_backref_edge *new_edge; 548 struct rb_node *rb_node; 549 550 if (cache->last_trans > 0) 551 update_backref_cache(trans, cache); 552 553 rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); 554 if (rb_node) { 555 node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); 556 if (node->detached) 557 node = NULL; 558 else 559 BUG_ON(node->new_bytenr != reloc_root->node->start); 560 } 561 562 if (!node) { 563 rb_node = rb_simple_search(&cache->rb_root, 564 reloc_root->commit_root->start); 565 if (rb_node) { 566 node = rb_entry(rb_node, struct btrfs_backref_node, 567 rb_node); 568 BUG_ON(node->detached); 569 } 570 } 571 572 if (!node) 573 return 0; 574 575 new_node = btrfs_backref_alloc_node(cache, dest->node->start, 576 node->level); 577 if (!new_node) 578 return -ENOMEM; 579 580 new_node->lowest = node->lowest; 581 new_node->checked = 1; 582 new_node->root = btrfs_grab_root(dest); 583 ASSERT(new_node->root); 584 585 if (!node->lowest) { 586 list_for_each_entry(edge, &node->lower, list[UPPER]) { 587 new_edge = btrfs_backref_alloc_edge(cache); 588 if (!new_edge) 589 goto fail; 590 591 btrfs_backref_link_edge(new_edge, edge->node[LOWER], 592 new_node, LINK_UPPER); 593 } 594 } else { 595 list_add_tail(&new_node->lower, &cache->leaves); 596 } 597 598 rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, 599 &new_node->rb_node); 600 if (rb_node) 601 btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); 602 603 if (!new_node->lowest) { 604 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { 605 list_add_tail(&new_edge->list[LOWER], 606 &new_edge->node[LOWER]->upper); 607 } 608 } 609 return 0; 610 fail: 611 while (!list_empty(&new_node->lower)) { 612 new_edge = list_entry(new_node->lower.next, 613 struct btrfs_backref_edge, list[UPPER]); 614 list_del(&new_edge->list[UPPER]); 615 btrfs_backref_free_edge(cache, new_edge); 616 } 617 btrfs_backref_free_node(cache, new_node); 618 return -ENOMEM; 619 } 620 621 /* 622 * helper to add 'address of tree root -> reloc tree' mapping 623 */ 624 static int __must_check __add_reloc_root(struct btrfs_root *root) 625 { 626 struct btrfs_fs_info *fs_info = root->fs_info; 627 struct rb_node *rb_node; 628 struct mapping_node *node; 629 struct reloc_control *rc = fs_info->reloc_ctl; 630 631 node = kmalloc(sizeof(*node), GFP_NOFS); 632 if (!node) 633 return -ENOMEM; 634 635 node->bytenr = root->commit_root->start; 636 node->data = root; 637 638 spin_lock(&rc->reloc_root_tree.lock); 639 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, 640 node->bytenr, &node->rb_node); 641 spin_unlock(&rc->reloc_root_tree.lock); 642 if (rb_node) { 643 btrfs_err(fs_info, 644 "Duplicate root found for start=%llu while inserting into relocation tree", 645 node->bytenr); 646 return -EEXIST; 647 } 648 649 list_add_tail(&root->root_list, &rc->reloc_roots); 650 return 0; 651 } 652 653 /* 654 * helper to delete the 'address of tree root -> reloc tree' 655 * mapping 656 */ 657 static void __del_reloc_root(struct btrfs_root *root) 658 { 659 struct btrfs_fs_info *fs_info = root->fs_info; 660 struct rb_node *rb_node; 661 struct mapping_node *node = NULL; 662 struct reloc_control *rc = fs_info->reloc_ctl; 663 bool put_ref = false; 664 665 if (rc && root->node) { 666 spin_lock(&rc->reloc_root_tree.lock); 667 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, 668 root->commit_root->start); 669 if (rb_node) { 670 node = rb_entry(rb_node, struct mapping_node, rb_node); 671 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 672 RB_CLEAR_NODE(&node->rb_node); 673 } 674 spin_unlock(&rc->reloc_root_tree.lock); 675 ASSERT(!node || (struct btrfs_root *)node->data == root); 676 } 677 678 /* 679 * We only put the reloc root here if it's on the list. There's a lot 680 * of places where the pattern is to splice the rc->reloc_roots, process 681 * the reloc roots, and then add the reloc root back onto 682 * rc->reloc_roots. If we call __del_reloc_root while it's off of the 683 * list we don't want the reference being dropped, because the guy 684 * messing with the list is in charge of the reference. 685 */ 686 spin_lock(&fs_info->trans_lock); 687 if (!list_empty(&root->root_list)) { 688 put_ref = true; 689 list_del_init(&root->root_list); 690 } 691 spin_unlock(&fs_info->trans_lock); 692 if (put_ref) 693 btrfs_put_root(root); 694 kfree(node); 695 } 696 697 /* 698 * helper to update the 'address of tree root -> reloc tree' 699 * mapping 700 */ 701 static int __update_reloc_root(struct btrfs_root *root) 702 { 703 struct btrfs_fs_info *fs_info = root->fs_info; 704 struct rb_node *rb_node; 705 struct mapping_node *node = NULL; 706 struct reloc_control *rc = fs_info->reloc_ctl; 707 708 spin_lock(&rc->reloc_root_tree.lock); 709 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, 710 root->commit_root->start); 711 if (rb_node) { 712 node = rb_entry(rb_node, struct mapping_node, rb_node); 713 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 714 } 715 spin_unlock(&rc->reloc_root_tree.lock); 716 717 if (!node) 718 return 0; 719 BUG_ON((struct btrfs_root *)node->data != root); 720 721 spin_lock(&rc->reloc_root_tree.lock); 722 node->bytenr = root->node->start; 723 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, 724 node->bytenr, &node->rb_node); 725 spin_unlock(&rc->reloc_root_tree.lock); 726 if (rb_node) 727 btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); 728 return 0; 729 } 730 731 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, 732 struct btrfs_root *root, u64 objectid) 733 { 734 struct btrfs_fs_info *fs_info = root->fs_info; 735 struct btrfs_root *reloc_root; 736 struct extent_buffer *eb; 737 struct btrfs_root_item *root_item; 738 struct btrfs_key root_key; 739 int ret = 0; 740 bool must_abort = false; 741 742 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 743 if (!root_item) 744 return ERR_PTR(-ENOMEM); 745 746 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 747 root_key.type = BTRFS_ROOT_ITEM_KEY; 748 root_key.offset = objectid; 749 750 if (root->root_key.objectid == objectid) { 751 u64 commit_root_gen; 752 753 /* called by btrfs_init_reloc_root */ 754 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 755 BTRFS_TREE_RELOC_OBJECTID); 756 if (ret) 757 goto fail; 758 759 /* 760 * Set the last_snapshot field to the generation of the commit 761 * root - like this ctree.c:btrfs_block_can_be_shared() behaves 762 * correctly (returns true) when the relocation root is created 763 * either inside the critical section of a transaction commit 764 * (through transaction.c:qgroup_account_snapshot()) and when 765 * it's created before the transaction commit is started. 766 */ 767 commit_root_gen = btrfs_header_generation(root->commit_root); 768 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); 769 } else { 770 /* 771 * called by btrfs_reloc_post_snapshot_hook. 772 * the source tree is a reloc tree, all tree blocks 773 * modified after it was created have RELOC flag 774 * set in their headers. so it's OK to not update 775 * the 'last_snapshot'. 776 */ 777 ret = btrfs_copy_root(trans, root, root->node, &eb, 778 BTRFS_TREE_RELOC_OBJECTID); 779 if (ret) 780 goto fail; 781 } 782 783 /* 784 * We have changed references at this point, we must abort the 785 * transaction if anything fails. 786 */ 787 must_abort = true; 788 789 memcpy(root_item, &root->root_item, sizeof(*root_item)); 790 btrfs_set_root_bytenr(root_item, eb->start); 791 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 792 btrfs_set_root_generation(root_item, trans->transid); 793 794 if (root->root_key.objectid == objectid) { 795 btrfs_set_root_refs(root_item, 0); 796 memset(&root_item->drop_progress, 0, 797 sizeof(struct btrfs_disk_key)); 798 btrfs_set_root_drop_level(root_item, 0); 799 } 800 801 btrfs_tree_unlock(eb); 802 free_extent_buffer(eb); 803 804 ret = btrfs_insert_root(trans, fs_info->tree_root, 805 &root_key, root_item); 806 if (ret) 807 goto fail; 808 809 kfree(root_item); 810 811 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); 812 if (IS_ERR(reloc_root)) { 813 ret = PTR_ERR(reloc_root); 814 goto abort; 815 } 816 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); 817 reloc_root->last_trans = trans->transid; 818 return reloc_root; 819 fail: 820 kfree(root_item); 821 abort: 822 if (must_abort) 823 btrfs_abort_transaction(trans, ret); 824 return ERR_PTR(ret); 825 } 826 827 /* 828 * create reloc tree for a given fs tree. reloc tree is just a 829 * snapshot of the fs tree with special root objectid. 830 * 831 * The reloc_root comes out of here with two references, one for 832 * root->reloc_root, and another for being on the rc->reloc_roots list. 833 */ 834 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, 835 struct btrfs_root *root) 836 { 837 struct btrfs_fs_info *fs_info = root->fs_info; 838 struct btrfs_root *reloc_root; 839 struct reloc_control *rc = fs_info->reloc_ctl; 840 struct btrfs_block_rsv *rsv; 841 int clear_rsv = 0; 842 int ret; 843 844 if (!rc) 845 return 0; 846 847 /* 848 * The subvolume has reloc tree but the swap is finished, no need to 849 * create/update the dead reloc tree 850 */ 851 if (reloc_root_is_dead(root)) 852 return 0; 853 854 /* 855 * This is subtle but important. We do not do 856 * record_root_in_transaction for reloc roots, instead we record their 857 * corresponding fs root, and then here we update the last trans for the 858 * reloc root. This means that we have to do this for the entire life 859 * of the reloc root, regardless of which stage of the relocation we are 860 * in. 861 */ 862 if (root->reloc_root) { 863 reloc_root = root->reloc_root; 864 reloc_root->last_trans = trans->transid; 865 return 0; 866 } 867 868 /* 869 * We are merging reloc roots, we do not need new reloc trees. Also 870 * reloc trees never need their own reloc tree. 871 */ 872 if (!rc->create_reloc_tree || 873 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 874 return 0; 875 876 if (!trans->reloc_reserved) { 877 rsv = trans->block_rsv; 878 trans->block_rsv = rc->block_rsv; 879 clear_rsv = 1; 880 } 881 reloc_root = create_reloc_root(trans, root, root->root_key.objectid); 882 if (clear_rsv) 883 trans->block_rsv = rsv; 884 if (IS_ERR(reloc_root)) 885 return PTR_ERR(reloc_root); 886 887 ret = __add_reloc_root(reloc_root); 888 ASSERT(ret != -EEXIST); 889 if (ret) { 890 /* Pairs with create_reloc_root */ 891 btrfs_put_root(reloc_root); 892 return ret; 893 } 894 root->reloc_root = btrfs_grab_root(reloc_root); 895 return 0; 896 } 897 898 /* 899 * update root item of reloc tree 900 */ 901 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, 902 struct btrfs_root *root) 903 { 904 struct btrfs_fs_info *fs_info = root->fs_info; 905 struct btrfs_root *reloc_root; 906 struct btrfs_root_item *root_item; 907 int ret; 908 909 if (!have_reloc_root(root)) 910 return 0; 911 912 reloc_root = root->reloc_root; 913 root_item = &reloc_root->root_item; 914 915 /* 916 * We are probably ok here, but __del_reloc_root() will drop its ref of 917 * the root. We have the ref for root->reloc_root, but just in case 918 * hold it while we update the reloc root. 919 */ 920 btrfs_grab_root(reloc_root); 921 922 /* root->reloc_root will stay until current relocation finished */ 923 if (fs_info->reloc_ctl->merge_reloc_tree && 924 btrfs_root_refs(root_item) == 0) { 925 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 926 /* 927 * Mark the tree as dead before we change reloc_root so 928 * have_reloc_root will not touch it from now on. 929 */ 930 smp_wmb(); 931 __del_reloc_root(reloc_root); 932 } 933 934 if (reloc_root->commit_root != reloc_root->node) { 935 __update_reloc_root(reloc_root); 936 btrfs_set_root_node(root_item, reloc_root->node); 937 free_extent_buffer(reloc_root->commit_root); 938 reloc_root->commit_root = btrfs_root_node(reloc_root); 939 } 940 941 ret = btrfs_update_root(trans, fs_info->tree_root, 942 &reloc_root->root_key, root_item); 943 btrfs_put_root(reloc_root); 944 return ret; 945 } 946 947 /* 948 * helper to find first cached inode with inode number >= objectid 949 * in a subvolume 950 */ 951 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) 952 { 953 struct rb_node *node; 954 struct rb_node *prev; 955 struct btrfs_inode *entry; 956 struct inode *inode; 957 958 spin_lock(&root->inode_lock); 959 again: 960 node = root->inode_tree.rb_node; 961 prev = NULL; 962 while (node) { 963 prev = node; 964 entry = rb_entry(node, struct btrfs_inode, rb_node); 965 966 if (objectid < btrfs_ino(entry)) 967 node = node->rb_left; 968 else if (objectid > btrfs_ino(entry)) 969 node = node->rb_right; 970 else 971 break; 972 } 973 if (!node) { 974 while (prev) { 975 entry = rb_entry(prev, struct btrfs_inode, rb_node); 976 if (objectid <= btrfs_ino(entry)) { 977 node = prev; 978 break; 979 } 980 prev = rb_next(prev); 981 } 982 } 983 while (node) { 984 entry = rb_entry(node, struct btrfs_inode, rb_node); 985 inode = igrab(&entry->vfs_inode); 986 if (inode) { 987 spin_unlock(&root->inode_lock); 988 return inode; 989 } 990 991 objectid = btrfs_ino(entry) + 1; 992 if (cond_resched_lock(&root->inode_lock)) 993 goto again; 994 995 node = rb_next(node); 996 } 997 spin_unlock(&root->inode_lock); 998 return NULL; 999 } 1000 1001 /* 1002 * get new location of data 1003 */ 1004 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, 1005 u64 bytenr, u64 num_bytes) 1006 { 1007 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 1008 struct btrfs_path *path; 1009 struct btrfs_file_extent_item *fi; 1010 struct extent_buffer *leaf; 1011 int ret; 1012 1013 path = btrfs_alloc_path(); 1014 if (!path) 1015 return -ENOMEM; 1016 1017 bytenr -= BTRFS_I(reloc_inode)->index_cnt; 1018 ret = btrfs_lookup_file_extent(NULL, root, path, 1019 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); 1020 if (ret < 0) 1021 goto out; 1022 if (ret > 0) { 1023 ret = -ENOENT; 1024 goto out; 1025 } 1026 1027 leaf = path->nodes[0]; 1028 fi = btrfs_item_ptr(leaf, path->slots[0], 1029 struct btrfs_file_extent_item); 1030 1031 BUG_ON(btrfs_file_extent_offset(leaf, fi) || 1032 btrfs_file_extent_compression(leaf, fi) || 1033 btrfs_file_extent_encryption(leaf, fi) || 1034 btrfs_file_extent_other_encoding(leaf, fi)); 1035 1036 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { 1037 ret = -EINVAL; 1038 goto out; 1039 } 1040 1041 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1042 ret = 0; 1043 out: 1044 btrfs_free_path(path); 1045 return ret; 1046 } 1047 1048 /* 1049 * update file extent items in the tree leaf to point to 1050 * the new locations. 1051 */ 1052 static noinline_for_stack 1053 int replace_file_extents(struct btrfs_trans_handle *trans, 1054 struct reloc_control *rc, 1055 struct btrfs_root *root, 1056 struct extent_buffer *leaf) 1057 { 1058 struct btrfs_fs_info *fs_info = root->fs_info; 1059 struct btrfs_key key; 1060 struct btrfs_file_extent_item *fi; 1061 struct inode *inode = NULL; 1062 u64 parent; 1063 u64 bytenr; 1064 u64 new_bytenr = 0; 1065 u64 num_bytes; 1066 u64 end; 1067 u32 nritems; 1068 u32 i; 1069 int ret = 0; 1070 int first = 1; 1071 int dirty = 0; 1072 1073 if (rc->stage != UPDATE_DATA_PTRS) 1074 return 0; 1075 1076 /* reloc trees always use full backref */ 1077 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 1078 parent = leaf->start; 1079 else 1080 parent = 0; 1081 1082 nritems = btrfs_header_nritems(leaf); 1083 for (i = 0; i < nritems; i++) { 1084 struct btrfs_ref ref = { 0 }; 1085 1086 cond_resched(); 1087 btrfs_item_key_to_cpu(leaf, &key, i); 1088 if (key.type != BTRFS_EXTENT_DATA_KEY) 1089 continue; 1090 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 1091 if (btrfs_file_extent_type(leaf, fi) == 1092 BTRFS_FILE_EXTENT_INLINE) 1093 continue; 1094 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1095 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1096 if (bytenr == 0) 1097 continue; 1098 if (!in_range(bytenr, rc->block_group->start, 1099 rc->block_group->length)) 1100 continue; 1101 1102 /* 1103 * if we are modifying block in fs tree, wait for readpage 1104 * to complete and drop the extent cache 1105 */ 1106 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1107 if (first) { 1108 inode = find_next_inode(root, key.objectid); 1109 first = 0; 1110 } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { 1111 btrfs_add_delayed_iput(inode); 1112 inode = find_next_inode(root, key.objectid); 1113 } 1114 if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { 1115 end = key.offset + 1116 btrfs_file_extent_num_bytes(leaf, fi); 1117 WARN_ON(!IS_ALIGNED(key.offset, 1118 fs_info->sectorsize)); 1119 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1120 end--; 1121 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 1122 key.offset, end); 1123 if (!ret) 1124 continue; 1125 1126 btrfs_drop_extent_cache(BTRFS_I(inode), 1127 key.offset, end, 1); 1128 unlock_extent(&BTRFS_I(inode)->io_tree, 1129 key.offset, end); 1130 } 1131 } 1132 1133 ret = get_new_location(rc->data_inode, &new_bytenr, 1134 bytenr, num_bytes); 1135 if (ret) { 1136 /* 1137 * Don't have to abort since we've not changed anything 1138 * in the file extent yet. 1139 */ 1140 break; 1141 } 1142 1143 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1144 dirty = 1; 1145 1146 key.offset -= btrfs_file_extent_offset(leaf, fi); 1147 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, 1148 num_bytes, parent); 1149 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 1150 key.objectid, key.offset, 1151 root->root_key.objectid, false); 1152 ret = btrfs_inc_extent_ref(trans, &ref); 1153 if (ret) { 1154 btrfs_abort_transaction(trans, ret); 1155 break; 1156 } 1157 1158 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 1159 num_bytes, parent); 1160 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 1161 key.objectid, key.offset, 1162 root->root_key.objectid, false); 1163 ret = btrfs_free_extent(trans, &ref); 1164 if (ret) { 1165 btrfs_abort_transaction(trans, ret); 1166 break; 1167 } 1168 } 1169 if (dirty) 1170 btrfs_mark_buffer_dirty(leaf); 1171 if (inode) 1172 btrfs_add_delayed_iput(inode); 1173 return ret; 1174 } 1175 1176 static noinline_for_stack 1177 int memcmp_node_keys(struct extent_buffer *eb, int slot, 1178 struct btrfs_path *path, int level) 1179 { 1180 struct btrfs_disk_key key1; 1181 struct btrfs_disk_key key2; 1182 btrfs_node_key(eb, &key1, slot); 1183 btrfs_node_key(path->nodes[level], &key2, path->slots[level]); 1184 return memcmp(&key1, &key2, sizeof(key1)); 1185 } 1186 1187 /* 1188 * try to replace tree blocks in fs tree with the new blocks 1189 * in reloc tree. tree blocks haven't been modified since the 1190 * reloc tree was create can be replaced. 1191 * 1192 * if a block was replaced, level of the block + 1 is returned. 1193 * if no block got replaced, 0 is returned. if there are other 1194 * errors, a negative error number is returned. 1195 */ 1196 static noinline_for_stack 1197 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, 1198 struct btrfs_root *dest, struct btrfs_root *src, 1199 struct btrfs_path *path, struct btrfs_key *next_key, 1200 int lowest_level, int max_level) 1201 { 1202 struct btrfs_fs_info *fs_info = dest->fs_info; 1203 struct extent_buffer *eb; 1204 struct extent_buffer *parent; 1205 struct btrfs_ref ref = { 0 }; 1206 struct btrfs_key key; 1207 u64 old_bytenr; 1208 u64 new_bytenr; 1209 u64 old_ptr_gen; 1210 u64 new_ptr_gen; 1211 u64 last_snapshot; 1212 u32 blocksize; 1213 int cow = 0; 1214 int level; 1215 int ret; 1216 int slot; 1217 1218 ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1219 ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1220 1221 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1222 again: 1223 slot = path->slots[lowest_level]; 1224 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1225 1226 eb = btrfs_lock_root_node(dest); 1227 level = btrfs_header_level(eb); 1228 1229 if (level < lowest_level) { 1230 btrfs_tree_unlock(eb); 1231 free_extent_buffer(eb); 1232 return 0; 1233 } 1234 1235 if (cow) { 1236 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, 1237 BTRFS_NESTING_COW); 1238 if (ret) { 1239 btrfs_tree_unlock(eb); 1240 free_extent_buffer(eb); 1241 return ret; 1242 } 1243 } 1244 1245 if (next_key) { 1246 next_key->objectid = (u64)-1; 1247 next_key->type = (u8)-1; 1248 next_key->offset = (u64)-1; 1249 } 1250 1251 parent = eb; 1252 while (1) { 1253 level = btrfs_header_level(parent); 1254 ASSERT(level >= lowest_level); 1255 1256 ret = btrfs_bin_search(parent, &key, &slot); 1257 if (ret < 0) 1258 break; 1259 if (ret && slot > 0) 1260 slot--; 1261 1262 if (next_key && slot + 1 < btrfs_header_nritems(parent)) 1263 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1264 1265 old_bytenr = btrfs_node_blockptr(parent, slot); 1266 blocksize = fs_info->nodesize; 1267 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1268 1269 if (level <= max_level) { 1270 eb = path->nodes[level]; 1271 new_bytenr = btrfs_node_blockptr(eb, 1272 path->slots[level]); 1273 new_ptr_gen = btrfs_node_ptr_generation(eb, 1274 path->slots[level]); 1275 } else { 1276 new_bytenr = 0; 1277 new_ptr_gen = 0; 1278 } 1279 1280 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { 1281 ret = level; 1282 break; 1283 } 1284 1285 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1286 memcmp_node_keys(parent, slot, path, level)) { 1287 if (level <= lowest_level) { 1288 ret = 0; 1289 break; 1290 } 1291 1292 eb = btrfs_read_node_slot(parent, slot); 1293 if (IS_ERR(eb)) { 1294 ret = PTR_ERR(eb); 1295 break; 1296 } 1297 btrfs_tree_lock(eb); 1298 if (cow) { 1299 ret = btrfs_cow_block(trans, dest, eb, parent, 1300 slot, &eb, 1301 BTRFS_NESTING_COW); 1302 if (ret) { 1303 btrfs_tree_unlock(eb); 1304 free_extent_buffer(eb); 1305 break; 1306 } 1307 } 1308 1309 btrfs_tree_unlock(parent); 1310 free_extent_buffer(parent); 1311 1312 parent = eb; 1313 continue; 1314 } 1315 1316 if (!cow) { 1317 btrfs_tree_unlock(parent); 1318 free_extent_buffer(parent); 1319 cow = 1; 1320 goto again; 1321 } 1322 1323 btrfs_node_key_to_cpu(path->nodes[level], &key, 1324 path->slots[level]); 1325 btrfs_release_path(path); 1326 1327 path->lowest_level = level; 1328 ret = btrfs_search_slot(trans, src, &key, path, 0, 1); 1329 path->lowest_level = 0; 1330 if (ret) { 1331 if (ret > 0) 1332 ret = -ENOENT; 1333 break; 1334 } 1335 1336 /* 1337 * Info qgroup to trace both subtrees. 1338 * 1339 * We must trace both trees. 1340 * 1) Tree reloc subtree 1341 * If not traced, we will leak data numbers 1342 * 2) Fs subtree 1343 * If not traced, we will double count old data 1344 * 1345 * We don't scan the subtree right now, but only record 1346 * the swapped tree blocks. 1347 * The real subtree rescan is delayed until we have new 1348 * CoW on the subtree root node before transaction commit. 1349 */ 1350 ret = btrfs_qgroup_add_swapped_blocks(trans, dest, 1351 rc->block_group, parent, slot, 1352 path->nodes[level], path->slots[level], 1353 last_snapshot); 1354 if (ret < 0) 1355 break; 1356 /* 1357 * swap blocks in fs tree and reloc tree. 1358 */ 1359 btrfs_set_node_blockptr(parent, slot, new_bytenr); 1360 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); 1361 btrfs_mark_buffer_dirty(parent); 1362 1363 btrfs_set_node_blockptr(path->nodes[level], 1364 path->slots[level], old_bytenr); 1365 btrfs_set_node_ptr_generation(path->nodes[level], 1366 path->slots[level], old_ptr_gen); 1367 btrfs_mark_buffer_dirty(path->nodes[level]); 1368 1369 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, 1370 blocksize, path->nodes[level]->start); 1371 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 1372 0, true); 1373 ret = btrfs_inc_extent_ref(trans, &ref); 1374 if (ret) { 1375 btrfs_abort_transaction(trans, ret); 1376 break; 1377 } 1378 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, 1379 blocksize, 0); 1380 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, 1381 true); 1382 ret = btrfs_inc_extent_ref(trans, &ref); 1383 if (ret) { 1384 btrfs_abort_transaction(trans, ret); 1385 break; 1386 } 1387 1388 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, 1389 blocksize, path->nodes[level]->start); 1390 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 1391 0, true); 1392 ret = btrfs_free_extent(trans, &ref); 1393 if (ret) { 1394 btrfs_abort_transaction(trans, ret); 1395 break; 1396 } 1397 1398 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, 1399 blocksize, 0); 1400 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 1401 0, true); 1402 ret = btrfs_free_extent(trans, &ref); 1403 if (ret) { 1404 btrfs_abort_transaction(trans, ret); 1405 break; 1406 } 1407 1408 btrfs_unlock_up_safe(path, 0); 1409 1410 ret = level; 1411 break; 1412 } 1413 btrfs_tree_unlock(parent); 1414 free_extent_buffer(parent); 1415 return ret; 1416 } 1417 1418 /* 1419 * helper to find next relocated block in reloc tree 1420 */ 1421 static noinline_for_stack 1422 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1423 int *level) 1424 { 1425 struct extent_buffer *eb; 1426 int i; 1427 u64 last_snapshot; 1428 u32 nritems; 1429 1430 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1431 1432 for (i = 0; i < *level; i++) { 1433 free_extent_buffer(path->nodes[i]); 1434 path->nodes[i] = NULL; 1435 } 1436 1437 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 1438 eb = path->nodes[i]; 1439 nritems = btrfs_header_nritems(eb); 1440 while (path->slots[i] + 1 < nritems) { 1441 path->slots[i]++; 1442 if (btrfs_node_ptr_generation(eb, path->slots[i]) <= 1443 last_snapshot) 1444 continue; 1445 1446 *level = i; 1447 return 0; 1448 } 1449 free_extent_buffer(path->nodes[i]); 1450 path->nodes[i] = NULL; 1451 } 1452 return 1; 1453 } 1454 1455 /* 1456 * walk down reloc tree to find relocated block of lowest level 1457 */ 1458 static noinline_for_stack 1459 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1460 int *level) 1461 { 1462 struct extent_buffer *eb = NULL; 1463 int i; 1464 u64 ptr_gen = 0; 1465 u64 last_snapshot; 1466 u32 nritems; 1467 1468 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1469 1470 for (i = *level; i > 0; i--) { 1471 eb = path->nodes[i]; 1472 nritems = btrfs_header_nritems(eb); 1473 while (path->slots[i] < nritems) { 1474 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); 1475 if (ptr_gen > last_snapshot) 1476 break; 1477 path->slots[i]++; 1478 } 1479 if (path->slots[i] >= nritems) { 1480 if (i == *level) 1481 break; 1482 *level = i + 1; 1483 return 0; 1484 } 1485 if (i == 1) { 1486 *level = i; 1487 return 0; 1488 } 1489 1490 eb = btrfs_read_node_slot(eb, path->slots[i]); 1491 if (IS_ERR(eb)) 1492 return PTR_ERR(eb); 1493 BUG_ON(btrfs_header_level(eb) != i - 1); 1494 path->nodes[i - 1] = eb; 1495 path->slots[i - 1] = 0; 1496 } 1497 return 1; 1498 } 1499 1500 /* 1501 * invalidate extent cache for file extents whose key in range of 1502 * [min_key, max_key) 1503 */ 1504 static int invalidate_extent_cache(struct btrfs_root *root, 1505 struct btrfs_key *min_key, 1506 struct btrfs_key *max_key) 1507 { 1508 struct btrfs_fs_info *fs_info = root->fs_info; 1509 struct inode *inode = NULL; 1510 u64 objectid; 1511 u64 start, end; 1512 u64 ino; 1513 1514 objectid = min_key->objectid; 1515 while (1) { 1516 cond_resched(); 1517 iput(inode); 1518 1519 if (objectid > max_key->objectid) 1520 break; 1521 1522 inode = find_next_inode(root, objectid); 1523 if (!inode) 1524 break; 1525 ino = btrfs_ino(BTRFS_I(inode)); 1526 1527 if (ino > max_key->objectid) { 1528 iput(inode); 1529 break; 1530 } 1531 1532 objectid = ino + 1; 1533 if (!S_ISREG(inode->i_mode)) 1534 continue; 1535 1536 if (unlikely(min_key->objectid == ino)) { 1537 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 1538 continue; 1539 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 1540 start = 0; 1541 else { 1542 start = min_key->offset; 1543 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); 1544 } 1545 } else { 1546 start = 0; 1547 } 1548 1549 if (unlikely(max_key->objectid == ino)) { 1550 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 1551 continue; 1552 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 1553 end = (u64)-1; 1554 } else { 1555 if (max_key->offset == 0) 1556 continue; 1557 end = max_key->offset; 1558 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1559 end--; 1560 } 1561 } else { 1562 end = (u64)-1; 1563 } 1564 1565 /* the lock_extent waits for readpage to complete */ 1566 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 1567 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); 1568 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 1569 } 1570 return 0; 1571 } 1572 1573 static int find_next_key(struct btrfs_path *path, int level, 1574 struct btrfs_key *key) 1575 1576 { 1577 while (level < BTRFS_MAX_LEVEL) { 1578 if (!path->nodes[level]) 1579 break; 1580 if (path->slots[level] + 1 < 1581 btrfs_header_nritems(path->nodes[level])) { 1582 btrfs_node_key_to_cpu(path->nodes[level], key, 1583 path->slots[level] + 1); 1584 return 0; 1585 } 1586 level++; 1587 } 1588 return 1; 1589 } 1590 1591 /* 1592 * Insert current subvolume into reloc_control::dirty_subvol_roots 1593 */ 1594 static int insert_dirty_subvol(struct btrfs_trans_handle *trans, 1595 struct reloc_control *rc, 1596 struct btrfs_root *root) 1597 { 1598 struct btrfs_root *reloc_root = root->reloc_root; 1599 struct btrfs_root_item *reloc_root_item; 1600 int ret; 1601 1602 /* @root must be a subvolume tree root with a valid reloc tree */ 1603 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1604 ASSERT(reloc_root); 1605 1606 reloc_root_item = &reloc_root->root_item; 1607 memset(&reloc_root_item->drop_progress, 0, 1608 sizeof(reloc_root_item->drop_progress)); 1609 btrfs_set_root_drop_level(reloc_root_item, 0); 1610 btrfs_set_root_refs(reloc_root_item, 0); 1611 ret = btrfs_update_reloc_root(trans, root); 1612 if (ret) 1613 return ret; 1614 1615 if (list_empty(&root->reloc_dirty_list)) { 1616 btrfs_grab_root(root); 1617 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 1618 } 1619 1620 return 0; 1621 } 1622 1623 static int clean_dirty_subvols(struct reloc_control *rc) 1624 { 1625 struct btrfs_root *root; 1626 struct btrfs_root *next; 1627 int ret = 0; 1628 int ret2; 1629 1630 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 1631 reloc_dirty_list) { 1632 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1633 /* Merged subvolume, cleanup its reloc root */ 1634 struct btrfs_root *reloc_root = root->reloc_root; 1635 1636 list_del_init(&root->reloc_dirty_list); 1637 root->reloc_root = NULL; 1638 /* 1639 * Need barrier to ensure clear_bit() only happens after 1640 * root->reloc_root = NULL. Pairs with have_reloc_root. 1641 */ 1642 smp_wmb(); 1643 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 1644 if (reloc_root) { 1645 /* 1646 * btrfs_drop_snapshot drops our ref we hold for 1647 * ->reloc_root. If it fails however we must 1648 * drop the ref ourselves. 1649 */ 1650 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); 1651 if (ret2 < 0) { 1652 btrfs_put_root(reloc_root); 1653 if (!ret) 1654 ret = ret2; 1655 } 1656 } 1657 btrfs_put_root(root); 1658 } else { 1659 /* Orphan reloc tree, just clean it up */ 1660 ret2 = btrfs_drop_snapshot(root, 0, 1); 1661 if (ret2 < 0) { 1662 btrfs_put_root(root); 1663 if (!ret) 1664 ret = ret2; 1665 } 1666 } 1667 } 1668 return ret; 1669 } 1670 1671 /* 1672 * merge the relocated tree blocks in reloc tree with corresponding 1673 * fs tree. 1674 */ 1675 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 1676 struct btrfs_root *root) 1677 { 1678 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 1679 struct btrfs_key key; 1680 struct btrfs_key next_key; 1681 struct btrfs_trans_handle *trans = NULL; 1682 struct btrfs_root *reloc_root; 1683 struct btrfs_root_item *root_item; 1684 struct btrfs_path *path; 1685 struct extent_buffer *leaf; 1686 int reserve_level; 1687 int level; 1688 int max_level; 1689 int replaced = 0; 1690 int ret = 0; 1691 u32 min_reserved; 1692 1693 path = btrfs_alloc_path(); 1694 if (!path) 1695 return -ENOMEM; 1696 path->reada = READA_FORWARD; 1697 1698 reloc_root = root->reloc_root; 1699 root_item = &reloc_root->root_item; 1700 1701 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 1702 level = btrfs_root_level(root_item); 1703 atomic_inc(&reloc_root->node->refs); 1704 path->nodes[level] = reloc_root->node; 1705 path->slots[level] = 0; 1706 } else { 1707 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 1708 1709 level = btrfs_root_drop_level(root_item); 1710 BUG_ON(level == 0); 1711 path->lowest_level = level; 1712 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 1713 path->lowest_level = 0; 1714 if (ret < 0) { 1715 btrfs_free_path(path); 1716 return ret; 1717 } 1718 1719 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 1720 path->slots[level]); 1721 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 1722 1723 btrfs_unlock_up_safe(path, 0); 1724 } 1725 1726 /* 1727 * In merge_reloc_root(), we modify the upper level pointer to swap the 1728 * tree blocks between reloc tree and subvolume tree. Thus for tree 1729 * block COW, we COW at most from level 1 to root level for each tree. 1730 * 1731 * Thus the needed metadata size is at most root_level * nodesize, 1732 * and * 2 since we have two trees to COW. 1733 */ 1734 reserve_level = max_t(int, 1, btrfs_root_level(root_item)); 1735 min_reserved = fs_info->nodesize * reserve_level * 2; 1736 memset(&next_key, 0, sizeof(next_key)); 1737 1738 while (1) { 1739 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 1740 BTRFS_RESERVE_FLUSH_LIMIT); 1741 if (ret) 1742 goto out; 1743 trans = btrfs_start_transaction(root, 0); 1744 if (IS_ERR(trans)) { 1745 ret = PTR_ERR(trans); 1746 trans = NULL; 1747 goto out; 1748 } 1749 1750 /* 1751 * At this point we no longer have a reloc_control, so we can't 1752 * depend on btrfs_init_reloc_root to update our last_trans. 1753 * 1754 * But that's ok, we started the trans handle on our 1755 * corresponding fs_root, which means it's been added to the 1756 * dirty list. At commit time we'll still call 1757 * btrfs_update_reloc_root() and update our root item 1758 * appropriately. 1759 */ 1760 reloc_root->last_trans = trans->transid; 1761 trans->block_rsv = rc->block_rsv; 1762 1763 replaced = 0; 1764 max_level = level; 1765 1766 ret = walk_down_reloc_tree(reloc_root, path, &level); 1767 if (ret < 0) 1768 goto out; 1769 if (ret > 0) 1770 break; 1771 1772 if (!find_next_key(path, level, &key) && 1773 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 1774 ret = 0; 1775 } else { 1776 ret = replace_path(trans, rc, root, reloc_root, path, 1777 &next_key, level, max_level); 1778 } 1779 if (ret < 0) 1780 goto out; 1781 if (ret > 0) { 1782 level = ret; 1783 btrfs_node_key_to_cpu(path->nodes[level], &key, 1784 path->slots[level]); 1785 replaced = 1; 1786 } 1787 1788 ret = walk_up_reloc_tree(reloc_root, path, &level); 1789 if (ret > 0) 1790 break; 1791 1792 BUG_ON(level == 0); 1793 /* 1794 * save the merging progress in the drop_progress. 1795 * this is OK since root refs == 1 in this case. 1796 */ 1797 btrfs_node_key(path->nodes[level], &root_item->drop_progress, 1798 path->slots[level]); 1799 btrfs_set_root_drop_level(root_item, level); 1800 1801 btrfs_end_transaction_throttle(trans); 1802 trans = NULL; 1803 1804 btrfs_btree_balance_dirty(fs_info); 1805 1806 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1807 invalidate_extent_cache(root, &key, &next_key); 1808 } 1809 1810 /* 1811 * handle the case only one block in the fs tree need to be 1812 * relocated and the block is tree root. 1813 */ 1814 leaf = btrfs_lock_root_node(root); 1815 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, 1816 BTRFS_NESTING_COW); 1817 btrfs_tree_unlock(leaf); 1818 free_extent_buffer(leaf); 1819 out: 1820 btrfs_free_path(path); 1821 1822 if (ret == 0) { 1823 ret = insert_dirty_subvol(trans, rc, root); 1824 if (ret) 1825 btrfs_abort_transaction(trans, ret); 1826 } 1827 1828 if (trans) 1829 btrfs_end_transaction_throttle(trans); 1830 1831 btrfs_btree_balance_dirty(fs_info); 1832 1833 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1834 invalidate_extent_cache(root, &key, &next_key); 1835 1836 return ret; 1837 } 1838 1839 static noinline_for_stack 1840 int prepare_to_merge(struct reloc_control *rc, int err) 1841 { 1842 struct btrfs_root *root = rc->extent_root; 1843 struct btrfs_fs_info *fs_info = root->fs_info; 1844 struct btrfs_root *reloc_root; 1845 struct btrfs_trans_handle *trans; 1846 LIST_HEAD(reloc_roots); 1847 u64 num_bytes = 0; 1848 int ret; 1849 1850 mutex_lock(&fs_info->reloc_mutex); 1851 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 1852 rc->merging_rsv_size += rc->nodes_relocated * 2; 1853 mutex_unlock(&fs_info->reloc_mutex); 1854 1855 again: 1856 if (!err) { 1857 num_bytes = rc->merging_rsv_size; 1858 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 1859 BTRFS_RESERVE_FLUSH_ALL); 1860 if (ret) 1861 err = ret; 1862 } 1863 1864 trans = btrfs_join_transaction(rc->extent_root); 1865 if (IS_ERR(trans)) { 1866 if (!err) 1867 btrfs_block_rsv_release(fs_info, rc->block_rsv, 1868 num_bytes, NULL); 1869 return PTR_ERR(trans); 1870 } 1871 1872 if (!err) { 1873 if (num_bytes != rc->merging_rsv_size) { 1874 btrfs_end_transaction(trans); 1875 btrfs_block_rsv_release(fs_info, rc->block_rsv, 1876 num_bytes, NULL); 1877 goto again; 1878 } 1879 } 1880 1881 rc->merge_reloc_tree = 1; 1882 1883 while (!list_empty(&rc->reloc_roots)) { 1884 reloc_root = list_entry(rc->reloc_roots.next, 1885 struct btrfs_root, root_list); 1886 list_del_init(&reloc_root->root_list); 1887 1888 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 1889 false); 1890 if (IS_ERR(root)) { 1891 /* 1892 * Even if we have an error we need this reloc root 1893 * back on our list so we can clean up properly. 1894 */ 1895 list_add(&reloc_root->root_list, &reloc_roots); 1896 btrfs_abort_transaction(trans, (int)PTR_ERR(root)); 1897 if (!err) 1898 err = PTR_ERR(root); 1899 break; 1900 } 1901 ASSERT(root->reloc_root == reloc_root); 1902 1903 /* 1904 * set reference count to 1, so btrfs_recover_relocation 1905 * knows it should resumes merging 1906 */ 1907 if (!err) 1908 btrfs_set_root_refs(&reloc_root->root_item, 1); 1909 ret = btrfs_update_reloc_root(trans, root); 1910 1911 /* 1912 * Even if we have an error we need this reloc root back on our 1913 * list so we can clean up properly. 1914 */ 1915 list_add(&reloc_root->root_list, &reloc_roots); 1916 btrfs_put_root(root); 1917 1918 if (ret) { 1919 btrfs_abort_transaction(trans, ret); 1920 if (!err) 1921 err = ret; 1922 break; 1923 } 1924 } 1925 1926 list_splice(&reloc_roots, &rc->reloc_roots); 1927 1928 if (!err) 1929 err = btrfs_commit_transaction(trans); 1930 else 1931 btrfs_end_transaction(trans); 1932 return err; 1933 } 1934 1935 static noinline_for_stack 1936 void free_reloc_roots(struct list_head *list) 1937 { 1938 struct btrfs_root *reloc_root, *tmp; 1939 1940 list_for_each_entry_safe(reloc_root, tmp, list, root_list) 1941 __del_reloc_root(reloc_root); 1942 } 1943 1944 static noinline_for_stack 1945 void merge_reloc_roots(struct reloc_control *rc) 1946 { 1947 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 1948 struct btrfs_root *root; 1949 struct btrfs_root *reloc_root; 1950 LIST_HEAD(reloc_roots); 1951 int found = 0; 1952 int ret = 0; 1953 again: 1954 root = rc->extent_root; 1955 1956 /* 1957 * this serializes us with btrfs_record_root_in_transaction, 1958 * we have to make sure nobody is in the middle of 1959 * adding their roots to the list while we are 1960 * doing this splice 1961 */ 1962 mutex_lock(&fs_info->reloc_mutex); 1963 list_splice_init(&rc->reloc_roots, &reloc_roots); 1964 mutex_unlock(&fs_info->reloc_mutex); 1965 1966 while (!list_empty(&reloc_roots)) { 1967 found = 1; 1968 reloc_root = list_entry(reloc_roots.next, 1969 struct btrfs_root, root_list); 1970 1971 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 1972 false); 1973 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 1974 if (IS_ERR(root)) { 1975 /* 1976 * For recovery we read the fs roots on mount, 1977 * and if we didn't find the root then we marked 1978 * the reloc root as a garbage root. For normal 1979 * relocation obviously the root should exist in 1980 * memory. However there's no reason we can't 1981 * handle the error properly here just in case. 1982 */ 1983 ASSERT(0); 1984 ret = PTR_ERR(root); 1985 goto out; 1986 } 1987 if (root->reloc_root != reloc_root) { 1988 /* 1989 * This is actually impossible without something 1990 * going really wrong (like weird race condition 1991 * or cosmic rays). 1992 */ 1993 ASSERT(0); 1994 ret = -EINVAL; 1995 goto out; 1996 } 1997 ret = merge_reloc_root(rc, root); 1998 btrfs_put_root(root); 1999 if (ret) { 2000 if (list_empty(&reloc_root->root_list)) 2001 list_add_tail(&reloc_root->root_list, 2002 &reloc_roots); 2003 goto out; 2004 } 2005 } else { 2006 if (!IS_ERR(root)) { 2007 if (root->reloc_root == reloc_root) { 2008 root->reloc_root = NULL; 2009 btrfs_put_root(reloc_root); 2010 } 2011 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, 2012 &root->state); 2013 btrfs_put_root(root); 2014 } 2015 2016 list_del_init(&reloc_root->root_list); 2017 /* Don't forget to queue this reloc root for cleanup */ 2018 list_add_tail(&reloc_root->reloc_dirty_list, 2019 &rc->dirty_subvol_roots); 2020 } 2021 } 2022 2023 if (found) { 2024 found = 0; 2025 goto again; 2026 } 2027 out: 2028 if (ret) { 2029 btrfs_handle_fs_error(fs_info, ret, NULL); 2030 free_reloc_roots(&reloc_roots); 2031 2032 /* new reloc root may be added */ 2033 mutex_lock(&fs_info->reloc_mutex); 2034 list_splice_init(&rc->reloc_roots, &reloc_roots); 2035 mutex_unlock(&fs_info->reloc_mutex); 2036 free_reloc_roots(&reloc_roots); 2037 } 2038 2039 /* 2040 * We used to have 2041 * 2042 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2043 * 2044 * here, but it's wrong. If we fail to start the transaction in 2045 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 2046 * have actually been removed from the reloc_root_tree rb tree. This is 2047 * fine because we're bailing here, and we hold a reference on the root 2048 * for the list that holds it, so these roots will be cleaned up when we 2049 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 2050 * will be cleaned up on unmount. 2051 * 2052 * The remaining nodes will be cleaned up by free_reloc_control. 2053 */ 2054 } 2055 2056 static void free_block_list(struct rb_root *blocks) 2057 { 2058 struct tree_block *block; 2059 struct rb_node *rb_node; 2060 while ((rb_node = rb_first(blocks))) { 2061 block = rb_entry(rb_node, struct tree_block, rb_node); 2062 rb_erase(rb_node, blocks); 2063 kfree(block); 2064 } 2065 } 2066 2067 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2068 struct btrfs_root *reloc_root) 2069 { 2070 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2071 struct btrfs_root *root; 2072 int ret; 2073 2074 if (reloc_root->last_trans == trans->transid) 2075 return 0; 2076 2077 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); 2078 2079 /* 2080 * This should succeed, since we can't have a reloc root without having 2081 * already looked up the actual root and created the reloc root for this 2082 * root. 2083 * 2084 * However if there's some sort of corruption where we have a ref to a 2085 * reloc root without a corresponding root this could return ENOENT. 2086 */ 2087 if (IS_ERR(root)) { 2088 ASSERT(0); 2089 return PTR_ERR(root); 2090 } 2091 if (root->reloc_root != reloc_root) { 2092 ASSERT(0); 2093 btrfs_err(fs_info, 2094 "root %llu has two reloc roots associated with it", 2095 reloc_root->root_key.offset); 2096 btrfs_put_root(root); 2097 return -EUCLEAN; 2098 } 2099 ret = btrfs_record_root_in_trans(trans, root); 2100 btrfs_put_root(root); 2101 2102 return ret; 2103 } 2104 2105 static noinline_for_stack 2106 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2107 struct reloc_control *rc, 2108 struct btrfs_backref_node *node, 2109 struct btrfs_backref_edge *edges[]) 2110 { 2111 struct btrfs_backref_node *next; 2112 struct btrfs_root *root; 2113 int index = 0; 2114 int ret; 2115 2116 next = node; 2117 while (1) { 2118 cond_resched(); 2119 next = walk_up_backref(next, edges, &index); 2120 root = next->root; 2121 2122 /* 2123 * If there is no root, then our references for this block are 2124 * incomplete, as we should be able to walk all the way up to a 2125 * block that is owned by a root. 2126 * 2127 * This path is only for SHAREABLE roots, so if we come upon a 2128 * non-SHAREABLE root then we have backrefs that resolve 2129 * improperly. 2130 * 2131 * Both of these cases indicate file system corruption, or a bug 2132 * in the backref walking code. 2133 */ 2134 if (!root) { 2135 ASSERT(0); 2136 btrfs_err(trans->fs_info, 2137 "bytenr %llu doesn't have a backref path ending in a root", 2138 node->bytenr); 2139 return ERR_PTR(-EUCLEAN); 2140 } 2141 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2142 ASSERT(0); 2143 btrfs_err(trans->fs_info, 2144 "bytenr %llu has multiple refs with one ending in a non-shareable root", 2145 node->bytenr); 2146 return ERR_PTR(-EUCLEAN); 2147 } 2148 2149 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2150 ret = record_reloc_root_in_trans(trans, root); 2151 if (ret) 2152 return ERR_PTR(ret); 2153 break; 2154 } 2155 2156 ret = btrfs_record_root_in_trans(trans, root); 2157 if (ret) 2158 return ERR_PTR(ret); 2159 root = root->reloc_root; 2160 2161 /* 2162 * We could have raced with another thread which failed, so 2163 * root->reloc_root may not be set, return ENOENT in this case. 2164 */ 2165 if (!root) 2166 return ERR_PTR(-ENOENT); 2167 2168 if (next->new_bytenr != root->node->start) { 2169 /* 2170 * We just created the reloc root, so we shouldn't have 2171 * ->new_bytenr set and this shouldn't be in the changed 2172 * list. If it is then we have multiple roots pointing 2173 * at the same bytenr which indicates corruption, or 2174 * we've made a mistake in the backref walking code. 2175 */ 2176 ASSERT(next->new_bytenr == 0); 2177 ASSERT(list_empty(&next->list)); 2178 if (next->new_bytenr || !list_empty(&next->list)) { 2179 btrfs_err(trans->fs_info, 2180 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2181 node->bytenr, next->bytenr); 2182 return ERR_PTR(-EUCLEAN); 2183 } 2184 2185 next->new_bytenr = root->node->start; 2186 btrfs_put_root(next->root); 2187 next->root = btrfs_grab_root(root); 2188 ASSERT(next->root); 2189 list_add_tail(&next->list, 2190 &rc->backref_cache.changed); 2191 mark_block_processed(rc, next); 2192 break; 2193 } 2194 2195 WARN_ON(1); 2196 root = NULL; 2197 next = walk_down_backref(edges, &index); 2198 if (!next || next->level <= node->level) 2199 break; 2200 } 2201 if (!root) { 2202 /* 2203 * This can happen if there's fs corruption or if there's a bug 2204 * in the backref lookup code. 2205 */ 2206 ASSERT(0); 2207 return ERR_PTR(-ENOENT); 2208 } 2209 2210 next = node; 2211 /* setup backref node path for btrfs_reloc_cow_block */ 2212 while (1) { 2213 rc->backref_cache.path[next->level] = next; 2214 if (--index < 0) 2215 break; 2216 next = edges[index]->node[UPPER]; 2217 } 2218 return root; 2219 } 2220 2221 /* 2222 * Select a tree root for relocation. 2223 * 2224 * Return NULL if the block is not shareable. We should use do_relocation() in 2225 * this case. 2226 * 2227 * Return a tree root pointer if the block is shareable. 2228 * Return -ENOENT if the block is root of reloc tree. 2229 */ 2230 static noinline_for_stack 2231 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2232 { 2233 struct btrfs_backref_node *next; 2234 struct btrfs_root *root; 2235 struct btrfs_root *fs_root = NULL; 2236 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2237 int index = 0; 2238 2239 next = node; 2240 while (1) { 2241 cond_resched(); 2242 next = walk_up_backref(next, edges, &index); 2243 root = next->root; 2244 2245 /* 2246 * This can occur if we have incomplete extent refs leading all 2247 * the way up a particular path, in this case return -EUCLEAN. 2248 */ 2249 if (!root) 2250 return ERR_PTR(-EUCLEAN); 2251 2252 /* No other choice for non-shareable tree */ 2253 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 2254 return root; 2255 2256 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2257 fs_root = root; 2258 2259 if (next != node) 2260 return NULL; 2261 2262 next = walk_down_backref(edges, &index); 2263 if (!next || next->level <= node->level) 2264 break; 2265 } 2266 2267 if (!fs_root) 2268 return ERR_PTR(-ENOENT); 2269 return fs_root; 2270 } 2271 2272 static noinline_for_stack 2273 u64 calcu_metadata_size(struct reloc_control *rc, 2274 struct btrfs_backref_node *node, int reserve) 2275 { 2276 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2277 struct btrfs_backref_node *next = node; 2278 struct btrfs_backref_edge *edge; 2279 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2280 u64 num_bytes = 0; 2281 int index = 0; 2282 2283 BUG_ON(reserve && node->processed); 2284 2285 while (next) { 2286 cond_resched(); 2287 while (1) { 2288 if (next->processed && (reserve || next != node)) 2289 break; 2290 2291 num_bytes += fs_info->nodesize; 2292 2293 if (list_empty(&next->upper)) 2294 break; 2295 2296 edge = list_entry(next->upper.next, 2297 struct btrfs_backref_edge, list[LOWER]); 2298 edges[index++] = edge; 2299 next = edge->node[UPPER]; 2300 } 2301 next = walk_down_backref(edges, &index); 2302 } 2303 return num_bytes; 2304 } 2305 2306 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2307 struct reloc_control *rc, 2308 struct btrfs_backref_node *node) 2309 { 2310 struct btrfs_root *root = rc->extent_root; 2311 struct btrfs_fs_info *fs_info = root->fs_info; 2312 u64 num_bytes; 2313 int ret; 2314 u64 tmp; 2315 2316 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2317 2318 trans->block_rsv = rc->block_rsv; 2319 rc->reserved_bytes += num_bytes; 2320 2321 /* 2322 * We are under a transaction here so we can only do limited flushing. 2323 * If we get an enospc just kick back -EAGAIN so we know to drop the 2324 * transaction and try to refill when we can flush all the things. 2325 */ 2326 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, 2327 BTRFS_RESERVE_FLUSH_LIMIT); 2328 if (ret) { 2329 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2330 while (tmp <= rc->reserved_bytes) 2331 tmp <<= 1; 2332 /* 2333 * only one thread can access block_rsv at this point, 2334 * so we don't need hold lock to protect block_rsv. 2335 * we expand more reservation size here to allow enough 2336 * space for relocation and we will return earlier in 2337 * enospc case. 2338 */ 2339 rc->block_rsv->size = tmp + fs_info->nodesize * 2340 RELOCATION_RESERVED_NODES; 2341 return -EAGAIN; 2342 } 2343 2344 return 0; 2345 } 2346 2347 /* 2348 * relocate a block tree, and then update pointers in upper level 2349 * blocks that reference the block to point to the new location. 2350 * 2351 * if called by link_to_upper, the block has already been relocated. 2352 * in that case this function just updates pointers. 2353 */ 2354 static int do_relocation(struct btrfs_trans_handle *trans, 2355 struct reloc_control *rc, 2356 struct btrfs_backref_node *node, 2357 struct btrfs_key *key, 2358 struct btrfs_path *path, int lowest) 2359 { 2360 struct btrfs_backref_node *upper; 2361 struct btrfs_backref_edge *edge; 2362 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2363 struct btrfs_root *root; 2364 struct extent_buffer *eb; 2365 u32 blocksize; 2366 u64 bytenr; 2367 int slot; 2368 int ret = 0; 2369 2370 /* 2371 * If we are lowest then this is the first time we're processing this 2372 * block, and thus shouldn't have an eb associated with it yet. 2373 */ 2374 ASSERT(!lowest || !node->eb); 2375 2376 path->lowest_level = node->level + 1; 2377 rc->backref_cache.path[node->level] = node; 2378 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2379 struct btrfs_ref ref = { 0 }; 2380 2381 cond_resched(); 2382 2383 upper = edge->node[UPPER]; 2384 root = select_reloc_root(trans, rc, upper, edges); 2385 if (IS_ERR(root)) { 2386 ret = PTR_ERR(root); 2387 goto next; 2388 } 2389 2390 if (upper->eb && !upper->locked) { 2391 if (!lowest) { 2392 ret = btrfs_bin_search(upper->eb, key, &slot); 2393 if (ret < 0) 2394 goto next; 2395 BUG_ON(ret); 2396 bytenr = btrfs_node_blockptr(upper->eb, slot); 2397 if (node->eb->start == bytenr) 2398 goto next; 2399 } 2400 btrfs_backref_drop_node_buffer(upper); 2401 } 2402 2403 if (!upper->eb) { 2404 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2405 if (ret) { 2406 if (ret > 0) 2407 ret = -ENOENT; 2408 2409 btrfs_release_path(path); 2410 break; 2411 } 2412 2413 if (!upper->eb) { 2414 upper->eb = path->nodes[upper->level]; 2415 path->nodes[upper->level] = NULL; 2416 } else { 2417 BUG_ON(upper->eb != path->nodes[upper->level]); 2418 } 2419 2420 upper->locked = 1; 2421 path->locks[upper->level] = 0; 2422 2423 slot = path->slots[upper->level]; 2424 btrfs_release_path(path); 2425 } else { 2426 ret = btrfs_bin_search(upper->eb, key, &slot); 2427 if (ret < 0) 2428 goto next; 2429 BUG_ON(ret); 2430 } 2431 2432 bytenr = btrfs_node_blockptr(upper->eb, slot); 2433 if (lowest) { 2434 if (bytenr != node->bytenr) { 2435 btrfs_err(root->fs_info, 2436 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2437 bytenr, node->bytenr, slot, 2438 upper->eb->start); 2439 ret = -EIO; 2440 goto next; 2441 } 2442 } else { 2443 if (node->eb->start == bytenr) 2444 goto next; 2445 } 2446 2447 blocksize = root->fs_info->nodesize; 2448 eb = btrfs_read_node_slot(upper->eb, slot); 2449 if (IS_ERR(eb)) { 2450 ret = PTR_ERR(eb); 2451 goto next; 2452 } 2453 btrfs_tree_lock(eb); 2454 2455 if (!node->eb) { 2456 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2457 slot, &eb, BTRFS_NESTING_COW); 2458 btrfs_tree_unlock(eb); 2459 free_extent_buffer(eb); 2460 if (ret < 0) 2461 goto next; 2462 /* 2463 * We've just COWed this block, it should have updated 2464 * the correct backref node entry. 2465 */ 2466 ASSERT(node->eb == eb); 2467 } else { 2468 btrfs_set_node_blockptr(upper->eb, slot, 2469 node->eb->start); 2470 btrfs_set_node_ptr_generation(upper->eb, slot, 2471 trans->transid); 2472 btrfs_mark_buffer_dirty(upper->eb); 2473 2474 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2475 node->eb->start, blocksize, 2476 upper->eb->start); 2477 btrfs_init_tree_ref(&ref, node->level, 2478 btrfs_header_owner(upper->eb), 2479 root->root_key.objectid, false); 2480 ret = btrfs_inc_extent_ref(trans, &ref); 2481 if (!ret) 2482 ret = btrfs_drop_subtree(trans, root, eb, 2483 upper->eb); 2484 if (ret) 2485 btrfs_abort_transaction(trans, ret); 2486 } 2487 next: 2488 if (!upper->pending) 2489 btrfs_backref_drop_node_buffer(upper); 2490 else 2491 btrfs_backref_unlock_node_buffer(upper); 2492 if (ret) 2493 break; 2494 } 2495 2496 if (!ret && node->pending) { 2497 btrfs_backref_drop_node_buffer(node); 2498 list_move_tail(&node->list, &rc->backref_cache.changed); 2499 node->pending = 0; 2500 } 2501 2502 path->lowest_level = 0; 2503 2504 /* 2505 * We should have allocated all of our space in the block rsv and thus 2506 * shouldn't ENOSPC. 2507 */ 2508 ASSERT(ret != -ENOSPC); 2509 return ret; 2510 } 2511 2512 static int link_to_upper(struct btrfs_trans_handle *trans, 2513 struct reloc_control *rc, 2514 struct btrfs_backref_node *node, 2515 struct btrfs_path *path) 2516 { 2517 struct btrfs_key key; 2518 2519 btrfs_node_key_to_cpu(node->eb, &key, 0); 2520 return do_relocation(trans, rc, node, &key, path, 0); 2521 } 2522 2523 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2524 struct reloc_control *rc, 2525 struct btrfs_path *path, int err) 2526 { 2527 LIST_HEAD(list); 2528 struct btrfs_backref_cache *cache = &rc->backref_cache; 2529 struct btrfs_backref_node *node; 2530 int level; 2531 int ret; 2532 2533 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2534 while (!list_empty(&cache->pending[level])) { 2535 node = list_entry(cache->pending[level].next, 2536 struct btrfs_backref_node, list); 2537 list_move_tail(&node->list, &list); 2538 BUG_ON(!node->pending); 2539 2540 if (!err) { 2541 ret = link_to_upper(trans, rc, node, path); 2542 if (ret < 0) 2543 err = ret; 2544 } 2545 } 2546 list_splice_init(&list, &cache->pending[level]); 2547 } 2548 return err; 2549 } 2550 2551 /* 2552 * mark a block and all blocks directly/indirectly reference the block 2553 * as processed. 2554 */ 2555 static void update_processed_blocks(struct reloc_control *rc, 2556 struct btrfs_backref_node *node) 2557 { 2558 struct btrfs_backref_node *next = node; 2559 struct btrfs_backref_edge *edge; 2560 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2561 int index = 0; 2562 2563 while (next) { 2564 cond_resched(); 2565 while (1) { 2566 if (next->processed) 2567 break; 2568 2569 mark_block_processed(rc, next); 2570 2571 if (list_empty(&next->upper)) 2572 break; 2573 2574 edge = list_entry(next->upper.next, 2575 struct btrfs_backref_edge, list[LOWER]); 2576 edges[index++] = edge; 2577 next = edge->node[UPPER]; 2578 } 2579 next = walk_down_backref(edges, &index); 2580 } 2581 } 2582 2583 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2584 { 2585 u32 blocksize = rc->extent_root->fs_info->nodesize; 2586 2587 if (test_range_bit(&rc->processed_blocks, bytenr, 2588 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2589 return 1; 2590 return 0; 2591 } 2592 2593 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2594 struct tree_block *block) 2595 { 2596 struct extent_buffer *eb; 2597 2598 eb = read_tree_block(fs_info, block->bytenr, block->owner, 2599 block->key.offset, block->level, NULL); 2600 if (IS_ERR(eb)) { 2601 return PTR_ERR(eb); 2602 } else if (!extent_buffer_uptodate(eb)) { 2603 free_extent_buffer(eb); 2604 return -EIO; 2605 } 2606 if (block->level == 0) 2607 btrfs_item_key_to_cpu(eb, &block->key, 0); 2608 else 2609 btrfs_node_key_to_cpu(eb, &block->key, 0); 2610 free_extent_buffer(eb); 2611 block->key_ready = 1; 2612 return 0; 2613 } 2614 2615 /* 2616 * helper function to relocate a tree block 2617 */ 2618 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2619 struct reloc_control *rc, 2620 struct btrfs_backref_node *node, 2621 struct btrfs_key *key, 2622 struct btrfs_path *path) 2623 { 2624 struct btrfs_root *root; 2625 int ret = 0; 2626 2627 if (!node) 2628 return 0; 2629 2630 /* 2631 * If we fail here we want to drop our backref_node because we are going 2632 * to start over and regenerate the tree for it. 2633 */ 2634 ret = reserve_metadata_space(trans, rc, node); 2635 if (ret) 2636 goto out; 2637 2638 BUG_ON(node->processed); 2639 root = select_one_root(node); 2640 if (IS_ERR(root)) { 2641 ret = PTR_ERR(root); 2642 2643 /* See explanation in select_one_root for the -EUCLEAN case. */ 2644 ASSERT(ret == -ENOENT); 2645 if (ret == -ENOENT) { 2646 ret = 0; 2647 update_processed_blocks(rc, node); 2648 } 2649 goto out; 2650 } 2651 2652 if (root) { 2653 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2654 /* 2655 * This block was the root block of a root, and this is 2656 * the first time we're processing the block and thus it 2657 * should not have had the ->new_bytenr modified and 2658 * should have not been included on the changed list. 2659 * 2660 * However in the case of corruption we could have 2661 * multiple refs pointing to the same block improperly, 2662 * and thus we would trip over these checks. ASSERT() 2663 * for the developer case, because it could indicate a 2664 * bug in the backref code, however error out for a 2665 * normal user in the case of corruption. 2666 */ 2667 ASSERT(node->new_bytenr == 0); 2668 ASSERT(list_empty(&node->list)); 2669 if (node->new_bytenr || !list_empty(&node->list)) { 2670 btrfs_err(root->fs_info, 2671 "bytenr %llu has improper references to it", 2672 node->bytenr); 2673 ret = -EUCLEAN; 2674 goto out; 2675 } 2676 ret = btrfs_record_root_in_trans(trans, root); 2677 if (ret) 2678 goto out; 2679 /* 2680 * Another thread could have failed, need to check if we 2681 * have reloc_root actually set. 2682 */ 2683 if (!root->reloc_root) { 2684 ret = -ENOENT; 2685 goto out; 2686 } 2687 root = root->reloc_root; 2688 node->new_bytenr = root->node->start; 2689 btrfs_put_root(node->root); 2690 node->root = btrfs_grab_root(root); 2691 ASSERT(node->root); 2692 list_add_tail(&node->list, &rc->backref_cache.changed); 2693 } else { 2694 path->lowest_level = node->level; 2695 if (root == root->fs_info->chunk_root) 2696 btrfs_reserve_chunk_metadata(trans, false); 2697 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2698 btrfs_release_path(path); 2699 if (root == root->fs_info->chunk_root) 2700 btrfs_trans_release_chunk_metadata(trans); 2701 if (ret > 0) 2702 ret = 0; 2703 } 2704 if (!ret) 2705 update_processed_blocks(rc, node); 2706 } else { 2707 ret = do_relocation(trans, rc, node, key, path, 1); 2708 } 2709 out: 2710 if (ret || node->level == 0 || node->cowonly) 2711 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2712 return ret; 2713 } 2714 2715 /* 2716 * relocate a list of blocks 2717 */ 2718 static noinline_for_stack 2719 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2720 struct reloc_control *rc, struct rb_root *blocks) 2721 { 2722 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2723 struct btrfs_backref_node *node; 2724 struct btrfs_path *path; 2725 struct tree_block *block; 2726 struct tree_block *next; 2727 int ret; 2728 int err = 0; 2729 2730 path = btrfs_alloc_path(); 2731 if (!path) { 2732 err = -ENOMEM; 2733 goto out_free_blocks; 2734 } 2735 2736 /* Kick in readahead for tree blocks with missing keys */ 2737 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2738 if (!block->key_ready) 2739 btrfs_readahead_tree_block(fs_info, block->bytenr, 2740 block->owner, 0, 2741 block->level); 2742 } 2743 2744 /* Get first keys */ 2745 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2746 if (!block->key_ready) { 2747 err = get_tree_block_key(fs_info, block); 2748 if (err) 2749 goto out_free_path; 2750 } 2751 } 2752 2753 /* Do tree relocation */ 2754 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2755 node = build_backref_tree(rc, &block->key, 2756 block->level, block->bytenr); 2757 if (IS_ERR(node)) { 2758 err = PTR_ERR(node); 2759 goto out; 2760 } 2761 2762 ret = relocate_tree_block(trans, rc, node, &block->key, 2763 path); 2764 if (ret < 0) { 2765 err = ret; 2766 break; 2767 } 2768 } 2769 out: 2770 err = finish_pending_nodes(trans, rc, path, err); 2771 2772 out_free_path: 2773 btrfs_free_path(path); 2774 out_free_blocks: 2775 free_block_list(blocks); 2776 return err; 2777 } 2778 2779 static noinline_for_stack int prealloc_file_extent_cluster( 2780 struct btrfs_inode *inode, 2781 struct file_extent_cluster *cluster) 2782 { 2783 u64 alloc_hint = 0; 2784 u64 start; 2785 u64 end; 2786 u64 offset = inode->index_cnt; 2787 u64 num_bytes; 2788 int nr; 2789 int ret = 0; 2790 u64 i_size = i_size_read(&inode->vfs_inode); 2791 u64 prealloc_start = cluster->start - offset; 2792 u64 prealloc_end = cluster->end - offset; 2793 u64 cur_offset = prealloc_start; 2794 2795 /* 2796 * For subpage case, previous i_size may not be aligned to PAGE_SIZE. 2797 * This means the range [i_size, PAGE_END + 1) is filled with zeros by 2798 * btrfs_do_readpage() call of previously relocated file cluster. 2799 * 2800 * If the current cluster starts in the above range, btrfs_do_readpage() 2801 * will skip the read, and relocate_one_page() will later writeback 2802 * the padding zeros as new data, causing data corruption. 2803 * 2804 * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 2805 */ 2806 if (!IS_ALIGNED(i_size, PAGE_SIZE)) { 2807 struct address_space *mapping = inode->vfs_inode.i_mapping; 2808 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2809 const u32 sectorsize = fs_info->sectorsize; 2810 struct page *page; 2811 2812 ASSERT(sectorsize < PAGE_SIZE); 2813 ASSERT(IS_ALIGNED(i_size, sectorsize)); 2814 2815 /* 2816 * Subpage can't handle page with DIRTY but without UPTODATE 2817 * bit as it can lead to the following deadlock: 2818 * 2819 * btrfs_readpage() 2820 * | Page already *locked* 2821 * |- btrfs_lock_and_flush_ordered_range() 2822 * |- btrfs_start_ordered_extent() 2823 * |- extent_write_cache_pages() 2824 * |- lock_page() 2825 * We try to lock the page we already hold. 2826 * 2827 * Here we just writeback the whole data reloc inode, so that 2828 * we will be ensured to have no dirty range in the page, and 2829 * are safe to clear the uptodate bits. 2830 * 2831 * This shouldn't cause too much overhead, as we need to write 2832 * the data back anyway. 2833 */ 2834 ret = filemap_write_and_wait(mapping); 2835 if (ret < 0) 2836 return ret; 2837 2838 clear_extent_bits(&inode->io_tree, i_size, 2839 round_up(i_size, PAGE_SIZE) - 1, 2840 EXTENT_UPTODATE); 2841 page = find_lock_page(mapping, i_size >> PAGE_SHIFT); 2842 /* 2843 * If page is freed we don't need to do anything then, as we 2844 * will re-read the whole page anyway. 2845 */ 2846 if (page) { 2847 btrfs_subpage_clear_uptodate(fs_info, page, i_size, 2848 round_up(i_size, PAGE_SIZE) - i_size); 2849 unlock_page(page); 2850 put_page(page); 2851 } 2852 } 2853 2854 BUG_ON(cluster->start != cluster->boundary[0]); 2855 ret = btrfs_alloc_data_chunk_ondemand(inode, 2856 prealloc_end + 1 - prealloc_start); 2857 if (ret) 2858 return ret; 2859 2860 btrfs_inode_lock(&inode->vfs_inode, 0); 2861 for (nr = 0; nr < cluster->nr; nr++) { 2862 start = cluster->boundary[nr] - offset; 2863 if (nr + 1 < cluster->nr) 2864 end = cluster->boundary[nr + 1] - 1 - offset; 2865 else 2866 end = cluster->end - offset; 2867 2868 lock_extent(&inode->io_tree, start, end); 2869 num_bytes = end + 1 - start; 2870 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2871 num_bytes, num_bytes, 2872 end + 1, &alloc_hint); 2873 cur_offset = end + 1; 2874 unlock_extent(&inode->io_tree, start, end); 2875 if (ret) 2876 break; 2877 } 2878 btrfs_inode_unlock(&inode->vfs_inode, 0); 2879 2880 if (cur_offset < prealloc_end) 2881 btrfs_free_reserved_data_space_noquota(inode->root->fs_info, 2882 prealloc_end + 1 - cur_offset); 2883 return ret; 2884 } 2885 2886 static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, 2887 u64 start, u64 end, u64 block_start) 2888 { 2889 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2890 struct extent_map *em; 2891 int ret = 0; 2892 2893 em = alloc_extent_map(); 2894 if (!em) 2895 return -ENOMEM; 2896 2897 em->start = start; 2898 em->len = end + 1 - start; 2899 em->block_len = em->len; 2900 em->block_start = block_start; 2901 set_bit(EXTENT_FLAG_PINNED, &em->flags); 2902 2903 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2904 while (1) { 2905 write_lock(&em_tree->lock); 2906 ret = add_extent_mapping(em_tree, em, 0); 2907 write_unlock(&em_tree->lock); 2908 if (ret != -EEXIST) { 2909 free_extent_map(em); 2910 break; 2911 } 2912 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 2913 } 2914 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2915 return ret; 2916 } 2917 2918 /* 2919 * Allow error injection to test balance/relocation cancellation 2920 */ 2921 noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) 2922 { 2923 return atomic_read(&fs_info->balance_cancel_req) || 2924 atomic_read(&fs_info->reloc_cancel_req) || 2925 fatal_signal_pending(current); 2926 } 2927 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2928 2929 static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, 2930 int cluster_nr) 2931 { 2932 /* Last extent, use cluster end directly */ 2933 if (cluster_nr >= cluster->nr - 1) 2934 return cluster->end; 2935 2936 /* Use next boundary start*/ 2937 return cluster->boundary[cluster_nr + 1] - 1; 2938 } 2939 2940 static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, 2941 struct file_extent_cluster *cluster, 2942 int *cluster_nr, unsigned long page_index) 2943 { 2944 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2945 u64 offset = BTRFS_I(inode)->index_cnt; 2946 const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; 2947 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2948 struct page *page; 2949 u64 page_start; 2950 u64 page_end; 2951 u64 cur; 2952 int ret; 2953 2954 ASSERT(page_index <= last_index); 2955 page = find_lock_page(inode->i_mapping, page_index); 2956 if (!page) { 2957 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 2958 page_index, last_index + 1 - page_index); 2959 page = find_or_create_page(inode->i_mapping, page_index, mask); 2960 if (!page) 2961 return -ENOMEM; 2962 } 2963 ret = set_page_extent_mapped(page); 2964 if (ret < 0) 2965 goto release_page; 2966 2967 if (PageReadahead(page)) 2968 page_cache_async_readahead(inode->i_mapping, ra, NULL, page, 2969 page_index, last_index + 1 - page_index); 2970 2971 if (!PageUptodate(page)) { 2972 btrfs_readpage(NULL, page); 2973 lock_page(page); 2974 if (!PageUptodate(page)) { 2975 ret = -EIO; 2976 goto release_page; 2977 } 2978 } 2979 2980 page_start = page_offset(page); 2981 page_end = page_start + PAGE_SIZE - 1; 2982 2983 /* 2984 * Start from the cluster, as for subpage case, the cluster can start 2985 * inside the page. 2986 */ 2987 cur = max(page_start, cluster->boundary[*cluster_nr] - offset); 2988 while (cur <= page_end) { 2989 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2990 u64 extent_end = get_cluster_boundary_end(cluster, 2991 *cluster_nr) - offset; 2992 u64 clamped_start = max(page_start, extent_start); 2993 u64 clamped_end = min(page_end, extent_end); 2994 u32 clamped_len = clamped_end + 1 - clamped_start; 2995 2996 /* Reserve metadata for this range */ 2997 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2998 clamped_len); 2999 if (ret) 3000 goto release_page; 3001 3002 /* Mark the range delalloc and dirty for later writeback */ 3003 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); 3004 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 3005 clamped_end, 0, NULL); 3006 if (ret) { 3007 clear_extent_bits(&BTRFS_I(inode)->io_tree, 3008 clamped_start, clamped_end, 3009 EXTENT_LOCKED | EXTENT_BOUNDARY); 3010 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3011 clamped_len, true); 3012 btrfs_delalloc_release_extents(BTRFS_I(inode), 3013 clamped_len); 3014 goto release_page; 3015 } 3016 btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); 3017 3018 /* 3019 * Set the boundary if it's inside the page. 3020 * Data relocation requires the destination extents to have the 3021 * same size as the source. 3022 * EXTENT_BOUNDARY bit prevents current extent from being merged 3023 * with previous extent. 3024 */ 3025 if (in_range(cluster->boundary[*cluster_nr] - offset, 3026 page_start, PAGE_SIZE)) { 3027 u64 boundary_start = cluster->boundary[*cluster_nr] - 3028 offset; 3029 u64 boundary_end = boundary_start + 3030 fs_info->sectorsize - 1; 3031 3032 set_extent_bits(&BTRFS_I(inode)->io_tree, 3033 boundary_start, boundary_end, 3034 EXTENT_BOUNDARY); 3035 } 3036 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); 3037 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 3038 cur += clamped_len; 3039 3040 /* Crossed extent end, go to next extent */ 3041 if (cur >= extent_end) { 3042 (*cluster_nr)++; 3043 /* Just finished the last extent of the cluster, exit. */ 3044 if (*cluster_nr >= cluster->nr) 3045 break; 3046 } 3047 } 3048 unlock_page(page); 3049 put_page(page); 3050 3051 balance_dirty_pages_ratelimited(inode->i_mapping); 3052 btrfs_throttle(fs_info); 3053 if (btrfs_should_cancel_balance(fs_info)) 3054 ret = -ECANCELED; 3055 return ret; 3056 3057 release_page: 3058 unlock_page(page); 3059 put_page(page); 3060 return ret; 3061 } 3062 3063 static int relocate_file_extent_cluster(struct inode *inode, 3064 struct file_extent_cluster *cluster) 3065 { 3066 u64 offset = BTRFS_I(inode)->index_cnt; 3067 unsigned long index; 3068 unsigned long last_index; 3069 struct file_ra_state *ra; 3070 int cluster_nr = 0; 3071 int ret = 0; 3072 3073 if (!cluster->nr) 3074 return 0; 3075 3076 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3077 if (!ra) 3078 return -ENOMEM; 3079 3080 ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); 3081 if (ret) 3082 goto out; 3083 3084 file_ra_state_init(ra, inode->i_mapping); 3085 3086 ret = setup_relocation_extent_mapping(inode, cluster->start - offset, 3087 cluster->end - offset, cluster->start); 3088 if (ret) 3089 goto out; 3090 3091 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3092 for (index = (cluster->start - offset) >> PAGE_SHIFT; 3093 index <= last_index && !ret; index++) 3094 ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); 3095 if (ret == 0) 3096 WARN_ON(cluster_nr != cluster->nr); 3097 out: 3098 kfree(ra); 3099 return ret; 3100 } 3101 3102 static noinline_for_stack 3103 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3104 struct file_extent_cluster *cluster) 3105 { 3106 int ret; 3107 3108 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3109 ret = relocate_file_extent_cluster(inode, cluster); 3110 if (ret) 3111 return ret; 3112 cluster->nr = 0; 3113 } 3114 3115 if (!cluster->nr) 3116 cluster->start = extent_key->objectid; 3117 else 3118 BUG_ON(cluster->nr >= MAX_EXTENTS); 3119 cluster->end = extent_key->objectid + extent_key->offset - 1; 3120 cluster->boundary[cluster->nr] = extent_key->objectid; 3121 cluster->nr++; 3122 3123 if (cluster->nr >= MAX_EXTENTS) { 3124 ret = relocate_file_extent_cluster(inode, cluster); 3125 if (ret) 3126 return ret; 3127 cluster->nr = 0; 3128 } 3129 return 0; 3130 } 3131 3132 /* 3133 * helper to add a tree block to the list. 3134 * the major work is getting the generation and level of the block 3135 */ 3136 static int add_tree_block(struct reloc_control *rc, 3137 struct btrfs_key *extent_key, 3138 struct btrfs_path *path, 3139 struct rb_root *blocks) 3140 { 3141 struct extent_buffer *eb; 3142 struct btrfs_extent_item *ei; 3143 struct btrfs_tree_block_info *bi; 3144 struct tree_block *block; 3145 struct rb_node *rb_node; 3146 u32 item_size; 3147 int level = -1; 3148 u64 generation; 3149 u64 owner = 0; 3150 3151 eb = path->nodes[0]; 3152 item_size = btrfs_item_size_nr(eb, path->slots[0]); 3153 3154 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3155 item_size >= sizeof(*ei) + sizeof(*bi)) { 3156 unsigned long ptr = 0, end; 3157 3158 ei = btrfs_item_ptr(eb, path->slots[0], 3159 struct btrfs_extent_item); 3160 end = (unsigned long)ei + item_size; 3161 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3162 bi = (struct btrfs_tree_block_info *)(ei + 1); 3163 level = btrfs_tree_block_level(eb, bi); 3164 ptr = (unsigned long)(bi + 1); 3165 } else { 3166 level = (int)extent_key->offset; 3167 ptr = (unsigned long)(ei + 1); 3168 } 3169 generation = btrfs_extent_generation(eb, ei); 3170 3171 /* 3172 * We're reading random blocks without knowing their owner ahead 3173 * of time. This is ok most of the time, as all reloc roots and 3174 * fs roots have the same lock type. However normal trees do 3175 * not, and the only way to know ahead of time is to read the 3176 * inline ref offset. We know it's an fs root if 3177 * 3178 * 1. There's more than one ref. 3179 * 2. There's a SHARED_DATA_REF_KEY set. 3180 * 3. FULL_BACKREF is set on the flags. 3181 * 3182 * Otherwise it's safe to assume that the ref offset == the 3183 * owner of this block, so we can use that when calling 3184 * read_tree_block. 3185 */ 3186 if (btrfs_extent_refs(eb, ei) == 1 && 3187 !(btrfs_extent_flags(eb, ei) & 3188 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3189 ptr < end) { 3190 struct btrfs_extent_inline_ref *iref; 3191 int type; 3192 3193 iref = (struct btrfs_extent_inline_ref *)ptr; 3194 type = btrfs_get_extent_inline_ref_type(eb, iref, 3195 BTRFS_REF_TYPE_BLOCK); 3196 if (type == BTRFS_REF_TYPE_INVALID) 3197 return -EINVAL; 3198 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3199 owner = btrfs_extent_inline_ref_offset(eb, iref); 3200 } 3201 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3202 btrfs_print_v0_err(eb->fs_info); 3203 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3204 return -EINVAL; 3205 } else { 3206 BUG(); 3207 } 3208 3209 btrfs_release_path(path); 3210 3211 BUG_ON(level == -1); 3212 3213 block = kmalloc(sizeof(*block), GFP_NOFS); 3214 if (!block) 3215 return -ENOMEM; 3216 3217 block->bytenr = extent_key->objectid; 3218 block->key.objectid = rc->extent_root->fs_info->nodesize; 3219 block->key.offset = generation; 3220 block->level = level; 3221 block->key_ready = 0; 3222 block->owner = owner; 3223 3224 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3225 if (rb_node) 3226 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3227 -EEXIST); 3228 3229 return 0; 3230 } 3231 3232 /* 3233 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3234 */ 3235 static int __add_tree_block(struct reloc_control *rc, 3236 u64 bytenr, u32 blocksize, 3237 struct rb_root *blocks) 3238 { 3239 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3240 struct btrfs_path *path; 3241 struct btrfs_key key; 3242 int ret; 3243 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3244 3245 if (tree_block_processed(bytenr, rc)) 3246 return 0; 3247 3248 if (rb_simple_search(blocks, bytenr)) 3249 return 0; 3250 3251 path = btrfs_alloc_path(); 3252 if (!path) 3253 return -ENOMEM; 3254 again: 3255 key.objectid = bytenr; 3256 if (skinny) { 3257 key.type = BTRFS_METADATA_ITEM_KEY; 3258 key.offset = (u64)-1; 3259 } else { 3260 key.type = BTRFS_EXTENT_ITEM_KEY; 3261 key.offset = blocksize; 3262 } 3263 3264 path->search_commit_root = 1; 3265 path->skip_locking = 1; 3266 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3267 if (ret < 0) 3268 goto out; 3269 3270 if (ret > 0 && skinny) { 3271 if (path->slots[0]) { 3272 path->slots[0]--; 3273 btrfs_item_key_to_cpu(path->nodes[0], &key, 3274 path->slots[0]); 3275 if (key.objectid == bytenr && 3276 (key.type == BTRFS_METADATA_ITEM_KEY || 3277 (key.type == BTRFS_EXTENT_ITEM_KEY && 3278 key.offset == blocksize))) 3279 ret = 0; 3280 } 3281 3282 if (ret) { 3283 skinny = false; 3284 btrfs_release_path(path); 3285 goto again; 3286 } 3287 } 3288 if (ret) { 3289 ASSERT(ret == 1); 3290 btrfs_print_leaf(path->nodes[0]); 3291 btrfs_err(fs_info, 3292 "tree block extent item (%llu) is not found in extent tree", 3293 bytenr); 3294 WARN_ON(1); 3295 ret = -EINVAL; 3296 goto out; 3297 } 3298 3299 ret = add_tree_block(rc, &key, path, blocks); 3300 out: 3301 btrfs_free_path(path); 3302 return ret; 3303 } 3304 3305 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3306 struct btrfs_block_group *block_group, 3307 struct inode *inode, 3308 u64 ino) 3309 { 3310 struct btrfs_root *root = fs_info->tree_root; 3311 struct btrfs_trans_handle *trans; 3312 int ret = 0; 3313 3314 if (inode) 3315 goto truncate; 3316 3317 inode = btrfs_iget(fs_info->sb, ino, root); 3318 if (IS_ERR(inode)) 3319 return -ENOENT; 3320 3321 truncate: 3322 ret = btrfs_check_trunc_cache_free_space(fs_info, 3323 &fs_info->global_block_rsv); 3324 if (ret) 3325 goto out; 3326 3327 trans = btrfs_join_transaction(root); 3328 if (IS_ERR(trans)) { 3329 ret = PTR_ERR(trans); 3330 goto out; 3331 } 3332 3333 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3334 3335 btrfs_end_transaction(trans); 3336 btrfs_btree_balance_dirty(fs_info); 3337 out: 3338 iput(inode); 3339 return ret; 3340 } 3341 3342 /* 3343 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3344 * cache inode, to avoid free space cache data extent blocking data relocation. 3345 */ 3346 static int delete_v1_space_cache(struct extent_buffer *leaf, 3347 struct btrfs_block_group *block_group, 3348 u64 data_bytenr) 3349 { 3350 u64 space_cache_ino; 3351 struct btrfs_file_extent_item *ei; 3352 struct btrfs_key key; 3353 bool found = false; 3354 int i; 3355 int ret; 3356 3357 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3358 return 0; 3359 3360 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3361 u8 type; 3362 3363 btrfs_item_key_to_cpu(leaf, &key, i); 3364 if (key.type != BTRFS_EXTENT_DATA_KEY) 3365 continue; 3366 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3367 type = btrfs_file_extent_type(leaf, ei); 3368 3369 if ((type == BTRFS_FILE_EXTENT_REG || 3370 type == BTRFS_FILE_EXTENT_PREALLOC) && 3371 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3372 found = true; 3373 space_cache_ino = key.objectid; 3374 break; 3375 } 3376 } 3377 if (!found) 3378 return -ENOENT; 3379 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3380 space_cache_ino); 3381 return ret; 3382 } 3383 3384 /* 3385 * helper to find all tree blocks that reference a given data extent 3386 */ 3387 static noinline_for_stack 3388 int add_data_references(struct reloc_control *rc, 3389 struct btrfs_key *extent_key, 3390 struct btrfs_path *path, 3391 struct rb_root *blocks) 3392 { 3393 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3394 struct ulist *leaves = NULL; 3395 struct ulist_iterator leaf_uiter; 3396 struct ulist_node *ref_node = NULL; 3397 const u32 blocksize = fs_info->nodesize; 3398 int ret = 0; 3399 3400 btrfs_release_path(path); 3401 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, 3402 0, &leaves, NULL, true); 3403 if (ret < 0) 3404 return ret; 3405 3406 ULIST_ITER_INIT(&leaf_uiter); 3407 while ((ref_node = ulist_next(leaves, &leaf_uiter))) { 3408 struct extent_buffer *eb; 3409 3410 eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); 3411 if (IS_ERR(eb)) { 3412 ret = PTR_ERR(eb); 3413 break; 3414 } 3415 ret = delete_v1_space_cache(eb, rc->block_group, 3416 extent_key->objectid); 3417 free_extent_buffer(eb); 3418 if (ret < 0) 3419 break; 3420 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3421 if (ret < 0) 3422 break; 3423 } 3424 if (ret < 0) 3425 free_block_list(blocks); 3426 ulist_free(leaves); 3427 return ret; 3428 } 3429 3430 /* 3431 * helper to find next unprocessed extent 3432 */ 3433 static noinline_for_stack 3434 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3435 struct btrfs_key *extent_key) 3436 { 3437 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3438 struct btrfs_key key; 3439 struct extent_buffer *leaf; 3440 u64 start, end, last; 3441 int ret; 3442 3443 last = rc->block_group->start + rc->block_group->length; 3444 while (1) { 3445 cond_resched(); 3446 if (rc->search_start >= last) { 3447 ret = 1; 3448 break; 3449 } 3450 3451 key.objectid = rc->search_start; 3452 key.type = BTRFS_EXTENT_ITEM_KEY; 3453 key.offset = 0; 3454 3455 path->search_commit_root = 1; 3456 path->skip_locking = 1; 3457 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3458 0, 0); 3459 if (ret < 0) 3460 break; 3461 next: 3462 leaf = path->nodes[0]; 3463 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3464 ret = btrfs_next_leaf(rc->extent_root, path); 3465 if (ret != 0) 3466 break; 3467 leaf = path->nodes[0]; 3468 } 3469 3470 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3471 if (key.objectid >= last) { 3472 ret = 1; 3473 break; 3474 } 3475 3476 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3477 key.type != BTRFS_METADATA_ITEM_KEY) { 3478 path->slots[0]++; 3479 goto next; 3480 } 3481 3482 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3483 key.objectid + key.offset <= rc->search_start) { 3484 path->slots[0]++; 3485 goto next; 3486 } 3487 3488 if (key.type == BTRFS_METADATA_ITEM_KEY && 3489 key.objectid + fs_info->nodesize <= 3490 rc->search_start) { 3491 path->slots[0]++; 3492 goto next; 3493 } 3494 3495 ret = find_first_extent_bit(&rc->processed_blocks, 3496 key.objectid, &start, &end, 3497 EXTENT_DIRTY, NULL); 3498 3499 if (ret == 0 && start <= key.objectid) { 3500 btrfs_release_path(path); 3501 rc->search_start = end + 1; 3502 } else { 3503 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3504 rc->search_start = key.objectid + key.offset; 3505 else 3506 rc->search_start = key.objectid + 3507 fs_info->nodesize; 3508 memcpy(extent_key, &key, sizeof(key)); 3509 return 0; 3510 } 3511 } 3512 btrfs_release_path(path); 3513 return ret; 3514 } 3515 3516 static void set_reloc_control(struct reloc_control *rc) 3517 { 3518 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3519 3520 mutex_lock(&fs_info->reloc_mutex); 3521 fs_info->reloc_ctl = rc; 3522 mutex_unlock(&fs_info->reloc_mutex); 3523 } 3524 3525 static void unset_reloc_control(struct reloc_control *rc) 3526 { 3527 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3528 3529 mutex_lock(&fs_info->reloc_mutex); 3530 fs_info->reloc_ctl = NULL; 3531 mutex_unlock(&fs_info->reloc_mutex); 3532 } 3533 3534 static noinline_for_stack 3535 int prepare_to_relocate(struct reloc_control *rc) 3536 { 3537 struct btrfs_trans_handle *trans; 3538 int ret; 3539 3540 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3541 BTRFS_BLOCK_RSV_TEMP); 3542 if (!rc->block_rsv) 3543 return -ENOMEM; 3544 3545 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3546 rc->search_start = rc->block_group->start; 3547 rc->extents_found = 0; 3548 rc->nodes_relocated = 0; 3549 rc->merging_rsv_size = 0; 3550 rc->reserved_bytes = 0; 3551 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3552 RELOCATION_RESERVED_NODES; 3553 ret = btrfs_block_rsv_refill(rc->extent_root, 3554 rc->block_rsv, rc->block_rsv->size, 3555 BTRFS_RESERVE_FLUSH_ALL); 3556 if (ret) 3557 return ret; 3558 3559 rc->create_reloc_tree = 1; 3560 set_reloc_control(rc); 3561 3562 trans = btrfs_join_transaction(rc->extent_root); 3563 if (IS_ERR(trans)) { 3564 unset_reloc_control(rc); 3565 /* 3566 * extent tree is not a ref_cow tree and has no reloc_root to 3567 * cleanup. And callers are responsible to free the above 3568 * block rsv. 3569 */ 3570 return PTR_ERR(trans); 3571 } 3572 return btrfs_commit_transaction(trans); 3573 } 3574 3575 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3576 { 3577 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3578 struct rb_root blocks = RB_ROOT; 3579 struct btrfs_key key; 3580 struct btrfs_trans_handle *trans = NULL; 3581 struct btrfs_path *path; 3582 struct btrfs_extent_item *ei; 3583 u64 flags; 3584 int ret; 3585 int err = 0; 3586 int progress = 0; 3587 3588 path = btrfs_alloc_path(); 3589 if (!path) 3590 return -ENOMEM; 3591 path->reada = READA_FORWARD; 3592 3593 ret = prepare_to_relocate(rc); 3594 if (ret) { 3595 err = ret; 3596 goto out_free; 3597 } 3598 3599 while (1) { 3600 rc->reserved_bytes = 0; 3601 ret = btrfs_block_rsv_refill(rc->extent_root, 3602 rc->block_rsv, rc->block_rsv->size, 3603 BTRFS_RESERVE_FLUSH_ALL); 3604 if (ret) { 3605 err = ret; 3606 break; 3607 } 3608 progress++; 3609 trans = btrfs_start_transaction(rc->extent_root, 0); 3610 if (IS_ERR(trans)) { 3611 err = PTR_ERR(trans); 3612 trans = NULL; 3613 break; 3614 } 3615 restart: 3616 if (update_backref_cache(trans, &rc->backref_cache)) { 3617 btrfs_end_transaction(trans); 3618 trans = NULL; 3619 continue; 3620 } 3621 3622 ret = find_next_extent(rc, path, &key); 3623 if (ret < 0) 3624 err = ret; 3625 if (ret != 0) 3626 break; 3627 3628 rc->extents_found++; 3629 3630 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3631 struct btrfs_extent_item); 3632 flags = btrfs_extent_flags(path->nodes[0], ei); 3633 3634 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3635 ret = add_tree_block(rc, &key, path, &blocks); 3636 } else if (rc->stage == UPDATE_DATA_PTRS && 3637 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3638 ret = add_data_references(rc, &key, path, &blocks); 3639 } else { 3640 btrfs_release_path(path); 3641 ret = 0; 3642 } 3643 if (ret < 0) { 3644 err = ret; 3645 break; 3646 } 3647 3648 if (!RB_EMPTY_ROOT(&blocks)) { 3649 ret = relocate_tree_blocks(trans, rc, &blocks); 3650 if (ret < 0) { 3651 if (ret != -EAGAIN) { 3652 err = ret; 3653 break; 3654 } 3655 rc->extents_found--; 3656 rc->search_start = key.objectid; 3657 } 3658 } 3659 3660 btrfs_end_transaction_throttle(trans); 3661 btrfs_btree_balance_dirty(fs_info); 3662 trans = NULL; 3663 3664 if (rc->stage == MOVE_DATA_EXTENTS && 3665 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3666 rc->found_file_extent = 1; 3667 ret = relocate_data_extent(rc->data_inode, 3668 &key, &rc->cluster); 3669 if (ret < 0) { 3670 err = ret; 3671 break; 3672 } 3673 } 3674 if (btrfs_should_cancel_balance(fs_info)) { 3675 err = -ECANCELED; 3676 break; 3677 } 3678 } 3679 if (trans && progress && err == -ENOSPC) { 3680 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3681 if (ret == 1) { 3682 err = 0; 3683 progress = 0; 3684 goto restart; 3685 } 3686 } 3687 3688 btrfs_release_path(path); 3689 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 3690 3691 if (trans) { 3692 btrfs_end_transaction_throttle(trans); 3693 btrfs_btree_balance_dirty(fs_info); 3694 } 3695 3696 if (!err) { 3697 ret = relocate_file_extent_cluster(rc->data_inode, 3698 &rc->cluster); 3699 if (ret < 0) 3700 err = ret; 3701 } 3702 3703 rc->create_reloc_tree = 0; 3704 set_reloc_control(rc); 3705 3706 btrfs_backref_release_cache(&rc->backref_cache); 3707 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3708 3709 /* 3710 * Even in the case when the relocation is cancelled, we should all go 3711 * through prepare_to_merge() and merge_reloc_roots(). 3712 * 3713 * For error (including cancelled balance), prepare_to_merge() will 3714 * mark all reloc trees orphan, then queue them for cleanup in 3715 * merge_reloc_roots() 3716 */ 3717 err = prepare_to_merge(rc, err); 3718 3719 merge_reloc_roots(rc); 3720 3721 rc->merge_reloc_tree = 0; 3722 unset_reloc_control(rc); 3723 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3724 3725 /* get rid of pinned extents */ 3726 trans = btrfs_join_transaction(rc->extent_root); 3727 if (IS_ERR(trans)) { 3728 err = PTR_ERR(trans); 3729 goto out_free; 3730 } 3731 ret = btrfs_commit_transaction(trans); 3732 if (ret && !err) 3733 err = ret; 3734 out_free: 3735 ret = clean_dirty_subvols(rc); 3736 if (ret < 0 && !err) 3737 err = ret; 3738 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3739 btrfs_free_path(path); 3740 return err; 3741 } 3742 3743 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3744 struct btrfs_root *root, u64 objectid) 3745 { 3746 struct btrfs_path *path; 3747 struct btrfs_inode_item *item; 3748 struct extent_buffer *leaf; 3749 int ret; 3750 3751 path = btrfs_alloc_path(); 3752 if (!path) 3753 return -ENOMEM; 3754 3755 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3756 if (ret) 3757 goto out; 3758 3759 leaf = path->nodes[0]; 3760 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3761 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3762 btrfs_set_inode_generation(leaf, item, 1); 3763 btrfs_set_inode_size(leaf, item, 0); 3764 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3765 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3766 BTRFS_INODE_PREALLOC); 3767 btrfs_mark_buffer_dirty(leaf); 3768 out: 3769 btrfs_free_path(path); 3770 return ret; 3771 } 3772 3773 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3774 struct btrfs_root *root, u64 objectid) 3775 { 3776 struct btrfs_path *path; 3777 struct btrfs_key key; 3778 int ret = 0; 3779 3780 path = btrfs_alloc_path(); 3781 if (!path) { 3782 ret = -ENOMEM; 3783 goto out; 3784 } 3785 3786 key.objectid = objectid; 3787 key.type = BTRFS_INODE_ITEM_KEY; 3788 key.offset = 0; 3789 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3790 if (ret) { 3791 if (ret > 0) 3792 ret = -ENOENT; 3793 goto out; 3794 } 3795 ret = btrfs_del_item(trans, root, path); 3796 out: 3797 if (ret) 3798 btrfs_abort_transaction(trans, ret); 3799 btrfs_free_path(path); 3800 } 3801 3802 /* 3803 * helper to create inode for data relocation. 3804 * the inode is in data relocation tree and its link count is 0 3805 */ 3806 static noinline_for_stack 3807 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3808 struct btrfs_block_group *group) 3809 { 3810 struct inode *inode = NULL; 3811 struct btrfs_trans_handle *trans; 3812 struct btrfs_root *root; 3813 u64 objectid; 3814 int err = 0; 3815 3816 root = btrfs_grab_root(fs_info->data_reloc_root); 3817 trans = btrfs_start_transaction(root, 6); 3818 if (IS_ERR(trans)) { 3819 btrfs_put_root(root); 3820 return ERR_CAST(trans); 3821 } 3822 3823 err = btrfs_get_free_objectid(root, &objectid); 3824 if (err) 3825 goto out; 3826 3827 err = __insert_orphan_inode(trans, root, objectid); 3828 if (err) 3829 goto out; 3830 3831 inode = btrfs_iget(fs_info->sb, objectid, root); 3832 if (IS_ERR(inode)) { 3833 delete_orphan_inode(trans, root, objectid); 3834 err = PTR_ERR(inode); 3835 inode = NULL; 3836 goto out; 3837 } 3838 BTRFS_I(inode)->index_cnt = group->start; 3839 3840 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 3841 out: 3842 btrfs_put_root(root); 3843 btrfs_end_transaction(trans); 3844 btrfs_btree_balance_dirty(fs_info); 3845 if (err) { 3846 if (inode) 3847 iput(inode); 3848 inode = ERR_PTR(err); 3849 } 3850 return inode; 3851 } 3852 3853 /* 3854 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3855 * has been requested meanwhile and don't start in that case. 3856 * 3857 * Return: 3858 * 0 success 3859 * -EINPROGRESS operation is already in progress, that's probably a bug 3860 * -ECANCELED cancellation request was set before the operation started 3861 * -EAGAIN can not start because there are ongoing send operations 3862 */ 3863 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3864 { 3865 spin_lock(&fs_info->send_reloc_lock); 3866 if (fs_info->send_in_progress) { 3867 btrfs_warn_rl(fs_info, 3868 "cannot run relocation while send operations are in progress (%d in progress)", 3869 fs_info->send_in_progress); 3870 spin_unlock(&fs_info->send_reloc_lock); 3871 return -EAGAIN; 3872 } 3873 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3874 /* This should not happen */ 3875 spin_unlock(&fs_info->send_reloc_lock); 3876 btrfs_err(fs_info, "reloc already running, cannot start"); 3877 return -EINPROGRESS; 3878 } 3879 spin_unlock(&fs_info->send_reloc_lock); 3880 3881 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3882 btrfs_info(fs_info, "chunk relocation canceled on start"); 3883 /* 3884 * On cancel, clear all requests but let the caller mark 3885 * the end after cleanup operations. 3886 */ 3887 atomic_set(&fs_info->reloc_cancel_req, 0); 3888 return -ECANCELED; 3889 } 3890 return 0; 3891 } 3892 3893 /* 3894 * Mark end of chunk relocation that is cancellable and wake any waiters. 3895 */ 3896 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3897 { 3898 /* Requested after start, clear bit first so any waiters can continue */ 3899 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3900 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3901 spin_lock(&fs_info->send_reloc_lock); 3902 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3903 spin_unlock(&fs_info->send_reloc_lock); 3904 atomic_set(&fs_info->reloc_cancel_req, 0); 3905 } 3906 3907 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3908 { 3909 struct reloc_control *rc; 3910 3911 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3912 if (!rc) 3913 return NULL; 3914 3915 INIT_LIST_HEAD(&rc->reloc_roots); 3916 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3917 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); 3918 mapping_tree_init(&rc->reloc_root_tree); 3919 extent_io_tree_init(fs_info, &rc->processed_blocks, 3920 IO_TREE_RELOC_BLOCKS, NULL); 3921 return rc; 3922 } 3923 3924 static void free_reloc_control(struct reloc_control *rc) 3925 { 3926 struct mapping_node *node, *tmp; 3927 3928 free_reloc_roots(&rc->reloc_roots); 3929 rbtree_postorder_for_each_entry_safe(node, tmp, 3930 &rc->reloc_root_tree.rb_root, rb_node) 3931 kfree(node); 3932 3933 kfree(rc); 3934 } 3935 3936 /* 3937 * Print the block group being relocated 3938 */ 3939 static void describe_relocation(struct btrfs_fs_info *fs_info, 3940 struct btrfs_block_group *block_group) 3941 { 3942 char buf[128] = {'\0'}; 3943 3944 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3945 3946 btrfs_info(fs_info, 3947 "relocating block group %llu flags %s", 3948 block_group->start, buf); 3949 } 3950 3951 static const char *stage_to_string(int stage) 3952 { 3953 if (stage == MOVE_DATA_EXTENTS) 3954 return "move data extents"; 3955 if (stage == UPDATE_DATA_PTRS) 3956 return "update data pointers"; 3957 return "unknown"; 3958 } 3959 3960 /* 3961 * function to relocate all extents in a block group. 3962 */ 3963 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 3964 { 3965 struct btrfs_block_group *bg; 3966 struct btrfs_root *extent_root = fs_info->extent_root; 3967 struct reloc_control *rc; 3968 struct inode *inode; 3969 struct btrfs_path *path; 3970 int ret; 3971 int rw = 0; 3972 int err = 0; 3973 3974 bg = btrfs_lookup_block_group(fs_info, group_start); 3975 if (!bg) 3976 return -ENOENT; 3977 3978 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3979 btrfs_put_block_group(bg); 3980 return -ETXTBSY; 3981 } 3982 3983 rc = alloc_reloc_control(fs_info); 3984 if (!rc) { 3985 btrfs_put_block_group(bg); 3986 return -ENOMEM; 3987 } 3988 3989 ret = reloc_chunk_start(fs_info); 3990 if (ret < 0) { 3991 err = ret; 3992 goto out_put_bg; 3993 } 3994 3995 rc->extent_root = extent_root; 3996 rc->block_group = bg; 3997 3998 ret = btrfs_inc_block_group_ro(rc->block_group, true); 3999 if (ret) { 4000 err = ret; 4001 goto out; 4002 } 4003 rw = 1; 4004 4005 path = btrfs_alloc_path(); 4006 if (!path) { 4007 err = -ENOMEM; 4008 goto out; 4009 } 4010 4011 inode = lookup_free_space_inode(rc->block_group, path); 4012 btrfs_free_path(path); 4013 4014 if (!IS_ERR(inode)) 4015 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4016 else 4017 ret = PTR_ERR(inode); 4018 4019 if (ret && ret != -ENOENT) { 4020 err = ret; 4021 goto out; 4022 } 4023 4024 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4025 if (IS_ERR(rc->data_inode)) { 4026 err = PTR_ERR(rc->data_inode); 4027 rc->data_inode = NULL; 4028 goto out; 4029 } 4030 4031 describe_relocation(fs_info, rc->block_group); 4032 4033 btrfs_wait_block_group_reservations(rc->block_group); 4034 btrfs_wait_nocow_writers(rc->block_group); 4035 btrfs_wait_ordered_roots(fs_info, U64_MAX, 4036 rc->block_group->start, 4037 rc->block_group->length); 4038 4039 ret = btrfs_zone_finish(rc->block_group); 4040 WARN_ON(ret && ret != -EAGAIN); 4041 4042 while (1) { 4043 int finishes_stage; 4044 4045 mutex_lock(&fs_info->cleaner_mutex); 4046 ret = relocate_block_group(rc); 4047 mutex_unlock(&fs_info->cleaner_mutex); 4048 if (ret < 0) 4049 err = ret; 4050 4051 finishes_stage = rc->stage; 4052 /* 4053 * We may have gotten ENOSPC after we already dirtied some 4054 * extents. If writeout happens while we're relocating a 4055 * different block group we could end up hitting the 4056 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 4057 * btrfs_reloc_cow_block. Make sure we write everything out 4058 * properly so we don't trip over this problem, and then break 4059 * out of the loop if we hit an error. 4060 */ 4061 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 4062 ret = btrfs_wait_ordered_range(rc->data_inode, 0, 4063 (u64)-1); 4064 if (ret) 4065 err = ret; 4066 invalidate_mapping_pages(rc->data_inode->i_mapping, 4067 0, -1); 4068 rc->stage = UPDATE_DATA_PTRS; 4069 } 4070 4071 if (err < 0) 4072 goto out; 4073 4074 if (rc->extents_found == 0) 4075 break; 4076 4077 btrfs_info(fs_info, "found %llu extents, stage: %s", 4078 rc->extents_found, stage_to_string(finishes_stage)); 4079 } 4080 4081 WARN_ON(rc->block_group->pinned > 0); 4082 WARN_ON(rc->block_group->reserved > 0); 4083 WARN_ON(rc->block_group->used > 0); 4084 out: 4085 if (err && rw) 4086 btrfs_dec_block_group_ro(rc->block_group); 4087 iput(rc->data_inode); 4088 out_put_bg: 4089 btrfs_put_block_group(bg); 4090 reloc_chunk_end(fs_info); 4091 free_reloc_control(rc); 4092 return err; 4093 } 4094 4095 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) 4096 { 4097 struct btrfs_fs_info *fs_info = root->fs_info; 4098 struct btrfs_trans_handle *trans; 4099 int ret, err; 4100 4101 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4102 if (IS_ERR(trans)) 4103 return PTR_ERR(trans); 4104 4105 memset(&root->root_item.drop_progress, 0, 4106 sizeof(root->root_item.drop_progress)); 4107 btrfs_set_root_drop_level(&root->root_item, 0); 4108 btrfs_set_root_refs(&root->root_item, 0); 4109 ret = btrfs_update_root(trans, fs_info->tree_root, 4110 &root->root_key, &root->root_item); 4111 4112 err = btrfs_end_transaction(trans); 4113 if (err) 4114 return err; 4115 return ret; 4116 } 4117 4118 /* 4119 * recover relocation interrupted by system crash. 4120 * 4121 * this function resumes merging reloc trees with corresponding fs trees. 4122 * this is important for keeping the sharing of tree blocks 4123 */ 4124 int btrfs_recover_relocation(struct btrfs_root *root) 4125 { 4126 struct btrfs_fs_info *fs_info = root->fs_info; 4127 LIST_HEAD(reloc_roots); 4128 struct btrfs_key key; 4129 struct btrfs_root *fs_root; 4130 struct btrfs_root *reloc_root; 4131 struct btrfs_path *path; 4132 struct extent_buffer *leaf; 4133 struct reloc_control *rc = NULL; 4134 struct btrfs_trans_handle *trans; 4135 int ret; 4136 int err = 0; 4137 4138 path = btrfs_alloc_path(); 4139 if (!path) 4140 return -ENOMEM; 4141 path->reada = READA_BACK; 4142 4143 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 4144 key.type = BTRFS_ROOT_ITEM_KEY; 4145 key.offset = (u64)-1; 4146 4147 while (1) { 4148 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, 4149 path, 0, 0); 4150 if (ret < 0) { 4151 err = ret; 4152 goto out; 4153 } 4154 if (ret > 0) { 4155 if (path->slots[0] == 0) 4156 break; 4157 path->slots[0]--; 4158 } 4159 leaf = path->nodes[0]; 4160 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4161 btrfs_release_path(path); 4162 4163 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || 4164 key.type != BTRFS_ROOT_ITEM_KEY) 4165 break; 4166 4167 reloc_root = btrfs_read_tree_root(root, &key); 4168 if (IS_ERR(reloc_root)) { 4169 err = PTR_ERR(reloc_root); 4170 goto out; 4171 } 4172 4173 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); 4174 list_add(&reloc_root->root_list, &reloc_roots); 4175 4176 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 4177 fs_root = btrfs_get_fs_root(fs_info, 4178 reloc_root->root_key.offset, false); 4179 if (IS_ERR(fs_root)) { 4180 ret = PTR_ERR(fs_root); 4181 if (ret != -ENOENT) { 4182 err = ret; 4183 goto out; 4184 } 4185 ret = mark_garbage_root(reloc_root); 4186 if (ret < 0) { 4187 err = ret; 4188 goto out; 4189 } 4190 } else { 4191 btrfs_put_root(fs_root); 4192 } 4193 } 4194 4195 if (key.offset == 0) 4196 break; 4197 4198 key.offset--; 4199 } 4200 btrfs_release_path(path); 4201 4202 if (list_empty(&reloc_roots)) 4203 goto out; 4204 4205 rc = alloc_reloc_control(fs_info); 4206 if (!rc) { 4207 err = -ENOMEM; 4208 goto out; 4209 } 4210 4211 ret = reloc_chunk_start(fs_info); 4212 if (ret < 0) { 4213 err = ret; 4214 goto out_end; 4215 } 4216 4217 rc->extent_root = fs_info->extent_root; 4218 4219 set_reloc_control(rc); 4220 4221 trans = btrfs_join_transaction(rc->extent_root); 4222 if (IS_ERR(trans)) { 4223 err = PTR_ERR(trans); 4224 goto out_unset; 4225 } 4226 4227 rc->merge_reloc_tree = 1; 4228 4229 while (!list_empty(&reloc_roots)) { 4230 reloc_root = list_entry(reloc_roots.next, 4231 struct btrfs_root, root_list); 4232 list_del(&reloc_root->root_list); 4233 4234 if (btrfs_root_refs(&reloc_root->root_item) == 0) { 4235 list_add_tail(&reloc_root->root_list, 4236 &rc->reloc_roots); 4237 continue; 4238 } 4239 4240 fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 4241 false); 4242 if (IS_ERR(fs_root)) { 4243 err = PTR_ERR(fs_root); 4244 list_add_tail(&reloc_root->root_list, &reloc_roots); 4245 btrfs_end_transaction(trans); 4246 goto out_unset; 4247 } 4248 4249 err = __add_reloc_root(reloc_root); 4250 ASSERT(err != -EEXIST); 4251 if (err) { 4252 list_add_tail(&reloc_root->root_list, &reloc_roots); 4253 btrfs_put_root(fs_root); 4254 btrfs_end_transaction(trans); 4255 goto out_unset; 4256 } 4257 fs_root->reloc_root = btrfs_grab_root(reloc_root); 4258 btrfs_put_root(fs_root); 4259 } 4260 4261 err = btrfs_commit_transaction(trans); 4262 if (err) 4263 goto out_unset; 4264 4265 merge_reloc_roots(rc); 4266 4267 unset_reloc_control(rc); 4268 4269 trans = btrfs_join_transaction(rc->extent_root); 4270 if (IS_ERR(trans)) { 4271 err = PTR_ERR(trans); 4272 goto out_clean; 4273 } 4274 err = btrfs_commit_transaction(trans); 4275 out_clean: 4276 ret = clean_dirty_subvols(rc); 4277 if (ret < 0 && !err) 4278 err = ret; 4279 out_unset: 4280 unset_reloc_control(rc); 4281 out_end: 4282 reloc_chunk_end(fs_info); 4283 free_reloc_control(rc); 4284 out: 4285 free_reloc_roots(&reloc_roots); 4286 4287 btrfs_free_path(path); 4288 4289 if (err == 0) { 4290 /* cleanup orphan inode in data relocation tree */ 4291 fs_root = btrfs_grab_root(fs_info->data_reloc_root); 4292 ASSERT(fs_root); 4293 err = btrfs_orphan_cleanup(fs_root); 4294 btrfs_put_root(fs_root); 4295 } 4296 return err; 4297 } 4298 4299 /* 4300 * helper to add ordered checksum for data relocation. 4301 * 4302 * cloning checksum properly handles the nodatasum extents. 4303 * it also saves CPU time to re-calculate the checksum. 4304 */ 4305 int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) 4306 { 4307 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4308 struct btrfs_ordered_sum *sums; 4309 struct btrfs_ordered_extent *ordered; 4310 int ret; 4311 u64 disk_bytenr; 4312 u64 new_bytenr; 4313 LIST_HEAD(list); 4314 4315 ordered = btrfs_lookup_ordered_extent(inode, file_pos); 4316 BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); 4317 4318 disk_bytenr = file_pos + inode->index_cnt; 4319 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, 4320 disk_bytenr + len - 1, &list, 0); 4321 if (ret) 4322 goto out; 4323 4324 while (!list_empty(&list)) { 4325 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4326 list_del_init(&sums->list); 4327 4328 /* 4329 * We need to offset the new_bytenr based on where the csum is. 4330 * We need to do this because we will read in entire prealloc 4331 * extents but we may have written to say the middle of the 4332 * prealloc extent, so we need to make sure the csum goes with 4333 * the right disk offset. 4334 * 4335 * We can do this because the data reloc inode refers strictly 4336 * to the on disk bytes, so we don't have to worry about 4337 * disk_len vs real len like with real inodes since it's all 4338 * disk length. 4339 */ 4340 new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; 4341 sums->bytenr = new_bytenr; 4342 4343 btrfs_add_ordered_sum(ordered, sums); 4344 } 4345 out: 4346 btrfs_put_ordered_extent(ordered); 4347 return ret; 4348 } 4349 4350 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4351 struct btrfs_root *root, struct extent_buffer *buf, 4352 struct extent_buffer *cow) 4353 { 4354 struct btrfs_fs_info *fs_info = root->fs_info; 4355 struct reloc_control *rc; 4356 struct btrfs_backref_node *node; 4357 int first_cow = 0; 4358 int level; 4359 int ret = 0; 4360 4361 rc = fs_info->reloc_ctl; 4362 if (!rc) 4363 return 0; 4364 4365 BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); 4366 4367 level = btrfs_header_level(buf); 4368 if (btrfs_header_generation(buf) <= 4369 btrfs_root_last_snapshot(&root->root_item)) 4370 first_cow = 1; 4371 4372 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && 4373 rc->create_reloc_tree) { 4374 WARN_ON(!first_cow && level == 0); 4375 4376 node = rc->backref_cache.path[level]; 4377 BUG_ON(node->bytenr != buf->start && 4378 node->new_bytenr != buf->start); 4379 4380 btrfs_backref_drop_node_buffer(node); 4381 atomic_inc(&cow->refs); 4382 node->eb = cow; 4383 node->new_bytenr = cow->start; 4384 4385 if (!node->pending) { 4386 list_move_tail(&node->list, 4387 &rc->backref_cache.pending[level]); 4388 node->pending = 1; 4389 } 4390 4391 if (first_cow) 4392 mark_block_processed(rc, node); 4393 4394 if (first_cow && level > 0) 4395 rc->nodes_relocated += buf->len; 4396 } 4397 4398 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) 4399 ret = replace_file_extents(trans, rc, root, cow); 4400 return ret; 4401 } 4402 4403 /* 4404 * called before creating snapshot. it calculates metadata reservation 4405 * required for relocating tree blocks in the snapshot 4406 */ 4407 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, 4408 u64 *bytes_to_reserve) 4409 { 4410 struct btrfs_root *root = pending->root; 4411 struct reloc_control *rc = root->fs_info->reloc_ctl; 4412 4413 if (!rc || !have_reloc_root(root)) 4414 return; 4415 4416 if (!rc->merge_reloc_tree) 4417 return; 4418 4419 root = root->reloc_root; 4420 BUG_ON(btrfs_root_refs(&root->root_item) == 0); 4421 /* 4422 * relocation is in the stage of merging trees. the space 4423 * used by merging a reloc tree is twice the size of 4424 * relocated tree nodes in the worst case. half for cowing 4425 * the reloc tree, half for cowing the fs tree. the space 4426 * used by cowing the reloc tree will be freed after the 4427 * tree is dropped. if we create snapshot, cowing the fs 4428 * tree may use more space than it frees. so we need 4429 * reserve extra space. 4430 */ 4431 *bytes_to_reserve += rc->nodes_relocated; 4432 } 4433 4434 /* 4435 * called after snapshot is created. migrate block reservation 4436 * and create reloc root for the newly created snapshot 4437 * 4438 * This is similar to btrfs_init_reloc_root(), we come out of here with two 4439 * references held on the reloc_root, one for root->reloc_root and one for 4440 * rc->reloc_roots. 4441 */ 4442 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, 4443 struct btrfs_pending_snapshot *pending) 4444 { 4445 struct btrfs_root *root = pending->root; 4446 struct btrfs_root *reloc_root; 4447 struct btrfs_root *new_root; 4448 struct reloc_control *rc = root->fs_info->reloc_ctl; 4449 int ret; 4450 4451 if (!rc || !have_reloc_root(root)) 4452 return 0; 4453 4454 rc = root->fs_info->reloc_ctl; 4455 rc->merging_rsv_size += rc->nodes_relocated; 4456 4457 if (rc->merge_reloc_tree) { 4458 ret = btrfs_block_rsv_migrate(&pending->block_rsv, 4459 rc->block_rsv, 4460 rc->nodes_relocated, true); 4461 if (ret) 4462 return ret; 4463 } 4464 4465 new_root = pending->snap; 4466 reloc_root = create_reloc_root(trans, root->reloc_root, 4467 new_root->root_key.objectid); 4468 if (IS_ERR(reloc_root)) 4469 return PTR_ERR(reloc_root); 4470 4471 ret = __add_reloc_root(reloc_root); 4472 ASSERT(ret != -EEXIST); 4473 if (ret) { 4474 /* Pairs with create_reloc_root */ 4475 btrfs_put_root(reloc_root); 4476 return ret; 4477 } 4478 new_root->reloc_root = btrfs_grab_root(reloc_root); 4479 4480 if (rc->create_reloc_tree) 4481 ret = clone_backref_node(trans, rc, root, reloc_root); 4482 return ret; 4483 } 4484