// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 * 		Before			|		After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents		| BG A: deleted
 *  BG B:  2 data extents		| BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after the swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
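 *
 * For orientation, the flow can be sketched roughly as follows (a loose
 * paraphrase of the call graph, not verbatim code; only
 * relocate_block_group() and the MOVE_DATA_EXTENTS/UPDATE_DATA_PTRS stages
 * below are real names from this file):
 *
 *	relocate_block_group(rc)
 *	{
 *		mark rc->block_group read-only;			// step 1
 *		for (stage = MOVE_DATA_EXTENTS; stage <= UPDATE_DATA_PTRS; ) {
 *			record next extent in the block group;	// step 2.1
 *			build reloc trees and backref cache;	// step 2.2
 *			relocate the tree block or data extent;
 *		}
 *		merge reloc trees and clean them up;		// steps 2.3 + 3
 *	}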
 */

#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * present a tree block to process
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

static void remove_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node);

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bits(&rc->processed_blocks, node->bytenr,
				node->bytenr + blocksize - 1, EXTENT_DIRTY);
	}
	node->processed = 1;
}


static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

static void backref_cache_cleanup(struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int i;

	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct btrfs_backref_node, lower);
		remove_backref_node(cache, node);
	}

	cache->last_trans = 0;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		ASSERT(list_empty(&cache->pending[i]));
	ASSERT(list_empty(&cache->pending_edge));
	ASSERT(list_empty(&cache->useless_node));
	ASSERT(list_empty(&cache->changed));
	ASSERT(list_empty(&cache->detached));
	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
	ASSERT(!cache->nr_nodes);
	ASSERT(!cache->nr_edges);
}

static void free_backref_node(struct btrfs_backref_cache *cache,
			      struct btrfs_backref_node *node)
{
	if (node) {
		cache->nr_nodes--;
		btrfs_put_root(node->root);
		kfree(node);
	}
}

static struct btrfs_backref_edge *alloc_backref_edge(
		struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_edge *edge;

	edge = kzalloc(sizeof(*edge), GFP_NOFS);
	if (edge)
		cache->nr_edges++;
	return edge;
}

#define	LINK_LOWER	(1 << 0)
#define	LINK_UPPER	(1 << 1)
static void link_backref_edge(struct btrfs_backref_edge *edge,
			      struct btrfs_backref_node *lower,
			      struct btrfs_backref_node *upper,
			      int link_which)
{
	ASSERT(upper && lower && upper->level == lower->level + 1);
	edge->node[LOWER] = lower;
	edge->node[UPPER] = upper;
	if (link_which & LINK_LOWER)
		list_add_tail(&edge->list[LOWER], &lower->upper);
	if (link_which & LINK_UPPER)
		list_add_tail(&edge->list[UPPER], &upper->lower);
}

static void free_backref_edge(struct btrfs_backref_cache *cache,
			      struct btrfs_backref_edge *edge)
{
	if (edge) {
		cache->nr_edges--;
		kfree(edge);
	}
}

static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
{

	struct btrfs_fs_info *fs_info = NULL;
	struct btrfs_backref_node *bnode = rb_entry(rb_node,
			struct btrfs_backref_node, rb_node);
	if (bnode->root)
		fs_info = bnode->root->fs_info;
	btrfs_panic(fs_info, errno,
		    "Inconsistency in backref cache found at offset %llu",
		    bytenr);
}

/*
 * walk up backref nodes until reaching the node that presents the tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

static void unlock_node_buffer(struct btrfs_backref_node *node)
{
	if (node->locked) {
		btrfs_tree_unlock(node->eb);
		node->locked = 0;
	}
}

static void drop_node_buffer(struct btrfs_backref_node *node)
{
	if (node->eb) {
		unlock_node_buffer(node);
		free_extent_buffer(node->eb);
		node->eb = NULL;
	}
}

static void drop_backref_node(struct btrfs_backref_cache *tree,
			      struct btrfs_backref_node *node)
{
	BUG_ON(!list_empty(&node->upper));

	drop_node_buffer(node);
	list_del(&node->list);
	list_del(&node->lower);
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	free_backref_node(tree, node);
}

/*
 * remove a backref node from the backref cache
 */
static void remove_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct btrfs_backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		free_backref_edge(cache, edge);

		if (RB_EMPTY_NODE(&upper->rb_node)) {
			BUG_ON(!list_empty(&node->upper));
			drop_backref_node(cache, node);
			node = upper;
			node->lowest = 1;
			continue;
		}
		/*
		 * add the node to the leaf node list if no other
		 * child block is cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	drop_backref_node(cache, node);
}

static void update_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;
	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, bytenr);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * detached nodes are used to avoid unnecessary backref
	 * lookup. transaction commit changes the extent tree.
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * some nodes can be left in the pending list if there were
	 * errors during processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static bool reloc_root_is_dead(struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * Reloc tree after swap is considered dead, thus not considered as valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root.
 * But should_ignore_root() below is a special case.
 */
static bool have_reloc_root(struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

static int should_ignore_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * if there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return 1;
}
/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = (struct btrfs_root *)node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
				       u64 root_objectid)
{
	struct btrfs_key key;

	key.objectid = root_objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	return btrfs_get_fs_root(fs_info, &key, false);
}

/*
 * Handle direct tree backref
 *
 * Direct tree backref means, the backref item shows its parent bytenr
 * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined).
 *
 * @ref_key:	The converted backref key.
 *		For keyed backref, it's the item key.
 *		For inlined backref, objectid is the bytenr,
 *		type is btrfs_inline_ref_type, offset is
 *		btrfs_inline_ref_offset.
 */
static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
				      struct btrfs_key *ref_key,
				      struct btrfs_backref_node *cur)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *upper;
	struct rb_node *rb_node;

	ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY);

	/* Only reloc root uses backref pointing to itself */
	if (ref_key->objectid == ref_key->offset) {
		struct btrfs_root *root;

		cur->is_reloc_root = 1;
		/* Only reloc backref cache cares about a specific root */
		if (cache->is_reloc) {
			root = find_reloc_root(cache->fs_info, cur->bytenr);
			if (WARN_ON(!root))
				return -ENOENT;
			cur->root = root;
		} else {
			/*
			 * For generic purpose backref cache, reloc root node
			 * is useless.
			 */
			list_add(&cur->list, &cache->useless_node);
		}
		return 0;
	}

	edge = alloc_backref_edge(cache);
	if (!edge)
		return -ENOMEM;

	rb_node = rb_simple_search(&cache->rb_root, ref_key->offset);
	if (!rb_node) {
		/* Parent node not yet cached */
		upper = btrfs_backref_alloc_node(cache, ref_key->offset,
						 cur->level + 1);
		if (!upper) {
			free_backref_edge(cache, edge);
			return -ENOMEM;
		}

		/*
		 * Backrefs for the upper level block aren't cached, add the
		 * block to the pending list
		 */
		list_add_tail(&edge->list[UPPER], &cache->pending_edge);
	} else {
		/* Parent node already cached */
		upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		ASSERT(upper->checked);
		INIT_LIST_HEAD(&edge->list[UPPER]);
	}
	link_backref_edge(edge, cur, upper, LINK_LOWER);
	return 0;
}

/*
 * Handle indirect tree backref
 *
 * Indirect tree backref means, we only know which tree the node belongs to.
 * We still need to do a tree search to find out the parents. This is for
 * TREE_BLOCK_REF backref (keyed or inlined).
 *
 * @ref_key:	The same as @ref_key in handle_direct_tree_backref()
 * @tree_key:	The first key of this tree block.
 * @path:	A clean (released) path, to avoid allocating a path every
 *		time the function gets called.
 */
static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
					struct btrfs_path *path,
					struct btrfs_key *ref_key,
					struct btrfs_key *tree_key,
					struct btrfs_backref_node *cur)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_backref_node *upper;
	struct btrfs_backref_node *lower;
	struct btrfs_backref_edge *edge;
	struct extent_buffer *eb;
	struct btrfs_root *root;
	struct rb_node *rb_node;
	int level;
	bool need_check = true;
	int ret;

	root = read_fs_root(fs_info, ref_key->offset);
	if (IS_ERR(root))
		return PTR_ERR(root);
	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		cur->cowonly = 1;

	if (btrfs_root_level(&root->root_item) == cur->level) {
		/* Tree root */
		ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr);
		if (should_ignore_root(root)) {
			btrfs_put_root(root);
			list_add(&cur->list, &cache->useless_node);
		} else {
			cur->root = root;
		}
		return 0;
	}

	level = cur->level + 1;

	/* Search the tree to find parent blocks referring to the block */
	path->search_commit_root = 1;
	path->skip_locking = 1;
	path->lowest_level = level;
	ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0);
	path->lowest_level = 0;
	if (ret < 0) {
		btrfs_put_root(root);
		return ret;
	}
	if (ret > 0 && path->slots[level] > 0)
		path->slots[level]--;

	eb = path->nodes[level];
	if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) {
		btrfs_err(fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
			  cur->bytenr, level - 1, root->root_key.objectid,
			  tree_key->objectid, tree_key->type, tree_key->offset);
		btrfs_put_root(root);
		ret = -ENOENT;
		goto out;
	}
	lower = cur;

	/* Add all nodes and edges in the path */
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level]) {
			ASSERT(btrfs_root_bytenr(&root->root_item) ==
			       lower->bytenr);
			if (should_ignore_root(root)) {
				btrfs_put_root(root);
				list_add(&lower->list, &cache->useless_node);
			} else {
				lower->root = root;
			}
			break;
		}

		edge = alloc_backref_edge(cache);
		if (!edge) {
			btrfs_put_root(root);
			ret = -ENOMEM;
			goto out;
		}

		eb = path->nodes[level];
		rb_node = rb_simple_search(&cache->rb_root, eb->start);
		if (!rb_node) {
			upper = btrfs_backref_alloc_node(cache, eb->start,
							 lower->level + 1);
			if (!upper) {
				btrfs_put_root(root);
				free_backref_edge(cache, edge);
				ret = -ENOMEM;
				goto out;
			}
			upper->owner = btrfs_header_owner(eb);
			if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
				upper->cowonly = 1;

			/*
			 * If we know the block isn't shared we can avoid
			 * checking its backrefs.
			 */
			if (btrfs_block_can_be_shared(root, eb))
				upper->checked = 0;
			else
				upper->checked = 1;

			/*
			 * Add the block to pending list if we need to check
			 * its backrefs, we only do this once while walking up
			 * a tree as we will catch anything else later on.
			 */
			if (!upper->checked && need_check) {
				need_check = false;
				list_add_tail(&edge->list[UPPER],
					      &cache->pending_edge);
			} else {
				if (upper->checked)
					need_check = true;
				INIT_LIST_HEAD(&edge->list[UPPER]);
			}
		} else {
			upper = rb_entry(rb_node, struct btrfs_backref_node,
					 rb_node);
			ASSERT(upper->checked);
			INIT_LIST_HEAD(&edge->list[UPPER]);
			if (!upper->owner)
				upper->owner = btrfs_header_owner(eb);
		}
		link_backref_edge(edge, lower, upper, LINK_LOWER);

		if (rb_node) {
			btrfs_put_root(root);
			break;
		}
		lower = upper;
		upper = NULL;
	}
out:
	btrfs_release_path(path);
	return ret;
}

static int handle_one_tree_block(struct btrfs_backref_cache *cache,
				 struct btrfs_path *path,
				 struct btrfs_backref_iter *iter,
				 struct btrfs_key *node_key,
				 struct btrfs_backref_node *cur)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *exist;
	int ret;

	ret = btrfs_backref_iter_start(iter, cur->bytenr);
	if (ret < 0)
		return ret;
	/*
	 * We skip the first btrfs_tree_block_info, as we don't use the key
	 * stored in it, but fetch it from the tree block
	 */
	if (btrfs_backref_has_tree_block_info(iter)) {
		ret = btrfs_backref_iter_next(iter);
		if (ret < 0)
			goto out;
		/* No extra backref? This means the tree block is corrupted */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	WARN_ON(cur->checked);
	if (!list_empty(&cur->upper)) {
		/*
		 * the backref was added previously when processing
		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
		 */
		ASSERT(list_is_singular(&cur->upper));
		edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
				  list[LOWER]);
		ASSERT(list_empty(&edge->list[UPPER]));
		exist = edge->node[UPPER];
		/*
		 * add the upper level block to the pending list if we need
		 * to check its backrefs
		 */
		if (!exist->checked)
			list_add_tail(&edge->list[UPPER], &cache->pending_edge);
	} else {
		exist = NULL;
	}

	for (; ret == 0; ret = btrfs_backref_iter_next(iter)) {
		struct extent_buffer *eb;
		struct btrfs_key key;
		int type;

		cond_resched();
		eb = btrfs_backref_get_eb(iter);

		key.objectid = iter->bytenr;
		if (btrfs_backref_iter_is_inline_ref(iter)) {
			struct btrfs_extent_inline_ref *iref;

			/* update key for inline back ref */
			iref = (struct btrfs_extent_inline_ref *)
				((unsigned long)iter->cur_ptr);
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID) {
				ret = -EUCLEAN;
				goto out;
			}
			key.type = type;
			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
		} else {
			key.type = iter->cur_key.type;
			key.offset = iter->cur_key.offset;
		}

		/*
		 * Parent node found and matches current inline ref, no need to
		 * rebuild this node for this inline ref.
		 */
		if (exist &&
		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
		      exist->owner == key.offset) ||
		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
		      exist->bytenr == key.offset))) {
			exist = NULL;
			continue;
		}

		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
			ret = handle_direct_tree_backref(cache, &key, cur);
			if (ret < 0)
				goto out;
			continue;
		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto out;
		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
			continue;
		}

		/*
		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
		 * means the root objectid. We need to search the tree to get
		 * its parent bytenr.
		 */
		ret = handle_indirect_tree_backref(cache, path, &key, node_key,
						   cur);
		if (ret < 0)
			goto out;
	}
	ret = 0;
	cur->checked = 1;
	WARN_ON(exist);
out:
	btrfs_backref_iter_release(iter);
	return ret;
}

/*
 * In handle_one_tree_block(), we have only linked the lower node to the edge,
 * but the upper node hasn't been linked to the edge.
 * This means we can only iterate through btrfs_backref_node::upper to reach
 * parent edges, but not through btrfs_backref_node::lower to reach children
 * edges.
 *
 * This function will finish linking btrfs_backref_node::lower to the related
 * edges, so that the backref cache can be iterated in both directions.
 *
 * Also, this will add the nodes to the backref cache for the next run.
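 *
 * As an illustration (a sketch of the invariant, not code found elsewhere
 * in this file), a fully linked edge between @lower and @upper satisfies:
 *
 *	edge->node[LOWER] == lower;
 *	edge->node[UPPER] == upper;
 *	edge->list[LOWER] is linked into lower->upper;	// by link_backref_edge()
 *	edge->list[UPPER] is linked into upper->lower;	// finished here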
 */
static int finish_upper_links(struct btrfs_backref_cache *cache,
			      struct btrfs_backref_node *start)
{
	struct list_head *useless_node = &cache->useless_node;
	struct btrfs_backref_edge *edge;
	struct rb_node *rb_node;
	LIST_HEAD(pending_edge);

	ASSERT(start->checked);

	/* Insert this node to cache if it's not COW-only */
	if (!start->cowonly) {
		rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
					   &start->rb_node);
		if (rb_node)
			backref_tree_panic(rb_node, -EEXIST, start->bytenr);
		list_add_tail(&start->lower, &cache->leaves);
	}

	/*
	 * Use breadth first search to iterate all related edges.
	 *
	 * The starting points are all the edges of this node
	 */
	list_for_each_entry(edge, &start->upper, list[LOWER])
		list_add_tail(&edge->list[UPPER], &pending_edge);

	while (!list_empty(&pending_edge)) {
		struct btrfs_backref_node *upper;
		struct btrfs_backref_node *lower;
		struct rb_node *rb_node;

		edge = list_first_entry(&pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		upper = edge->node[UPPER];
		lower = edge->node[LOWER];

		/* Parent is detached, no need to keep any edges */
		if (upper->detached) {
			list_del(&edge->list[LOWER]);
			free_backref_edge(cache, edge);

			/* Lower node is orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
			continue;
		}

		/*
		 * All new nodes added in current build_backref_tree() haven't
		 * been linked to the cache rb tree.
		 * So if we have upper->rb_node populated, this means a cache
		 * hit. We only need to link the edge, as @upper and all its
		 * parents have already been linked.
		 */
		if (!RB_EMPTY_NODE(&upper->rb_node)) {
			if (upper->lowest) {
				list_del_init(&upper->lower);
				upper->lowest = 0;
			}

			list_add_tail(&edge->list[UPPER], &upper->lower);
			continue;
		}

		/* Sanity check, we shouldn't have any unchecked nodes */
		if (!upper->checked) {
			ASSERT(0);
			return -EUCLEAN;
		}

		/* Sanity check, COW-only node has non-COW-only parent */
		if (start->cowonly != upper->cowonly) {
			ASSERT(0);
			return -EUCLEAN;
		}

		/* Only cache non-COW-only (subvolume trees) tree blocks */
		if (!upper->cowonly) {
			rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
						   &upper->rb_node);
			if (rb_node) {
				backref_tree_panic(rb_node, -EEXIST,
						   upper->bytenr);
				return -EUCLEAN;
			}
		}

		list_add_tail(&edge->list[UPPER], &upper->lower);

		/*
		 * Also queue all the parent edges of this uncached node to
		 * finish the upper linkage
		 */
		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER], &pending_edge);
	}
	return 0;
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If the child node is also orphan (no parent) during cleanup, then the
 *   child node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeping non-leaf nodes detached
 *   For non-leaf nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					  struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			free_backref_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			free_backref_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block. Root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when the tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
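 *
 * An illustrative (hypothetical) shape of the resulting backref tree, for a
 * tree block shared by two subvolumes:
 *
 *	root of subvol A    root of subvol B	<- leaves: roots of b-trees
 *	          \            /
 *	        shared node (level n+1)
 *	               |
 *	     tree block @bytenr (level n)	<- root of the backref tree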
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *upper;
	struct btrfs_backref_node *lower;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = handle_one_tree_block(cache, path, iter, node_key, cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		while (!list_empty(&cache->useless_node)) {
			lower = list_first_entry(&cache->useless_node,
					struct btrfs_backref_node, list);
			list_del_init(&lower->list);
		}
		while (!list_empty(&cache->pending_edge)) {
			edge = list_first_entry(&cache->pending_edge,
					struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			upper = edge->node[UPPER];
			free_backref_edge(cache, edge);

			/*
			 * Lower is no longer linked to any upper backref nodes
			 * and isn't in the cache, we can free it ourselves.
			 */
			if (list_empty(&lower->upper) &&
			    RB_EMPTY_NODE(&lower->rb_node))
				list_add(&lower->list, &cache->useless_node);

			if (!RB_EMPTY_NODE(&upper->rb_node))
				continue;

			/* Add this guy's upper edges to the list to process */
			list_for_each_entry(edge, &upper->upper, list[LOWER])
				list_add_tail(&edge->list[UPPER],
					      &cache->pending_edge);
			if (list_empty(&upper->upper))
				list_add(&upper->list, &cache->useless_node);
		}

		while (!list_empty(&cache->useless_node)) {
			lower = list_first_entry(&cache->useless_node,
					struct btrfs_backref_node, list);
			list_del_init(&lower->list);
			if (lower == node)
				node = NULL;
			free_backref_node(cache, lower);
		}

		remove_backref_node(cache, node);
		ASSERT(list_empty(&cache->useless_node) &&
		       list_empty(&cache->pending_edge));
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * helper to add backref node for the newly created snapshot.
 * the backref node is created by cloning backref node that
 * corresponds to root of source tree
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = alloc_backref_edge(cache);
			if (!new_edge)
				goto fail;

			link_backref_edge(new_edge, edge->node[LOWER], new_node,
					  LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		free_backref_edge(cache, new_edge);
	}
	free_backref_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_panic(fs_info, -EEXIST,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		if (!node)
			return;
		BUG_ON((struct btrfs_root *)node->data != root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
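	 *
	 * A simplified sketch of that splice pattern (paraphrasing callers
	 * such as merge_reloc_roots(), not verbatim code):
	 *
	 *	LIST_HEAD(reloc_roots);
	 *	list_splice_init(&rc->reloc_roots, &reloc_roots);
	 *	while (!list_empty(&reloc_roots)) {
	 *		root = list_first_entry(&reloc_roots, ...);
	 *		// process root; the reference belongs to this loop
	 *		list_add(&root->root_list, &rc->reloc_roots);
	 *	}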
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	BUG_ON(!root_item);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
	}

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		root_item->drop_level = 0;
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	BUG_ON(ret);
	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	BUG_ON(IS_ERR(reloc_root));
	set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
}

/*
 * create reloc tree for a given fs tree. reloc tree is just a
 * snapshot of the fs tree with special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		goto out;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	BUG_ON(ret);
	btrfs_put_root(reloc_root);
out:
	return 0;
}

/*
 * helper to find first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying block in fs tree, wait for readpage
		 * to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end);
				if (!ret)
					continue;

				btrfs_drop_extent_cache(BTRFS_I(inode),
						key.offset, end, 1);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, the level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
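 *
 * the core swap done below can be sketched as (simplified, not verbatim):
 *
 *	before:  fs parent[slot] -> old_bytenr;  reloc node[slot'] -> new_bytenr
 *	after:   fs parent[slot] -> new_bytenr;  reloc node[slot'] -> old_bytenr
 *
 * followed by two btrfs_inc_extent_ref() and two btrfs_free_extent() calls
 * so each tree block carries correct backrefs for its new parent.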
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	btrfs_set_lock_blocking_write(eb);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
		BUG_ON(ret);
	}
	btrfs_set_lock_blocking_write(eb);

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		struct btrfs_key first_key;

		level = btrfs_header_level(parent);
		BUG_ON(level < lowest_level);

		ret = btrfs_bin_search(parent, &key, level, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(parent, &first_key, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
					     level - 1, &first_key);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			} else if (!extent_buffer_uptodate(eb)) {
				ret = -EIO;
				free_extent_buffer(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb);
				BUG_ON(ret);
			}
			btrfs_set_lock_blocking_write(eb);

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		ret = btrfs_search_slot(trans, src, &key, path,
0, 1); 1983 path->lowest_level = 0; 1984 BUG_ON(ret); 1985 1986 /* 1987 * Inform qgroup to trace both subtrees. 1988 * 1989 * We must trace both trees. 1990 * 1) Tree reloc subtree 1991 * If not traced, we will leak data numbers 1992 * 2) Fs subtree 1993 * If not traced, we will double count old data 1994 * 1995 * We don't scan the subtree right now, but only record 1996 * the swapped tree blocks. 1997 * The real subtree rescan is delayed until we have new 1998 * CoW on the subtree root node before transaction commit. 1999 */ 2000 ret = btrfs_qgroup_add_swapped_blocks(trans, dest, 2001 rc->block_group, parent, slot, 2002 path->nodes[level], path->slots[level], 2003 last_snapshot); 2004 if (ret < 0) 2005 break; 2006 /* 2007 * Swap blocks in the fs tree and reloc tree. 2008 */ 2009 btrfs_set_node_blockptr(parent, slot, new_bytenr); 2010 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); 2011 btrfs_mark_buffer_dirty(parent); 2012 2013 btrfs_set_node_blockptr(path->nodes[level], 2014 path->slots[level], old_bytenr); 2015 btrfs_set_node_ptr_generation(path->nodes[level], 2016 path->slots[level], old_ptr_gen); 2017 btrfs_mark_buffer_dirty(path->nodes[level]); 2018 2019 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, 2020 blocksize, path->nodes[level]->start); 2021 ref.skip_qgroup = true; 2022 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); 2023 ret = btrfs_inc_extent_ref(trans, &ref); 2024 BUG_ON(ret); 2025 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, 2026 blocksize, 0); 2027 ref.skip_qgroup = true; 2028 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); 2029 ret = btrfs_inc_extent_ref(trans, &ref); 2030 BUG_ON(ret); 2031 2032 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, 2033 blocksize, path->nodes[level]->start); 2034 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); 2035 ref.skip_qgroup = true; 2036 ret = btrfs_free_extent(trans, &ref); 2037 BUG_ON(ret); 2038 2039 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, 2040 blocksize, 0); 2041 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); 2042 ref.skip_qgroup = true; 2043 ret = btrfs_free_extent(trans, &ref); 2044 BUG_ON(ret); 2045 2046 btrfs_unlock_up_safe(path, 0); 2047 2048 ret = level; 2049 break; 2050 } 2051 btrfs_tree_unlock(parent); 2052 free_extent_buffer(parent); 2053 return ret; 2054 } 2055 2056 /* 2057 * helper to find the next relocated block in the reloc tree 2058 */ 2059 static noinline_for_stack 2060 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 2061 int *level) 2062 { 2063 struct extent_buffer *eb; 2064 int i; 2065 u64 last_snapshot; 2066 u32 nritems; 2067 2068 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2069 2070 for (i = 0; i < *level; i++) { 2071 free_extent_buffer(path->nodes[i]); 2072 path->nodes[i] = NULL; 2073 } 2074 2075 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 2076 eb = path->nodes[i]; 2077 nritems = btrfs_header_nritems(eb); 2078 while (path->slots[i] + 1 < nritems) { 2079 path->slots[i]++; 2080 if (btrfs_node_ptr_generation(eb, path->slots[i]) <= 2081 last_snapshot) 2082 continue; 2083 2084 *level = i; 2085 return 0; 2086 } 2087 free_extent_buffer(path->nodes[i]); 2088 path->nodes[i] = NULL; 2089 } 2090 return 1; 2091 } 2092 2093 /* 2094 * walk down the reloc tree to find the relocated block at the lowest level 2095 */ 2096 static noinline_for_stack 2097 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 2098 int *level) 2099 {
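	/*
	 * Note: only child pointers with generation > last_snapshot were
	 * COWed into the reloc tree after it was created, i.e. they point
	 * to relocated blocks. Pointers at or below last_snapshot still
	 * reference blocks shared with the fs tree, so both this walk down
	 * and the walk up above skip them.
	 */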
2100 struct btrfs_fs_info *fs_info = root->fs_info; 2101 struct extent_buffer *eb = NULL; 2102 int i; 2103 u64 bytenr; 2104 u64 ptr_gen = 0; 2105 u64 last_snapshot; 2106 u32 nritems; 2107 2108 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2109 2110 for (i = *level; i > 0; i--) { 2111 struct btrfs_key first_key; 2112 2113 eb = path->nodes[i]; 2114 nritems = btrfs_header_nritems(eb); 2115 while (path->slots[i] < nritems) { 2116 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); 2117 if (ptr_gen > last_snapshot) 2118 break; 2119 path->slots[i]++; 2120 } 2121 if (path->slots[i] >= nritems) { 2122 if (i == *level) 2123 break; 2124 *level = i + 1; 2125 return 0; 2126 } 2127 if (i == 1) { 2128 *level = i; 2129 return 0; 2130 } 2131 2132 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2133 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); 2134 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, 2135 &first_key); 2136 if (IS_ERR(eb)) { 2137 return PTR_ERR(eb); 2138 } else if (!extent_buffer_uptodate(eb)) { 2139 free_extent_buffer(eb); 2140 return -EIO; 2141 } 2142 BUG_ON(btrfs_header_level(eb) != i - 1); 2143 path->nodes[i - 1] = eb; 2144 path->slots[i - 1] = 0; 2145 } 2146 return 1; 2147 } 2148 2149 /* 2150 * invalidate the extent cache for file extents whose keys are in the 2151 * range [min_key, max_key) 2152 */ 2153 static int invalidate_extent_cache(struct btrfs_root *root, 2154 struct btrfs_key *min_key, 2155 struct btrfs_key *max_key) 2156 { 2157 struct btrfs_fs_info *fs_info = root->fs_info; 2158 struct inode *inode = NULL; 2159 u64 objectid; 2160 u64 start, end; 2161 u64 ino; 2162 2163 objectid = min_key->objectid; 2164 while (1) { 2165 cond_resched(); 2166 iput(inode); 2167 2168 if (objectid > max_key->objectid) 2169 break; 2170 2171 inode = find_next_inode(root, objectid); 2172 if (!inode) 2173 break; 2174 ino = btrfs_ino(BTRFS_I(inode)); 2175 2176 if (ino > max_key->objectid) { 2177 iput(inode); 2178 break; 2179 } 2180 2181 objectid = ino + 1; 2182 if (!S_ISREG(inode->i_mode)) 2183 continue; 2184 2185 if (unlikely(min_key->objectid == ino)) { 2186 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 2187 continue; 2188 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 2189 start = 0; 2190 else { 2191 start = min_key->offset; 2192 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); 2193 } 2194 } else { 2195 start = 0; 2196 } 2197 2198 if (unlikely(max_key->objectid == ino)) { 2199 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 2200 continue; 2201 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 2202 end = (u64)-1; 2203 } else { 2204 if (max_key->offset == 0) 2205 continue; 2206 end = max_key->offset; 2207 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 2208 end--; 2209 } 2210 } else { 2211 end = (u64)-1; 2212 } 2213 2214 /* lock_extent() waits for readpage to complete */ 2215 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2216 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); 2217 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2218 } 2219 return 0; 2220 } 2221 2222 static int find_next_key(struct btrfs_path *path, int level, 2223 struct btrfs_key *key) 2224 2225 { 2226 while (level < BTRFS_MAX_LEVEL) { 2227 if (!path->nodes[level]) 2228 break; 2229 if (path->slots[level] + 1 < 2230 btrfs_header_nritems(path->nodes[level])) { 2231 btrfs_node_key_to_cpu(path->nodes[level], key, 2232 path->slots[level] + 1); 2233 return 0; 2234 } 2235 level++; 2236 } 2237 return 1; 2238 } 2239 2240 /* 2241 * Insert current subvolume into reloc_control::dirty_subvol_roots 2242 */ 2243 static void
insert_dirty_subvol(struct btrfs_trans_handle *trans, 2244 struct reloc_control *rc, 2245 struct btrfs_root *root) 2246 { 2247 struct btrfs_root *reloc_root = root->reloc_root; 2248 struct btrfs_root_item *reloc_root_item; 2249 2250 /* @root must be a subvolume tree root with a valid reloc tree */ 2251 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 2252 ASSERT(reloc_root); 2253 2254 reloc_root_item = &reloc_root->root_item; 2255 memset(&reloc_root_item->drop_progress, 0, 2256 sizeof(reloc_root_item->drop_progress)); 2257 reloc_root_item->drop_level = 0; 2258 btrfs_set_root_refs(reloc_root_item, 0); 2259 btrfs_update_reloc_root(trans, root); 2260 2261 if (list_empty(&root->reloc_dirty_list)) { 2262 btrfs_grab_root(root); 2263 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 2264 } 2265 } 2266 2267 static int clean_dirty_subvols(struct reloc_control *rc) 2268 { 2269 struct btrfs_root *root; 2270 struct btrfs_root *next; 2271 int ret = 0; 2272 int ret2; 2273 2274 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 2275 reloc_dirty_list) { 2276 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 2277 /* Merged subvolume, cleanup its reloc root */ 2278 struct btrfs_root *reloc_root = root->reloc_root; 2279 2280 list_del_init(&root->reloc_dirty_list); 2281 root->reloc_root = NULL; 2282 /* 2283 * Need barrier to ensure clear_bit() only happens after 2284 * root->reloc_root = NULL. Pairs with have_reloc_root. 2285 */ 2286 smp_wmb(); 2287 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 2288 if (reloc_root) { 2289 /* 2290 * btrfs_drop_snapshot drops our ref we hold for 2291 * ->reloc_root. If it fails however we must 2292 * drop the ref ourselves. 2293 */ 2294 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); 2295 if (ret2 < 0) { 2296 btrfs_put_root(reloc_root); 2297 if (!ret) 2298 ret = ret2; 2299 } 2300 } 2301 btrfs_put_root(root); 2302 } else { 2303 /* Orphan reloc tree, just clean it up */ 2304 ret2 = btrfs_drop_snapshot(root, 0, 1); 2305 if (ret2 < 0) { 2306 btrfs_put_root(root); 2307 if (!ret) 2308 ret = ret2; 2309 } 2310 } 2311 } 2312 return ret; 2313 } 2314 2315 /* 2316 * merge the relocated tree blocks in reloc tree with corresponding 2317 * fs tree. 
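 *
 * Roughly: walk down the reloc tree to the lowest unmerged level
 * (walk_down_reloc_tree), swap the relocated blocks into the fs tree via
 * replace_path(), then walk back up (walk_up_reloc_tree) and repeat.
 * Between transactions the position is saved in the reloc root's
 * drop_progress/drop_level, so an interrupted merge can be resumed.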
2318 */ 2319 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 2320 struct btrfs_root *root) 2321 { 2322 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2323 struct btrfs_key key; 2324 struct btrfs_key next_key; 2325 struct btrfs_trans_handle *trans = NULL; 2326 struct btrfs_root *reloc_root; 2327 struct btrfs_root_item *root_item; 2328 struct btrfs_path *path; 2329 struct extent_buffer *leaf; 2330 int level; 2331 int max_level; 2332 int replaced = 0; 2333 int ret; 2334 int err = 0; 2335 u32 min_reserved; 2336 2337 path = btrfs_alloc_path(); 2338 if (!path) 2339 return -ENOMEM; 2340 path->reada = READA_FORWARD; 2341 2342 reloc_root = root->reloc_root; 2343 root_item = &reloc_root->root_item; 2344 2345 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 2346 level = btrfs_root_level(root_item); 2347 atomic_inc(&reloc_root->node->refs); 2348 path->nodes[level] = reloc_root->node; 2349 path->slots[level] = 0; 2350 } else { 2351 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 2352 2353 level = root_item->drop_level; 2354 BUG_ON(level == 0); 2355 path->lowest_level = level; 2356 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 2357 path->lowest_level = 0; 2358 if (ret < 0) { 2359 btrfs_free_path(path); 2360 return ret; 2361 } 2362 2363 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 2364 path->slots[level]); 2365 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 2366 2367 btrfs_unlock_up_safe(path, 0); 2368 } 2369 2370 min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2371 memset(&next_key, 0, sizeof(next_key)); 2372 2373 while (1) { 2374 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 2375 BTRFS_RESERVE_FLUSH_ALL); 2376 if (ret) { 2377 err = ret; 2378 goto out; 2379 } 2380 trans = btrfs_start_transaction(root, 0); 2381 if (IS_ERR(trans)) { 2382 err = PTR_ERR(trans); 2383 trans = NULL; 2384 goto out; 2385 } 2386 2387 /* 2388 * At this point we no longer have a reloc_control, so we can't 2389 * depend on btrfs_init_reloc_root to update our last_trans. 2390 * 2391 * But that's ok, we started the trans handle on our 2392 * corresponding fs_root, which means it's been added to the 2393 * dirty list. At commit time we'll still call 2394 * btrfs_update_reloc_root() and update our root item 2395 * appropriately. 2396 */ 2397 reloc_root->last_trans = trans->transid; 2398 trans->block_rsv = rc->block_rsv; 2399 2400 replaced = 0; 2401 max_level = level; 2402 2403 ret = walk_down_reloc_tree(reloc_root, path, &level); 2404 if (ret < 0) { 2405 err = ret; 2406 goto out; 2407 } 2408 if (ret > 0) 2409 break; 2410 2411 if (!find_next_key(path, level, &key) && 2412 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2413 ret = 0; 2414 } else { 2415 ret = replace_path(trans, rc, root, reloc_root, path, 2416 &next_key, level, max_level); 2417 } 2418 if (ret < 0) { 2419 err = ret; 2420 goto out; 2421 } 2422 2423 if (ret > 0) { 2424 level = ret; 2425 btrfs_node_key_to_cpu(path->nodes[level], &key, 2426 path->slots[level]); 2427 replaced = 1; 2428 } 2429 2430 ret = walk_up_reloc_tree(reloc_root, path, &level); 2431 if (ret > 0) 2432 break; 2433 2434 BUG_ON(level == 0); 2435 /* 2436 * save the merging progress in the drop_progress. 2437 * this is OK since root refs == 1 in this case. 
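 * drop_progress is normally owned by snapshot deletion; borrowing it
 * here is safe because a reloc tree whose root refs == 1 is not being
 * dropped, and the resume logic at the top of this function restores
 * the search position from it.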
2438 */ 2439 btrfs_node_key(path->nodes[level], &root_item->drop_progress, 2440 path->slots[level]); 2441 root_item->drop_level = level; 2442 2443 btrfs_end_transaction_throttle(trans); 2444 trans = NULL; 2445 2446 btrfs_btree_balance_dirty(fs_info); 2447 2448 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2449 invalidate_extent_cache(root, &key, &next_key); 2450 } 2451 2452 /* 2453 * handle the case where only one block in the fs tree needs to be 2454 * relocated and the block is the tree root. 2455 */ 2456 leaf = btrfs_lock_root_node(root); 2457 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); 2458 btrfs_tree_unlock(leaf); 2459 free_extent_buffer(leaf); 2460 if (ret < 0) 2461 err = ret; 2462 out: 2463 btrfs_free_path(path); 2464 2465 if (err == 0) 2466 insert_dirty_subvol(trans, rc, root); 2467 2468 if (trans) 2469 btrfs_end_transaction_throttle(trans); 2470 2471 btrfs_btree_balance_dirty(fs_info); 2472 2473 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2474 invalidate_extent_cache(root, &key, &next_key); 2475 2476 return err; 2477 } 2478 2479 static noinline_for_stack 2480 int prepare_to_merge(struct reloc_control *rc, int err) 2481 { 2482 struct btrfs_root *root = rc->extent_root; 2483 struct btrfs_fs_info *fs_info = root->fs_info; 2484 struct btrfs_root *reloc_root; 2485 struct btrfs_trans_handle *trans; 2486 LIST_HEAD(reloc_roots); 2487 u64 num_bytes = 0; 2488 int ret; 2489 2490 mutex_lock(&fs_info->reloc_mutex); 2491 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2492 rc->merging_rsv_size += rc->nodes_relocated * 2; 2493 mutex_unlock(&fs_info->reloc_mutex); 2494 2495 again: 2496 if (!err) { 2497 num_bytes = rc->merging_rsv_size; 2498 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2499 BTRFS_RESERVE_FLUSH_ALL); 2500 if (ret) 2501 err = ret; 2502 } 2503 2504 trans = btrfs_join_transaction(rc->extent_root); 2505 if (IS_ERR(trans)) { 2506 if (!err) 2507 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2508 num_bytes, NULL); 2509 return PTR_ERR(trans); 2510 } 2511 2512 if (!err) { 2513 if (num_bytes != rc->merging_rsv_size) { 2514 btrfs_end_transaction(trans); 2515 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2516 num_bytes, NULL); 2517 goto again; 2518 } 2519 } 2520 2521 rc->merge_reloc_tree = 1; 2522 2523 while (!list_empty(&rc->reloc_roots)) { 2524 reloc_root = list_entry(rc->reloc_roots.next, 2525 struct btrfs_root, root_list); 2526 list_del_init(&reloc_root->root_list); 2527 2528 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2529 BUG_ON(IS_ERR(root)); 2530 BUG_ON(root->reloc_root != reloc_root); 2531 2532 /* 2533 * set reference count to 1, so btrfs_recover_relocation 2534 * knows it should resume merging 2535 */ 2536 if (!err) 2537 btrfs_set_root_refs(&reloc_root->root_item, 1); 2538 btrfs_update_reloc_root(trans, root); 2539 2540 list_add(&reloc_root->root_list, &reloc_roots); 2541 btrfs_put_root(root); 2542 } 2543 2544 list_splice(&reloc_roots, &rc->reloc_roots); 2545 2546 if (!err) 2547 btrfs_commit_transaction(trans); 2548 else 2549 btrfs_end_transaction(trans); 2550 return err; 2551 } 2552 2553 static noinline_for_stack 2554 void free_reloc_roots(struct list_head *list) 2555 { 2556 struct btrfs_root *reloc_root; 2557 2558 while (!list_empty(list)) { 2559 reloc_root = list_entry(list->next, struct btrfs_root, 2560 root_list); 2561 __del_reloc_root(reloc_root); 2562 } 2563 } 2564 2565 static noinline_for_stack 2566 void merge_reloc_roots(struct reloc_control *rc) 2567 { 2568 struct btrfs_fs_info *fs_info =
rc->extent_root->fs_info; 2569 struct btrfs_root *root; 2570 struct btrfs_root *reloc_root; 2571 LIST_HEAD(reloc_roots); 2572 int found = 0; 2573 int ret = 0; 2574 again: 2575 root = rc->extent_root; 2576 2577 /* 2578 * this serializes us with btrfs_record_root_in_transaction, 2579 * we have to make sure nobody is in the middle of 2580 * adding their roots to the list while we are 2581 * doing this splice 2582 */ 2583 mutex_lock(&fs_info->reloc_mutex); 2584 list_splice_init(&rc->reloc_roots, &reloc_roots); 2585 mutex_unlock(&fs_info->reloc_mutex); 2586 2587 while (!list_empty(&reloc_roots)) { 2588 found = 1; 2589 reloc_root = list_entry(reloc_roots.next, 2590 struct btrfs_root, root_list); 2591 2592 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 2593 root = read_fs_root(fs_info, 2594 reloc_root->root_key.offset); 2595 BUG_ON(IS_ERR(root)); 2596 BUG_ON(root->reloc_root != reloc_root); 2597 2598 ret = merge_reloc_root(rc, root); 2599 btrfs_put_root(root); 2600 if (ret) { 2601 if (list_empty(&reloc_root->root_list)) 2602 list_add_tail(&reloc_root->root_list, 2603 &reloc_roots); 2604 goto out; 2605 } 2606 } else { 2607 list_del_init(&reloc_root->root_list); 2608 /* Don't forget to queue this reloc root for cleanup */ 2609 list_add_tail(&reloc_root->reloc_dirty_list, 2610 &rc->dirty_subvol_roots); 2611 } 2612 } 2613 2614 if (found) { 2615 found = 0; 2616 goto again; 2617 } 2618 out: 2619 if (ret) { 2620 btrfs_handle_fs_error(fs_info, ret, NULL); 2621 if (!list_empty(&reloc_roots)) 2622 free_reloc_roots(&reloc_roots); 2623 2624 /* new reloc root may be added */ 2625 mutex_lock(&fs_info->reloc_mutex); 2626 list_splice_init(&rc->reloc_roots, &reloc_roots); 2627 mutex_unlock(&fs_info->reloc_mutex); 2628 if (!list_empty(&reloc_roots)) 2629 free_reloc_roots(&reloc_roots); 2630 } 2631 2632 /* 2633 * We used to have 2634 * 2635 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2636 * 2637 * here, but it's wrong. If we fail to start the transaction in 2638 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 2639 * have actually been removed from the reloc_root_tree rb tree. This is 2640 * fine because we're bailing here, and we hold a reference on the root 2641 * for the list that holds it, so these roots will be cleaned up when we 2642 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 2643 * will be cleaned up on unmount. 2644 * 2645 * The remaining nodes will be cleaned up by free_reloc_control. 
2646 */ 2647 } 2648 2649 static void free_block_list(struct rb_root *blocks) 2650 { 2651 struct tree_block *block; 2652 struct rb_node *rb_node; 2653 while ((rb_node = rb_first(blocks))) { 2654 block = rb_entry(rb_node, struct tree_block, rb_node); 2655 rb_erase(rb_node, blocks); 2656 kfree(block); 2657 } 2658 } 2659 2660 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2661 struct btrfs_root *reloc_root) 2662 { 2663 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2664 struct btrfs_root *root; 2665 int ret; 2666 2667 if (reloc_root->last_trans == trans->transid) 2668 return 0; 2669 2670 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2671 BUG_ON(IS_ERR(root)); 2672 BUG_ON(root->reloc_root != reloc_root); 2673 ret = btrfs_record_root_in_trans(trans, root); 2674 btrfs_put_root(root); 2675 2676 return ret; 2677 } 2678 2679 static noinline_for_stack 2680 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2681 struct reloc_control *rc, 2682 struct btrfs_backref_node *node, 2683 struct btrfs_backref_edge *edges[]) 2684 { 2685 struct btrfs_backref_node *next; 2686 struct btrfs_root *root; 2687 int index = 0; 2688 2689 next = node; 2690 while (1) { 2691 cond_resched(); 2692 next = walk_up_backref(next, edges, &index); 2693 root = next->root; 2694 BUG_ON(!root); 2695 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); 2696 2697 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2698 record_reloc_root_in_trans(trans, root); 2699 break; 2700 } 2701 2702 btrfs_record_root_in_trans(trans, root); 2703 root = root->reloc_root; 2704 2705 if (next->new_bytenr != root->node->start) { 2706 BUG_ON(next->new_bytenr); 2707 BUG_ON(!list_empty(&next->list)); 2708 next->new_bytenr = root->node->start; 2709 btrfs_put_root(next->root); 2710 next->root = btrfs_grab_root(root); 2711 ASSERT(next->root); 2712 list_add_tail(&next->list, 2713 &rc->backref_cache.changed); 2714 mark_block_processed(rc, next); 2715 break; 2716 } 2717 2718 WARN_ON(1); 2719 root = NULL; 2720 next = walk_down_backref(edges, &index); 2721 if (!next || next->level <= node->level) 2722 break; 2723 } 2724 if (!root) 2725 return NULL; 2726 2727 next = node; 2728 /* setup backref node path for btrfs_reloc_cow_block */ 2729 while (1) { 2730 rc->backref_cache.path[next->level] = next; 2731 if (--index < 0) 2732 break; 2733 next = edges[index]->node[UPPER]; 2734 } 2735 return root; 2736 } 2737 2738 /* 2739 * select a tree root for relocation. return NULL if the block 2740 * is reference counted. we should use do_relocation() in this 2741 * case. return a tree root pointer if the block isn't reference 2742 * counted. return -ENOENT if the block is root of reloc tree. 
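 *
 * Corner case: if the block is itself the root node of a shareable fs
 * tree, that fs root is returned, and relocate_tree_block() will simply
 * point the backref node at the matching reloc tree root instead of
 * calling do_relocation().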
2743 */ 2744 static noinline_for_stack 2745 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2746 { 2747 struct btrfs_backref_node *next; 2748 struct btrfs_root *root; 2749 struct btrfs_root *fs_root = NULL; 2750 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2751 int index = 0; 2752 2753 next = node; 2754 while (1) { 2755 cond_resched(); 2756 next = walk_up_backref(next, edges, &index); 2757 root = next->root; 2758 BUG_ON(!root); 2759 2760 /* no other choice for a non-reference counted tree */ 2761 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 2762 return root; 2763 2764 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2765 fs_root = root; 2766 2767 if (next != node) 2768 return NULL; 2769 2770 next = walk_down_backref(edges, &index); 2771 if (!next || next->level <= node->level) 2772 break; 2773 } 2774 2775 if (!fs_root) 2776 return ERR_PTR(-ENOENT); 2777 return fs_root; 2778 } 2779 2780 static noinline_for_stack 2781 u64 calcu_metadata_size(struct reloc_control *rc, 2782 struct btrfs_backref_node *node, int reserve) 2783 { 2784 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2785 struct btrfs_backref_node *next = node; 2786 struct btrfs_backref_edge *edge; 2787 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2788 u64 num_bytes = 0; 2789 int index = 0; 2790 2791 BUG_ON(reserve && node->processed); 2792 2793 while (next) { 2794 cond_resched(); 2795 while (1) { 2796 if (next->processed && (reserve || next != node)) 2797 break; 2798 2799 num_bytes += fs_info->nodesize; 2800 2801 if (list_empty(&next->upper)) 2802 break; 2803 2804 edge = list_entry(next->upper.next, 2805 struct btrfs_backref_edge, list[LOWER]); 2806 edges[index++] = edge; 2807 next = edge->node[UPPER]; 2808 } 2809 next = walk_down_backref(edges, &index); 2810 } 2811 return num_bytes; 2812 } 2813 2814 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2815 struct reloc_control *rc, 2816 struct btrfs_backref_node *node) 2817 { 2818 struct btrfs_root *root = rc->extent_root; 2819 struct btrfs_fs_info *fs_info = root->fs_info; 2820 u64 num_bytes; 2821 int ret; 2822 u64 tmp; 2823 2824 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2825 2826 trans->block_rsv = rc->block_rsv; 2827 rc->reserved_bytes += num_bytes; 2828 2829 /* 2830 * We are under a transaction here so we can only do limited flushing. 2831 * If we get an enospc just kick back -EAGAIN so we know to drop the 2832 * transaction and try to refill when we can flush all the things. 2833 */ 2834 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, 2835 BTRFS_RESERVE_FLUSH_LIMIT); 2836 if (ret) { 2837 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2838 while (tmp <= rc->reserved_bytes) 2839 tmp <<= 1; 2840 /* 2841 * only one thread can access block_rsv at this point, 2842 * so we don't need to hold a lock to protect block_rsv. 2843 * we expand the reservation size here to allow enough 2844 * space for relocation and we will return early in the 2845 * enospc case. 2846 */ 2847 rc->block_rsv->size = tmp + fs_info->nodesize * 2848 RELOCATION_RESERVED_NODES; 2849 return -EAGAIN; 2850 } 2851 2852 return 0; 2853 } 2854 2855 /* 2856 * relocate a tree block, and then update pointers in upper level 2857 * blocks that reference the block to point to the new location. 2858 * 2859 * if called by link_to_upper, the block has already been relocated. 2860 * in that case this function just updates pointers.
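 *
 * The parents of the block are found through the backref cache edges.
 * For the lowest (not yet relocated) block the new copy is created by
 * btrfs_cow_block() on the path from the selected reloc root; for
 * blocks that have already been relocated the parent's block pointer is
 * rewritten in place and the reference counts are adjusted with
 * btrfs_inc_extent_ref() and btrfs_drop_subtree().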
2861 */ 2862 static int do_relocation(struct btrfs_trans_handle *trans, 2863 struct reloc_control *rc, 2864 struct btrfs_backref_node *node, 2865 struct btrfs_key *key, 2866 struct btrfs_path *path, int lowest) 2867 { 2868 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2869 struct btrfs_backref_node *upper; 2870 struct btrfs_backref_edge *edge; 2871 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2872 struct btrfs_root *root; 2873 struct extent_buffer *eb; 2874 u32 blocksize; 2875 u64 bytenr; 2876 u64 generation; 2877 int slot; 2878 int ret; 2879 int err = 0; 2880 2881 BUG_ON(lowest && node->eb); 2882 2883 path->lowest_level = node->level + 1; 2884 rc->backref_cache.path[node->level] = node; 2885 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2886 struct btrfs_key first_key; 2887 struct btrfs_ref ref = { 0 }; 2888 2889 cond_resched(); 2890 2891 upper = edge->node[UPPER]; 2892 root = select_reloc_root(trans, rc, upper, edges); 2893 BUG_ON(!root); 2894 2895 if (upper->eb && !upper->locked) { 2896 if (!lowest) { 2897 ret = btrfs_bin_search(upper->eb, key, 2898 upper->level, &slot); 2899 if (ret < 0) { 2900 err = ret; 2901 goto next; 2902 } 2903 BUG_ON(ret); 2904 bytenr = btrfs_node_blockptr(upper->eb, slot); 2905 if (node->eb->start == bytenr) 2906 goto next; 2907 } 2908 drop_node_buffer(upper); 2909 } 2910 2911 if (!upper->eb) { 2912 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2913 if (ret) { 2914 if (ret < 0) 2915 err = ret; 2916 else 2917 err = -ENOENT; 2918 2919 btrfs_release_path(path); 2920 break; 2921 } 2922 2923 if (!upper->eb) { 2924 upper->eb = path->nodes[upper->level]; 2925 path->nodes[upper->level] = NULL; 2926 } else { 2927 BUG_ON(upper->eb != path->nodes[upper->level]); 2928 } 2929 2930 upper->locked = 1; 2931 path->locks[upper->level] = 0; 2932 2933 slot = path->slots[upper->level]; 2934 btrfs_release_path(path); 2935 } else { 2936 ret = btrfs_bin_search(upper->eb, key, upper->level, 2937 &slot); 2938 if (ret < 0) { 2939 err = ret; 2940 goto next; 2941 } 2942 BUG_ON(ret); 2943 } 2944 2945 bytenr = btrfs_node_blockptr(upper->eb, slot); 2946 if (lowest) { 2947 if (bytenr != node->bytenr) { 2948 btrfs_err(root->fs_info, 2949 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2950 bytenr, node->bytenr, slot, 2951 upper->eb->start); 2952 err = -EIO; 2953 goto next; 2954 } 2955 } else { 2956 if (node->eb->start == bytenr) 2957 goto next; 2958 } 2959 2960 blocksize = root->fs_info->nodesize; 2961 generation = btrfs_node_ptr_generation(upper->eb, slot); 2962 btrfs_node_key_to_cpu(upper->eb, &first_key, slot); 2963 eb = read_tree_block(fs_info, bytenr, generation, 2964 upper->level - 1, &first_key); 2965 if (IS_ERR(eb)) { 2966 err = PTR_ERR(eb); 2967 goto next; 2968 } else if (!extent_buffer_uptodate(eb)) { 2969 free_extent_buffer(eb); 2970 err = -EIO; 2971 goto next; 2972 } 2973 btrfs_tree_lock(eb); 2974 btrfs_set_lock_blocking_write(eb); 2975 2976 if (!node->eb) { 2977 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2978 slot, &eb); 2979 btrfs_tree_unlock(eb); 2980 free_extent_buffer(eb); 2981 if (ret < 0) { 2982 err = ret; 2983 goto next; 2984 } 2985 BUG_ON(node->eb != eb); 2986 } else { 2987 btrfs_set_node_blockptr(upper->eb, slot, 2988 node->eb->start); 2989 btrfs_set_node_ptr_generation(upper->eb, slot, 2990 trans->transid); 2991 btrfs_mark_buffer_dirty(upper->eb); 2992 2993 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2994 node->eb->start, blocksize, 2995 upper->eb->start); 2996 ref.real_root = 
root->root_key.objectid; 2997 btrfs_init_tree_ref(&ref, node->level, 2998 btrfs_header_owner(upper->eb)); 2999 ret = btrfs_inc_extent_ref(trans, &ref); 3000 BUG_ON(ret); 3001 3002 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 3003 BUG_ON(ret); 3004 } 3005 next: 3006 if (!upper->pending) 3007 drop_node_buffer(upper); 3008 else 3009 unlock_node_buffer(upper); 3010 if (err) 3011 break; 3012 } 3013 3014 if (!err && node->pending) { 3015 drop_node_buffer(node); 3016 list_move_tail(&node->list, &rc->backref_cache.changed); 3017 node->pending = 0; 3018 } 3019 3020 path->lowest_level = 0; 3021 BUG_ON(err == -ENOSPC); 3022 return err; 3023 } 3024 3025 static int link_to_upper(struct btrfs_trans_handle *trans, 3026 struct reloc_control *rc, 3027 struct btrfs_backref_node *node, 3028 struct btrfs_path *path) 3029 { 3030 struct btrfs_key key; 3031 3032 btrfs_node_key_to_cpu(node->eb, &key, 0); 3033 return do_relocation(trans, rc, node, &key, path, 0); 3034 } 3035 3036 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 3037 struct reloc_control *rc, 3038 struct btrfs_path *path, int err) 3039 { 3040 LIST_HEAD(list); 3041 struct btrfs_backref_cache *cache = &rc->backref_cache; 3042 struct btrfs_backref_node *node; 3043 int level; 3044 int ret; 3045 3046 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 3047 while (!list_empty(&cache->pending[level])) { 3048 node = list_entry(cache->pending[level].next, 3049 struct btrfs_backref_node, list); 3050 list_move_tail(&node->list, &list); 3051 BUG_ON(!node->pending); 3052 3053 if (!err) { 3054 ret = link_to_upper(trans, rc, node, path); 3055 if (ret < 0) 3056 err = ret; 3057 } 3058 } 3059 list_splice_init(&list, &cache->pending[level]); 3060 } 3061 return err; 3062 } 3063 3064 /* 3065 * mark a block and all blocks directly/indirectly reference the block 3066 * as processed. 
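 * This walks the LOWER -> UPPER edges of the backref cache, so the
 * block itself and every cached ancestor that references it get their
 * processed flag set.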
3067 */ 3068 static void update_processed_blocks(struct reloc_control *rc, 3069 struct btrfs_backref_node *node) 3070 { 3071 struct btrfs_backref_node *next = node; 3072 struct btrfs_backref_edge *edge; 3073 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 3074 int index = 0; 3075 3076 while (next) { 3077 cond_resched(); 3078 while (1) { 3079 if (next->processed) 3080 break; 3081 3082 mark_block_processed(rc, next); 3083 3084 if (list_empty(&next->upper)) 3085 break; 3086 3087 edge = list_entry(next->upper.next, 3088 struct btrfs_backref_edge, list[LOWER]); 3089 edges[index++] = edge; 3090 next = edge->node[UPPER]; 3091 } 3092 next = walk_down_backref(edges, &index); 3093 } 3094 } 3095 3096 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 3097 { 3098 u32 blocksize = rc->extent_root->fs_info->nodesize; 3099 3100 if (test_range_bit(&rc->processed_blocks, bytenr, 3101 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 3102 return 1; 3103 return 0; 3104 } 3105 3106 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 3107 struct tree_block *block) 3108 { 3109 struct extent_buffer *eb; 3110 3111 eb = read_tree_block(fs_info, block->bytenr, block->key.offset, 3112 block->level, NULL); 3113 if (IS_ERR(eb)) { 3114 return PTR_ERR(eb); 3115 } else if (!extent_buffer_uptodate(eb)) { 3116 free_extent_buffer(eb); 3117 return -EIO; 3118 } 3119 if (block->level == 0) 3120 btrfs_item_key_to_cpu(eb, &block->key, 0); 3121 else 3122 btrfs_node_key_to_cpu(eb, &block->key, 0); 3123 free_extent_buffer(eb); 3124 block->key_ready = 1; 3125 return 0; 3126 } 3127 3128 /* 3129 * helper function to relocate a tree block 3130 */ 3131 static int relocate_tree_block(struct btrfs_trans_handle *trans, 3132 struct reloc_control *rc, 3133 struct btrfs_backref_node *node, 3134 struct btrfs_key *key, 3135 struct btrfs_path *path) 3136 { 3137 struct btrfs_root *root; 3138 int ret = 0; 3139 3140 if (!node) 3141 return 0; 3142 3143 /* 3144 * If we fail here we want to drop our backref_node because we are going 3145 * to start over and regenerate the tree for it. 
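 * The node is removed at the "out:" label below on error, and likewise
 * for leaf and COW-only nodes once they have been processed.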
3146 */ 3147 ret = reserve_metadata_space(trans, rc, node); 3148 if (ret) 3149 goto out; 3150 3151 BUG_ON(node->processed); 3152 root = select_one_root(node); 3153 if (root == ERR_PTR(-ENOENT)) { 3154 update_processed_blocks(rc, node); 3155 goto out; 3156 } 3157 3158 if (root) { 3159 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3160 BUG_ON(node->new_bytenr); 3161 BUG_ON(!list_empty(&node->list)); 3162 btrfs_record_root_in_trans(trans, root); 3163 root = root->reloc_root; 3164 node->new_bytenr = root->node->start; 3165 btrfs_put_root(node->root); 3166 node->root = btrfs_grab_root(root); 3167 ASSERT(node->root); 3168 list_add_tail(&node->list, &rc->backref_cache.changed); 3169 } else { 3170 path->lowest_level = node->level; 3171 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 3172 btrfs_release_path(path); 3173 if (ret > 0) 3174 ret = 0; 3175 } 3176 if (!ret) 3177 update_processed_blocks(rc, node); 3178 } else { 3179 ret = do_relocation(trans, rc, node, key, path, 1); 3180 } 3181 out: 3182 if (ret || node->level == 0 || node->cowonly) 3183 remove_backref_node(&rc->backref_cache, node); 3184 return ret; 3185 } 3186 3187 /* 3188 * relocate a list of blocks 3189 */ 3190 static noinline_for_stack 3191 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 3192 struct reloc_control *rc, struct rb_root *blocks) 3193 { 3194 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3195 struct btrfs_backref_node *node; 3196 struct btrfs_path *path; 3197 struct tree_block *block; 3198 struct tree_block *next; 3199 int ret; 3200 int err = 0; 3201 3202 path = btrfs_alloc_path(); 3203 if (!path) { 3204 err = -ENOMEM; 3205 goto out_free_blocks; 3206 } 3207 3208 /* Kick in readahead for tree blocks with missing keys */ 3209 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3210 if (!block->key_ready) 3211 readahead_tree_block(fs_info, block->bytenr); 3212 } 3213 3214 /* Get first keys */ 3215 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3216 if (!block->key_ready) { 3217 err = get_tree_block_key(fs_info, block); 3218 if (err) 3219 goto out_free_path; 3220 } 3221 } 3222 3223 /* Do tree relocation */ 3224 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3225 node = build_backref_tree(rc, &block->key, 3226 block->level, block->bytenr); 3227 if (IS_ERR(node)) { 3228 err = PTR_ERR(node); 3229 goto out; 3230 } 3231 3232 ret = relocate_tree_block(trans, rc, node, &block->key, 3233 path); 3234 if (ret < 0) { 3235 err = ret; 3236 break; 3237 } 3238 } 3239 out: 3240 err = finish_pending_nodes(trans, rc, path, err); 3241 3242 out_free_path: 3243 btrfs_free_path(path); 3244 out_free_blocks: 3245 free_block_list(blocks); 3246 return err; 3247 } 3248 3249 static noinline_for_stack 3250 int prealloc_file_extent_cluster(struct inode *inode, 3251 struct file_extent_cluster *cluster) 3252 { 3253 u64 alloc_hint = 0; 3254 u64 start; 3255 u64 end; 3256 u64 offset = BTRFS_I(inode)->index_cnt; 3257 u64 num_bytes; 3258 int nr = 0; 3259 int ret = 0; 3260 u64 prealloc_start = cluster->start - offset; 3261 u64 prealloc_end = cluster->end - offset; 3262 u64 cur_offset; 3263 struct extent_changeset *data_reserved = NULL; 3264 3265 BUG_ON(cluster->start != cluster->boundary[0]); 3266 inode_lock(inode); 3267 3268 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, 3269 prealloc_end + 1 - prealloc_start); 3270 if (ret) 3271 goto out; 3272 3273 cur_offset = prealloc_start; 3274 while (nr < cluster->nr) { 3275 start = cluster->boundary[nr] 
- offset; 3276 if (nr + 1 < cluster->nr) 3277 end = cluster->boundary[nr + 1] - 1 - offset; 3278 else 3279 end = cluster->end - offset; 3280 3281 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3282 num_bytes = end + 1 - start; 3283 if (cur_offset < start) 3284 btrfs_free_reserved_data_space(inode, data_reserved, 3285 cur_offset, start - cur_offset); 3286 ret = btrfs_prealloc_file_range(inode, 0, start, 3287 num_bytes, num_bytes, 3288 end + 1, &alloc_hint); 3289 cur_offset = end + 1; 3290 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3291 if (ret) 3292 break; 3293 nr++; 3294 } 3295 if (cur_offset < prealloc_end) 3296 btrfs_free_reserved_data_space(inode, data_reserved, 3297 cur_offset, prealloc_end + 1 - cur_offset); 3298 out: 3299 inode_unlock(inode); 3300 extent_changeset_free(data_reserved); 3301 return ret; 3302 } 3303 3304 static noinline_for_stack 3305 int setup_extent_mapping(struct inode *inode, u64 start, u64 end, 3306 u64 block_start) 3307 { 3308 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3309 struct extent_map *em; 3310 int ret = 0; 3311 3312 em = alloc_extent_map(); 3313 if (!em) 3314 return -ENOMEM; 3315 3316 em->start = start; 3317 em->len = end + 1 - start; 3318 em->block_len = em->len; 3319 em->block_start = block_start; 3320 set_bit(EXTENT_FLAG_PINNED, &em->flags); 3321 3322 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3323 while (1) { 3324 write_lock(&em_tree->lock); 3325 ret = add_extent_mapping(em_tree, em, 0); 3326 write_unlock(&em_tree->lock); 3327 if (ret != -EEXIST) { 3328 free_extent_map(em); 3329 break; 3330 } 3331 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 3332 } 3333 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3334 return ret; 3335 } 3336 3337 /* 3338 * Allow error injection to test balance cancellation 3339 */ 3340 int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) 3341 { 3342 return atomic_read(&fs_info->balance_cancel_req); 3343 } 3344 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 3345 3346 static int relocate_file_extent_cluster(struct inode *inode, 3347 struct file_extent_cluster *cluster) 3348 { 3349 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3350 u64 page_start; 3351 u64 page_end; 3352 u64 offset = BTRFS_I(inode)->index_cnt; 3353 unsigned long index; 3354 unsigned long last_index; 3355 struct page *page; 3356 struct file_ra_state *ra; 3357 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 3358 int nr = 0; 3359 int ret = 0; 3360 3361 if (!cluster->nr) 3362 return 0; 3363 3364 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3365 if (!ra) 3366 return -ENOMEM; 3367 3368 ret = prealloc_file_extent_cluster(inode, cluster); 3369 if (ret) 3370 goto out; 3371 3372 file_ra_state_init(ra, inode->i_mapping); 3373 3374 ret = setup_extent_mapping(inode, cluster->start - offset, 3375 cluster->end - offset, cluster->start); 3376 if (ret) 3377 goto out; 3378 3379 index = (cluster->start - offset) >> PAGE_SHIFT; 3380 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3381 while (index <= last_index) { 3382 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3383 PAGE_SIZE); 3384 if (ret) 3385 goto out; 3386 3387 page = find_lock_page(inode->i_mapping, index); 3388 if (!page) { 3389 page_cache_sync_readahead(inode->i_mapping, 3390 ra, NULL, index, 3391 last_index + 1 - index); 3392 page = find_or_create_page(inode->i_mapping, index, 3393 mask); 3394 if (!page) { 3395 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3396 PAGE_SIZE, true); 3397 
btrfs_delalloc_release_extents(BTRFS_I(inode), 3398 PAGE_SIZE); 3399 ret = -ENOMEM; 3400 goto out; 3401 } 3402 } 3403 3404 if (PageReadahead(page)) { 3405 page_cache_async_readahead(inode->i_mapping, 3406 ra, NULL, page, index, 3407 last_index + 1 - index); 3408 } 3409 3410 if (!PageUptodate(page)) { 3411 btrfs_readpage(NULL, page); 3412 lock_page(page); 3413 if (!PageUptodate(page)) { 3414 unlock_page(page); 3415 put_page(page); 3416 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3417 PAGE_SIZE, true); 3418 btrfs_delalloc_release_extents(BTRFS_I(inode), 3419 PAGE_SIZE); 3420 ret = -EIO; 3421 goto out; 3422 } 3423 } 3424 3425 page_start = page_offset(page); 3426 page_end = page_start + PAGE_SIZE - 1; 3427 3428 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3429 3430 set_page_extent_mapped(page); 3431 3432 if (nr < cluster->nr && 3433 page_start + offset == cluster->boundary[nr]) { 3434 set_extent_bits(&BTRFS_I(inode)->io_tree, 3435 page_start, page_end, 3436 EXTENT_BOUNDARY); 3437 nr++; 3438 } 3439 3440 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 3441 NULL); 3442 if (ret) { 3443 unlock_page(page); 3444 put_page(page); 3445 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3446 PAGE_SIZE, true); 3447 btrfs_delalloc_release_extents(BTRFS_I(inode), 3448 PAGE_SIZE); 3449 3450 clear_extent_bits(&BTRFS_I(inode)->io_tree, 3451 page_start, page_end, 3452 EXTENT_LOCKED | EXTENT_BOUNDARY); 3453 goto out; 3454 3455 } 3456 set_page_dirty(page); 3457 3458 unlock_extent(&BTRFS_I(inode)->io_tree, 3459 page_start, page_end); 3460 unlock_page(page); 3461 put_page(page); 3462 3463 index++; 3464 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 3465 balance_dirty_pages_ratelimited(inode->i_mapping); 3466 btrfs_throttle(fs_info); 3467 if (btrfs_should_cancel_balance(fs_info)) { 3468 ret = -ECANCELED; 3469 goto out; 3470 } 3471 } 3472 WARN_ON(nr != cluster->nr); 3473 out: 3474 kfree(ra); 3475 return ret; 3476 } 3477 3478 static noinline_for_stack 3479 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3480 struct file_extent_cluster *cluster) 3481 { 3482 int ret; 3483 3484 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3485 ret = relocate_file_extent_cluster(inode, cluster); 3486 if (ret) 3487 return ret; 3488 cluster->nr = 0; 3489 } 3490 3491 if (!cluster->nr) 3492 cluster->start = extent_key->objectid; 3493 else 3494 BUG_ON(cluster->nr >= MAX_EXTENTS); 3495 cluster->end = extent_key->objectid + extent_key->offset - 1; 3496 cluster->boundary[cluster->nr] = extent_key->objectid; 3497 cluster->nr++; 3498 3499 if (cluster->nr >= MAX_EXTENTS) { 3500 ret = relocate_file_extent_cluster(inode, cluster); 3501 if (ret) 3502 return ret; 3503 cluster->nr = 0; 3504 } 3505 return 0; 3506 } 3507 3508 /* 3509 * helper to add a tree block to the list. 
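 * (the list is really the rb-tree of tree_block entries, keyed by
 * bytenr, that relocate_tree_blocks() consumes)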
3510 * the major work is getting the generation and level of the block 3511 */ 3512 static int add_tree_block(struct reloc_control *rc, 3513 struct btrfs_key *extent_key, 3514 struct btrfs_path *path, 3515 struct rb_root *blocks) 3516 { 3517 struct extent_buffer *eb; 3518 struct btrfs_extent_item *ei; 3519 struct btrfs_tree_block_info *bi; 3520 struct tree_block *block; 3521 struct rb_node *rb_node; 3522 u32 item_size; 3523 int level = -1; 3524 u64 generation; 3525 3526 eb = path->nodes[0]; 3527 item_size = btrfs_item_size_nr(eb, path->slots[0]); 3528 3529 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3530 item_size >= sizeof(*ei) + sizeof(*bi)) { 3531 ei = btrfs_item_ptr(eb, path->slots[0], 3532 struct btrfs_extent_item); 3533 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3534 bi = (struct btrfs_tree_block_info *)(ei + 1); 3535 level = btrfs_tree_block_level(eb, bi); 3536 } else { 3537 level = (int)extent_key->offset; 3538 } 3539 generation = btrfs_extent_generation(eb, ei); 3540 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3541 btrfs_print_v0_err(eb->fs_info); 3542 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3543 return -EINVAL; 3544 } else { 3545 BUG(); 3546 } 3547 3548 btrfs_release_path(path); 3549 3550 BUG_ON(level == -1); 3551 3552 block = kmalloc(sizeof(*block), GFP_NOFS); 3553 if (!block) 3554 return -ENOMEM; 3555 3556 block->bytenr = extent_key->objectid; 3557 block->key.objectid = rc->extent_root->fs_info->nodesize; 3558 block->key.offset = generation; 3559 block->level = level; 3560 block->key_ready = 0; 3561 3562 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3563 if (rb_node) 3564 backref_tree_panic(rb_node, -EEXIST, block->bytenr); 3565 3566 return 0; 3567 } 3568 3569 /* 3570 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3571 */ 3572 static int __add_tree_block(struct reloc_control *rc, 3573 u64 bytenr, u32 blocksize, 3574 struct rb_root *blocks) 3575 { 3576 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3577 struct btrfs_path *path; 3578 struct btrfs_key key; 3579 int ret; 3580 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3581 3582 if (tree_block_processed(bytenr, rc)) 3583 return 0; 3584 3585 if (rb_simple_search(blocks, bytenr)) 3586 return 0; 3587 3588 path = btrfs_alloc_path(); 3589 if (!path) 3590 return -ENOMEM; 3591 again: 3592 key.objectid = bytenr; 3593 if (skinny) { 3594 key.type = BTRFS_METADATA_ITEM_KEY; 3595 key.offset = (u64)-1; 3596 } else { 3597 key.type = BTRFS_EXTENT_ITEM_KEY; 3598 key.offset = blocksize; 3599 } 3600 3601 path->search_commit_root = 1; 3602 path->skip_locking = 1; 3603 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3604 if (ret < 0) 3605 goto out; 3606 3607 if (ret > 0 && skinny) { 3608 if (path->slots[0]) { 3609 path->slots[0]--; 3610 btrfs_item_key_to_cpu(path->nodes[0], &key, 3611 path->slots[0]); 3612 if (key.objectid == bytenr && 3613 (key.type == BTRFS_METADATA_ITEM_KEY || 3614 (key.type == BTRFS_EXTENT_ITEM_KEY && 3615 key.offset == blocksize))) 3616 ret = 0; 3617 } 3618 3619 if (ret) { 3620 skinny = false; 3621 btrfs_release_path(path); 3622 goto again; 3623 } 3624 } 3625 if (ret) { 3626 ASSERT(ret == 1); 3627 btrfs_print_leaf(path->nodes[0]); 3628 btrfs_err(fs_info, 3629 "tree block extent item (%llu) is not found in extent tree", 3630 bytenr); 3631 WARN_ON(1); 3632 ret = -EINVAL; 3633 goto out; 3634 } 3635 3636 ret = add_tree_block(rc, &key, path, blocks); 3637 out: 3638 btrfs_free_path(path); 
3639 return ret; 3640 } 3641 3642 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3643 struct btrfs_block_group *block_group, 3644 struct inode *inode, 3645 u64 ino) 3646 { 3647 struct btrfs_key key; 3648 struct btrfs_root *root = fs_info->tree_root; 3649 struct btrfs_trans_handle *trans; 3650 int ret = 0; 3651 3652 if (inode) 3653 goto truncate; 3654 3655 key.objectid = ino; 3656 key.type = BTRFS_INODE_ITEM_KEY; 3657 key.offset = 0; 3658 3659 inode = btrfs_iget(fs_info->sb, &key, root); 3660 if (IS_ERR(inode)) 3661 return -ENOENT; 3662 3663 truncate: 3664 ret = btrfs_check_trunc_cache_free_space(fs_info, 3665 &fs_info->global_block_rsv); 3666 if (ret) 3667 goto out; 3668 3669 trans = btrfs_join_transaction(root); 3670 if (IS_ERR(trans)) { 3671 ret = PTR_ERR(trans); 3672 goto out; 3673 } 3674 3675 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3676 3677 btrfs_end_transaction(trans); 3678 btrfs_btree_balance_dirty(fs_info); 3679 out: 3680 iput(inode); 3681 return ret; 3682 } 3683 3684 /* 3685 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3686 * cache inode, to avoid free space cache data extent blocking data relocation. 3687 */ 3688 static int delete_v1_space_cache(struct extent_buffer *leaf, 3689 struct btrfs_block_group *block_group, 3690 u64 data_bytenr) 3691 { 3692 u64 space_cache_ino; 3693 struct btrfs_file_extent_item *ei; 3694 struct btrfs_key key; 3695 bool found = false; 3696 int i; 3697 int ret; 3698 3699 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3700 return 0; 3701 3702 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3703 btrfs_item_key_to_cpu(leaf, &key, i); 3704 if (key.type != BTRFS_EXTENT_DATA_KEY) 3705 continue; 3706 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3707 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG && 3708 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3709 found = true; 3710 space_cache_ino = key.objectid; 3711 break; 3712 } 3713 } 3714 if (!found) 3715 return -ENOENT; 3716 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3717 space_cache_ino); 3718 return ret; 3719 } 3720 3721 /* 3722 * helper to find all tree blocks that reference a given data extent 3723 */ 3724 static noinline_for_stack 3725 int add_data_references(struct reloc_control *rc, 3726 struct btrfs_key *extent_key, 3727 struct btrfs_path *path, 3728 struct rb_root *blocks) 3729 { 3730 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3731 struct ulist *leaves = NULL; 3732 struct ulist_iterator leaf_uiter; 3733 struct ulist_node *ref_node = NULL; 3734 const u32 blocksize = fs_info->nodesize; 3735 int ret = 0; 3736 3737 btrfs_release_path(path); 3738 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, 3739 0, &leaves, NULL, true); 3740 if (ret < 0) 3741 return ret; 3742 3743 ULIST_ITER_INIT(&leaf_uiter); 3744 while ((ref_node = ulist_next(leaves, &leaf_uiter))) { 3745 struct extent_buffer *eb; 3746 3747 eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); 3748 if (IS_ERR(eb)) { 3749 ret = PTR_ERR(eb); 3750 break; 3751 } 3752 ret = delete_v1_space_cache(eb, rc->block_group, 3753 extent_key->objectid); 3754 free_extent_buffer(eb); 3755 if (ret < 0) 3756 break; 3757 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3758 if (ret < 0) 3759 break; 3760 } 3761 if (ret < 0) 3762 free_block_list(blocks); 3763 ulist_free(leaves); 3764 return ret; 3765 } 3766 3767 /* 3768 * helper to find next unprocessed extent 3769 */ 3770 static 
noinline_for_stack 3771 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3772 struct btrfs_key *extent_key) 3773 { 3774 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3775 struct btrfs_key key; 3776 struct extent_buffer *leaf; 3777 u64 start, end, last; 3778 int ret; 3779 3780 last = rc->block_group->start + rc->block_group->length; 3781 while (1) { 3782 cond_resched(); 3783 if (rc->search_start >= last) { 3784 ret = 1; 3785 break; 3786 } 3787 3788 key.objectid = rc->search_start; 3789 key.type = BTRFS_EXTENT_ITEM_KEY; 3790 key.offset = 0; 3791 3792 path->search_commit_root = 1; 3793 path->skip_locking = 1; 3794 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3795 0, 0); 3796 if (ret < 0) 3797 break; 3798 next: 3799 leaf = path->nodes[0]; 3800 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3801 ret = btrfs_next_leaf(rc->extent_root, path); 3802 if (ret != 0) 3803 break; 3804 leaf = path->nodes[0]; 3805 } 3806 3807 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3808 if (key.objectid >= last) { 3809 ret = 1; 3810 break; 3811 } 3812 3813 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3814 key.type != BTRFS_METADATA_ITEM_KEY) { 3815 path->slots[0]++; 3816 goto next; 3817 } 3818 3819 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3820 key.objectid + key.offset <= rc->search_start) { 3821 path->slots[0]++; 3822 goto next; 3823 } 3824 3825 if (key.type == BTRFS_METADATA_ITEM_KEY && 3826 key.objectid + fs_info->nodesize <= 3827 rc->search_start) { 3828 path->slots[0]++; 3829 goto next; 3830 } 3831 3832 ret = find_first_extent_bit(&rc->processed_blocks, 3833 key.objectid, &start, &end, 3834 EXTENT_DIRTY, NULL); 3835 3836 if (ret == 0 && start <= key.objectid) { 3837 btrfs_release_path(path); 3838 rc->search_start = end + 1; 3839 } else { 3840 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3841 rc->search_start = key.objectid + key.offset; 3842 else 3843 rc->search_start = key.objectid + 3844 fs_info->nodesize; 3845 memcpy(extent_key, &key, sizeof(key)); 3846 return 0; 3847 } 3848 } 3849 btrfs_release_path(path); 3850 return ret; 3851 } 3852 3853 static void set_reloc_control(struct reloc_control *rc) 3854 { 3855 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3856 3857 mutex_lock(&fs_info->reloc_mutex); 3858 fs_info->reloc_ctl = rc; 3859 mutex_unlock(&fs_info->reloc_mutex); 3860 } 3861 3862 static void unset_reloc_control(struct reloc_control *rc) 3863 { 3864 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3865 3866 mutex_lock(&fs_info->reloc_mutex); 3867 fs_info->reloc_ctl = NULL; 3868 mutex_unlock(&fs_info->reloc_mutex); 3869 } 3870 3871 static int check_extent_flags(u64 flags) 3872 { 3873 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3874 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3875 return 1; 3876 if (!(flags & BTRFS_EXTENT_FLAG_DATA) && 3877 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3878 return 1; 3879 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3880 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 3881 return 1; 3882 return 0; 3883 } 3884 3885 static noinline_for_stack 3886 int prepare_to_relocate(struct reloc_control *rc) 3887 { 3888 struct btrfs_trans_handle *trans; 3889 int ret; 3890 3891 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3892 BTRFS_BLOCK_RSV_TEMP); 3893 if (!rc->block_rsv) 3894 return -ENOMEM; 3895 3896 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3897 rc->search_start = rc->block_group->start; 3898 rc->extents_found = 0; 3899 rc->nodes_relocated = 0; 3900 rc->merging_rsv_size = 0; 3901 rc->reserved_bytes = 0; 3902 
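	/*
	 * Start with enough metadata space for RELOCATION_RESERVED_NODES
	 * tree blocks; reserve_metadata_space() grows rc->block_rsv->size
	 * and kicks back -EAGAIN if the backref walk turns out to need
	 * more than this.
	 */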
rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3903 RELOCATION_RESERVED_NODES; 3904 ret = btrfs_block_rsv_refill(rc->extent_root, 3905 rc->block_rsv, rc->block_rsv->size, 3906 BTRFS_RESERVE_FLUSH_ALL); 3907 if (ret) 3908 return ret; 3909 3910 rc->create_reloc_tree = 1; 3911 set_reloc_control(rc); 3912 3913 trans = btrfs_join_transaction(rc->extent_root); 3914 if (IS_ERR(trans)) { 3915 unset_reloc_control(rc); 3916 /* 3917 * extent tree is not a ref_cow tree and has no reloc_root to 3918 * cleanup. And callers are responsible to free the above 3919 * block rsv. 3920 */ 3921 return PTR_ERR(trans); 3922 } 3923 btrfs_commit_transaction(trans); 3924 return 0; 3925 } 3926 3927 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3928 { 3929 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3930 struct rb_root blocks = RB_ROOT; 3931 struct btrfs_key key; 3932 struct btrfs_trans_handle *trans = NULL; 3933 struct btrfs_path *path; 3934 struct btrfs_extent_item *ei; 3935 u64 flags; 3936 u32 item_size; 3937 int ret; 3938 int err = 0; 3939 int progress = 0; 3940 3941 path = btrfs_alloc_path(); 3942 if (!path) 3943 return -ENOMEM; 3944 path->reada = READA_FORWARD; 3945 3946 ret = prepare_to_relocate(rc); 3947 if (ret) { 3948 err = ret; 3949 goto out_free; 3950 } 3951 3952 while (1) { 3953 rc->reserved_bytes = 0; 3954 ret = btrfs_block_rsv_refill(rc->extent_root, 3955 rc->block_rsv, rc->block_rsv->size, 3956 BTRFS_RESERVE_FLUSH_ALL); 3957 if (ret) { 3958 err = ret; 3959 break; 3960 } 3961 progress++; 3962 trans = btrfs_start_transaction(rc->extent_root, 0); 3963 if (IS_ERR(trans)) { 3964 err = PTR_ERR(trans); 3965 trans = NULL; 3966 break; 3967 } 3968 restart: 3969 if (update_backref_cache(trans, &rc->backref_cache)) { 3970 btrfs_end_transaction(trans); 3971 trans = NULL; 3972 continue; 3973 } 3974 3975 ret = find_next_extent(rc, path, &key); 3976 if (ret < 0) 3977 err = ret; 3978 if (ret != 0) 3979 break; 3980 3981 rc->extents_found++; 3982 3983 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3984 struct btrfs_extent_item); 3985 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 3986 if (item_size >= sizeof(*ei)) { 3987 flags = btrfs_extent_flags(path->nodes[0], ei); 3988 ret = check_extent_flags(flags); 3989 BUG_ON(ret); 3990 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3991 err = -EINVAL; 3992 btrfs_print_v0_err(trans->fs_info); 3993 btrfs_abort_transaction(trans, err); 3994 break; 3995 } else { 3996 BUG(); 3997 } 3998 3999 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 4000 ret = add_tree_block(rc, &key, path, &blocks); 4001 } else if (rc->stage == UPDATE_DATA_PTRS && 4002 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4003 ret = add_data_references(rc, &key, path, &blocks); 4004 } else { 4005 btrfs_release_path(path); 4006 ret = 0; 4007 } 4008 if (ret < 0) { 4009 err = ret; 4010 break; 4011 } 4012 4013 if (!RB_EMPTY_ROOT(&blocks)) { 4014 ret = relocate_tree_blocks(trans, rc, &blocks); 4015 if (ret < 0) { 4016 if (ret != -EAGAIN) { 4017 err = ret; 4018 break; 4019 } 4020 rc->extents_found--; 4021 rc->search_start = key.objectid; 4022 } 4023 } 4024 4025 btrfs_end_transaction_throttle(trans); 4026 btrfs_btree_balance_dirty(fs_info); 4027 trans = NULL; 4028 4029 if (rc->stage == MOVE_DATA_EXTENTS && 4030 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4031 rc->found_file_extent = 1; 4032 ret = relocate_data_extent(rc->data_inode, 4033 &key, &rc->cluster); 4034 if (ret < 0) { 4035 err = ret; 4036 break; 4037 } 4038 } 4039 if 
(btrfs_should_cancel_balance(fs_info)) { 4040 err = -ECANCELED; 4041 break; 4042 } 4043 } 4044 if (trans && progress && err == -ENOSPC) { 4045 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 4046 if (ret == 1) { 4047 err = 0; 4048 progress = 0; 4049 goto restart; 4050 } 4051 } 4052 4053 btrfs_release_path(path); 4054 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 4055 4056 if (trans) { 4057 btrfs_end_transaction_throttle(trans); 4058 btrfs_btree_balance_dirty(fs_info); 4059 } 4060 4061 if (!err) { 4062 ret = relocate_file_extent_cluster(rc->data_inode, 4063 &rc->cluster); 4064 if (ret < 0) 4065 err = ret; 4066 } 4067 4068 rc->create_reloc_tree = 0; 4069 set_reloc_control(rc); 4070 4071 backref_cache_cleanup(&rc->backref_cache); 4072 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 4073 4074 /* 4075 * Even in the case when the relocation is cancelled, we should still go 4076 * through prepare_to_merge() and merge_reloc_roots(). 4077 * 4078 * For error (including cancelled balance), prepare_to_merge() will 4079 * mark all reloc trees orphan, then queue them for cleanup in 4080 * merge_reloc_roots(). 4081 */ 4082 err = prepare_to_merge(rc, err); 4083 4084 merge_reloc_roots(rc); 4085 4086 rc->merge_reloc_tree = 0; 4087 unset_reloc_control(rc); 4088 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 4089 4090 /* get rid of pinned extents */ 4091 trans = btrfs_join_transaction(rc->extent_root); 4092 if (IS_ERR(trans)) { 4093 err = PTR_ERR(trans); 4094 goto out_free; 4095 } 4096 btrfs_commit_transaction(trans); 4097 out_free: 4098 ret = clean_dirty_subvols(rc); 4099 if (ret < 0 && !err) 4100 err = ret; 4101 btrfs_free_block_rsv(fs_info, rc->block_rsv); 4102 btrfs_free_path(path); 4103 return err; 4104 } 4105 4106 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 4107 struct btrfs_root *root, u64 objectid) 4108 { 4109 struct btrfs_path *path; 4110 struct btrfs_inode_item *item; 4111 struct extent_buffer *leaf; 4112 int ret; 4113 4114 path = btrfs_alloc_path(); 4115 if (!path) 4116 return -ENOMEM; 4117 4118 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 4119 if (ret) 4120 goto out; 4121 4122 leaf = path->nodes[0]; 4123 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 4124 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 4125 btrfs_set_inode_generation(leaf, item, 1); 4126 btrfs_set_inode_size(leaf, item, 0); 4127 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 4128 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4129 BTRFS_INODE_PREALLOC); 4130 btrfs_mark_buffer_dirty(leaf); 4131 out: 4132 btrfs_free_path(path); 4133 return ret; 4134 } 4135 4136 /* 4137 * helper to create an inode for data relocation.
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_inode_item *item;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
	btrfs_set_inode_generation(leaf, item, 1);
	btrfs_set_inode_size(leaf, item, 0);
	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
					  BTRFS_INODE_PREALLOC);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper to create an inode for data relocation.
 * The inode lives in the data relocation tree and its link count is 0.
 */
static noinline_for_stack
struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_group *group)
{
	struct inode *inode = NULL;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root;
	struct btrfs_key key;
	u64 objectid;
	int err = 0;

	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
	if (IS_ERR(root))
		return ERR_CAST(root);

	trans = btrfs_start_transaction(root, 6);
	if (IS_ERR(trans)) {
		btrfs_put_root(root);
		return ERR_CAST(trans);
	}

	err = btrfs_find_free_objectid(root, &objectid);
	if (err)
		goto out;

	err = __insert_orphan_inode(trans, root, objectid);
	BUG_ON(err);

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, root);
	BUG_ON(IS_ERR(inode));
	BTRFS_I(inode)->index_cnt = group->start;

	err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
	btrfs_put_root(root);
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
	if (err) {
		if (inode)
			iput(inode);
		inode = ERR_PTR(err);
	}
	return inode;
}
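
/*
 * Worked example of the index_cnt mapping set up above (numbers are
 * illustrative only): for a block group starting at bytenr 1GiB,
 * index_cnt = 1GiB, so file offset 0 of the data reloc inode shadows the
 * extent at bytenr 1GiB, file offset 4KiB shadows bytenr 1GiB + 4KiB,
 * and so on. btrfs_reloc_clone_csums() below relies on exactly this
 * identity: disk_bytenr = file_pos + index_cnt.
 */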
static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
	struct reloc_control *rc;

	rc = kzalloc(sizeof(*rc), GFP_NOFS);
	if (!rc)
		return NULL;

	INIT_LIST_HEAD(&rc->reloc_roots);
	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
	btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
	mapping_tree_init(&rc->reloc_root_tree);
	extent_io_tree_init(fs_info, &rc->processed_blocks,
			    IO_TREE_RELOC_BLOCKS, NULL);
	return rc;
}

static void free_reloc_control(struct reloc_control *rc)
{
	struct mapping_node *node, *tmp;

	free_reloc_roots(&rc->reloc_roots);
	rbtree_postorder_for_each_entry_safe(node, tmp,
			&rc->reloc_root_tree.rb_root, rb_node)
		kfree(node);

	kfree(rc);
}

/*
 * Print the block group being relocated.
 */
static void describe_relocation(struct btrfs_fs_info *fs_info,
				struct btrfs_block_group *block_group)
{
	char buf[128] = {'\0'};

	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));

	btrfs_info(fs_info,
		   "relocating block group %llu flags %s",
		   block_group->start, buf);
}

static const char *stage_to_string(int stage)
{
	if (stage == MOVE_DATA_EXTENTS)
		return "move data extents";
	if (stage == UPDATE_DATA_PTRS)
		return "update data pointers";
	return "unknown";
}
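
/*
 * For reference, the messages built by the two helpers above show up in
 * the log along these lines (the bytenr, extent count and flags string
 * are illustrative and depend on the filesystem):
 *
 *   relocating block group 1103101952 flags data
 *   found 142 extents, stage: move data extents
 *
 * The second line is printed by btrfs_relocate_block_group() below.
 */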
/*
 * Function to relocate all extents in a block group.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(fs_info, rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX,
				 rc->block_group->start,
				 rc->block_group->length);

	while (1) {
		int finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents. If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block. Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found, stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
	btrfs_put_block_group(rc->block_group);
	free_reloc_control(rc);
	return err;
}
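
/*
 * The loop above drives the data relocation state machine: the first
 * pass runs with rc->stage == MOVE_DATA_EXTENTS, copying file data into
 * the data reloc inode. Once file extents have been seen, the stage
 * flips to UPDATE_DATA_PTRS and later passes rewrite the file extent
 * pointers to the new locations. Relocation of the group is complete
 * when a pass finds no more extents (rc->extents_found == 0).
 */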
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	root->root_item.drop_level = 0;
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}

/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with corresponding fs trees,
 * which is important for keeping the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = read_fs_root(fs_info,
					       reloc_root->root_key.offset);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	rc->extent_root = fs_info->extent_root;

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		err = __add_reloc_root(reloc_root);
		BUG_ON(err < 0); /* -ENOMEM or logic error */
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_clean;
	}
	err = btrfs_commit_transaction(trans);
out_clean:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_unset:
	unset_reloc_control(rc);
	free_reloc_control(rc);
out:
	if (!list_empty(&reloc_roots))
		free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* Cleanup orphan inode in the data relocation tree. */
		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
		} else {
			err = btrfs_orphan_cleanup(fs_root);
			btrfs_put_root(fs_root);
		}
	}
	return err;
}

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the checksums properly handles the nodatasum extents, and it
 * also saves the CPU time that would be spent recalculating them.
 */
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_sum *sums;
	struct btrfs_ordered_extent *ordered;
	int ret;
	u64 disk_bytenr;
	u64 new_bytenr;
	LIST_HEAD(list);

	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);

	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
				       disk_bytenr + len - 1, &list, 0);
	if (ret)
		goto out;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
		sums->bytenr = new_bytenr;

		btrfs_add_ordered_sum(ordered, sums);
	}
out:
	btrfs_put_ordered_extent(ordered);
	return ret;
}
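
/*
 * A worked example of the rebasing above (all numbers illustrative):
 * suppose the ordered extent was written at disk_bytenr 8GiB, the csum
 * lookup covered the source range starting at disk_bytenr 1GiB, and a
 * csum item was found for bytenr 1GiB + 64KiB. Then:
 *
 *   new_bytenr = 8GiB + (1GiB + 64KiB) - 1GiB = 8GiB + 64KiB
 *
 * i.e. each checksum keeps its relative offset within the extent while
 * being rebased from the old location onto the newly written one.
 */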
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * Called before creating a snapshot. It calculates the metadata
 * reservation required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees. The space used by
	 * merging a reloc tree is twice the size of the relocated tree
	 * nodes in the worst case: half for COWing the reloc tree, half
	 * for COWing the fs tree. The space used by COWing the reloc
	 * tree will be freed after the tree is dropped. If we create a
	 * snapshot, COWing the fs tree may use more space than it frees,
	 * so we need to reserve extra space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}
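
/*
 * Illustrative numbers for the reservation above: if 64 tree nodes of
 * 16KiB each have been relocated so far, rc->nodes_relocated is 1MiB
 * and the worst-case merge cost is 2 * 1MiB = 2MiB (1MiB for COWing the
 * reloc tree plus 1MiB for COWing the fs tree). The snapshot therefore
 * reserves an extra rc->nodes_relocated bytes here, covering the
 * fs-tree half, on top of its own needs.
 */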
/*
 * Called after the snapshot is created. It migrates the block
 * reservation and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(); we come out of here with
 * two references held on the reloc_root, one for root->reloc_root and
 * one for rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root,
				       new_root->root_key.objectid);
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	new_root->reloc_root = btrfs_grab_root(reloc_root);

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}