// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is utilized by resize (shrink only), profile converting, compacting
 * space, or the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents	| BG A: deleted
 *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
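/*
 * To make the stages above concrete, here is a condensed, illustrative
 * sketch of the driver loop (not compiled; the helper names are simplified
 * stand-ins for the real functions later in this file):
 *
 *	rc->stage = MOVE_DATA_EXTENTS;
 *	while (rc->search_start < block_group_end) {
 *		// step 2.1: find the next extent in the target block group
 *		ret = find_next_extent(rc, path, &key);
 *		if (extent_is_tree_block(&key))
 *			// step 2.2: feeds the reloc trees
 *			add_tree_block(rc, &key, path, &blocks);
 *		else if (rc->stage == MOVE_DATA_EXTENTS)
 *			// step 2.2: copy data into the data reloc inode
 *			relocate_data_extent(rc->data_inode, &key,
 *					     &rc->cluster);
 *	}
 *	// steps 2.3 and 3: swap in the reloc trees, then drop them
 *	merge_reloc_roots(rc);
 */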

/*
 * backref_node, mapping_node and tree_block start with this
 */
struct tree_entry {
	struct rb_node rb_node;
	u64 bytenr;
};

/*
 * represent a tree block in the backref cache
 */
struct backref_node {
	struct rb_node rb_node;
	u64 bytenr;

	u64 new_bytenr;
	/* objectid of tree block owner, may not be up to date */
	u64 owner;
	/* link to pending, changed or detached list */
	struct list_head list;
	/* list of upper level blocks that reference this block */
	struct list_head upper;
	/* list of child blocks in the cache */
	struct list_head lower;
	/* NULL if this node is not a tree root */
	struct btrfs_root *root;
	/* extent buffer obtained by COWing the block */
	struct extent_buffer *eb;
	/* level of tree block */
	unsigned int level:8;
	/* is the block in a non-reference-counted tree */
	unsigned int cowonly:1;
	/* 1 if no child node is in the cache */
	unsigned int lowest:1;
	/* is the extent buffer locked */
	unsigned int locked:1;
	/* has the block been processed */
	unsigned int processed:1;
	/* have backrefs of this block been checked */
	unsigned int checked:1;
	/*
	 * 1 if corresponding block has been cowed but some upper
	 * level block pointers may not point to the new location
	 */
	unsigned int pending:1;
	/*
	 * 1 if the backref node isn't connected to any other
	 * backref node.
	 */
	unsigned int detached:1;
};

/*
 * represent a block pointer in the backref cache
 */
struct backref_edge {
	struct list_head list[2];
	struct backref_node *node[2];
};

#define LOWER	0
#define UPPER	1
#define RELOCATION_RESERVED_NODES	256
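/*
 * Illustrative sketch (not compiled): how an edge ties two nodes together.
 * An edge hangs off the lower node's 'upper' list via list[LOWER] and off
 * the upper node's 'lower' list via list[UPPER]:
 *
 *	edge->node[LOWER] = lower;
 *	edge->node[UPPER] = upper;
 *	list_add_tail(&edge->list[LOWER], &lower->upper);
 *	list_add_tail(&edge->list[UPPER], &upper->lower);
 *
 * Walking from a block to its parents therefore iterates node->upper and
 * maps each element back with list_entry(..., struct backref_edge,
 * list[LOWER]), as walk_up_backref() below does.
 */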
struct backref_cache {
	/* red black tree of all backref nodes in the cache */
	struct rb_root rb_root;
	/* for passing backref nodes to btrfs_reloc_cow_block */
	struct backref_node *path[BTRFS_MAX_LEVEL];
	/*
	 * list of blocks that have been cowed but some block
	 * pointers in upper level blocks may not reflect the
	 * new location
	 */
	struct list_head pending[BTRFS_MAX_LEVEL];
	/* list of backref nodes with no child node */
	struct list_head leaves;
	/* list of blocks that have been cowed in current transaction */
	struct list_head changed;
	/* list of detached backref nodes */
	struct list_head detached;

	u64 last_trans;

	int nr_nodes;
	int nr_edges;
};

/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct rb_node rb_node;
	u64 bytenr;
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * represent a tree block to process
 */
struct tree_block {
	struct rb_node rb_node;
	u64 bytenr;
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks that have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

static void remove_backref_node(struct backref_cache *cache,
				struct backref_node *node);
static void __mark_block_processed(struct reloc_control *rc,
				   struct backref_node *node);

static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

static void backref_cache_init(struct backref_cache *cache)
{
	int i;
	cache->rb_root = RB_ROOT;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&cache->pending[i]);
	INIT_LIST_HEAD(&cache->changed);
	INIT_LIST_HEAD(&cache->detached);
	INIT_LIST_HEAD(&cache->leaves);
}

static void backref_cache_cleanup(struct backref_cache *cache)
{
	struct backref_node *node;
	int i;

	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct backref_node, lower);
		remove_backref_node(cache, node);
	}

	cache->last_trans = 0;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		ASSERT(list_empty(&cache->pending[i]));
	ASSERT(list_empty(&cache->changed));
	ASSERT(list_empty(&cache->detached));
	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
	ASSERT(!cache->nr_nodes);
	ASSERT(!cache->nr_edges);
}
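/*
 * Illustrative lifecycle (not compiled): a reloc_control embeds one cache,
 * which lives for a whole pass over the block group; 'block' stands in for
 * a struct tree_block picked up during step 2.1:
 *
 *	backref_cache_init(&rc->backref_cache);
 *	// for every tree block recorded in the target block group:
 *	node = build_backref_tree(rc, &block->key, block->level,
 *				  block->bytenr);
 *	...
 *	backref_cache_cleanup(&rc->backref_cache);
 */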
static struct backref_node *alloc_backref_node(struct backref_cache *cache)
{
	struct backref_node *node;

	node = kzalloc(sizeof(*node), GFP_NOFS);
	if (node) {
		INIT_LIST_HEAD(&node->list);
		INIT_LIST_HEAD(&node->upper);
		INIT_LIST_HEAD(&node->lower);
		RB_CLEAR_NODE(&node->rb_node);
		cache->nr_nodes++;
	}
	return node;
}

static void free_backref_node(struct backref_cache *cache,
			      struct backref_node *node)
{
	if (node) {
		cache->nr_nodes--;
		btrfs_put_root(node->root);
		kfree(node);
	}
}

static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
{
	struct backref_edge *edge;

	edge = kzalloc(sizeof(*edge), GFP_NOFS);
	if (edge)
		cache->nr_edges++;
	return edge;
}

static void free_backref_edge(struct backref_cache *cache,
			      struct backref_edge *edge)
{
	if (edge) {
		cache->nr_edges--;
		kfree(edge);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (bytenr < entry->bytenr)
			p = &(*p)->rb_left;
		else if (bytenr > entry->bytenr)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
{
	struct rb_node *n = root->rb_node;
	struct tree_entry *entry;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return n;
	}
	return NULL;
}

static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = NULL;
	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
					      rb_node);
	if (bnode->root)
		fs_info = bnode->root->fs_info;
	btrfs_panic(fs_info, errno,
		    "Inconsistency in backref cache found at offset %llu",
		    bytenr);
}

/*
 * walk up backref nodes until we reach a node that represents a tree root
 */
static struct backref_node *walk_up_backref(struct backref_node *node,
					    struct backref_edge *edges[],
					    int *index)
{
	struct backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct backref_node *walk_down_backref(struct backref_edge *edges[],
					      int *index)
{
	struct backref_edge *edge;
	struct backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
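/*
 * Illustrative traversal pattern (not compiled, simplified from later users
 * of these helpers; process_root() is a hypothetical stand-in): together the
 * two walkers enumerate every path from a block up to the roots that
 * reference it, reusing edges[] as the path stack:
 *
 *	int index = 0;
 *	struct backref_node *next = walk_up_backref(node, edges, &index);
 *
 *	while (next) {
 *		// next represents a tree root; edges[0..index-1] is the path
 *		process_root(next->root);
 *		next = walk_down_backref(edges, &index);
 *		if (next)
 *			next = walk_up_backref(next, edges, &index);
 *	}
 */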
static void unlock_node_buffer(struct backref_node *node)
{
	if (node->locked) {
		btrfs_tree_unlock(node->eb);
		node->locked = 0;
	}
}

static void drop_node_buffer(struct backref_node *node)
{
	if (node->eb) {
		unlock_node_buffer(node);
		free_extent_buffer(node->eb);
		node->eb = NULL;
	}
}

static void drop_backref_node(struct backref_cache *tree,
			      struct backref_node *node)
{
	BUG_ON(!list_empty(&node->upper));

	drop_node_buffer(node);
	list_del(&node->list);
	list_del(&node->lower);
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	free_backref_node(tree, node);
}

/*
 * remove a backref node from the backref cache
 */
static void remove_backref_node(struct backref_cache *cache,
				struct backref_node *node)
{
	struct backref_node *upper;
	struct backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		free_backref_edge(cache, edge);

		if (RB_EMPTY_NODE(&upper->rb_node)) {
			BUG_ON(!list_empty(&node->upper));
			drop_backref_node(cache, node);
			node = upper;
			node->lowest = 1;
			continue;
		}
		/*
		 * add the node to the leaf node list if no other
		 * child block is cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	drop_backref_node(cache, node);
}

static void update_backref_node(struct backref_cache *cache,
				struct backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;
	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, bytenr);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct backref_cache *cache)
{
	struct backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * detached nodes are used to avoid unnecessary backref
	 * lookup. A transaction commit changes the extent tree,
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * some nodes can be left in the pending list if there were
	 * errors while processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}
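/*
 * Illustrative call site (not compiled, simplified): relocation restarts its
 * transaction many times, and the main loop re-validates the cache at each
 * restart; a non-zero return means detached nodes were dropped and changed
 * nodes were re-keyed to their new bytenrs:
 *
 *	trans = btrfs_start_transaction(rc->extent_root, 0);
 *	if (update_backref_cache(trans, &rc->backref_cache)) {
 *		btrfs_end_transaction(trans);
 *		continue;	// restart the search with a fresh cache view
 *	}
 */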
static bool reloc_root_is_dead(struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree is considered dead after the swap, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But should_ignore_root() below is a special case.
 */
static bool have_reloc_root(struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

static int should_ignore_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * if there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return 1;
}

/*
 * find reloc tree by address of tree root
 */
static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
					  u64 bytenr)
{
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = (struct btrfs_root *)node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
				       u64 root_objectid)
{
	struct btrfs_key key;

	key.objectid = root_objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	return btrfs_get_fs_root(fs_info, &key, false);
}

static noinline_for_stack
int find_inline_backref(struct extent_buffer *leaf, int slot,
			unsigned long *ptr, unsigned long *end)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	u32 item_size;

	btrfs_item_key_to_cpu(leaf, &key, slot);

	item_size = btrfs_item_size_nr(leaf, slot);
	if (item_size < sizeof(*ei)) {
		btrfs_print_v0_err(leaf->fs_info);
		btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL);
		return 1;
	}
	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
		  BTRFS_EXTENT_FLAG_TREE_BLOCK));

	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
	    item_size <= sizeof(*ei) + sizeof(*bi)) {
		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
		return 1;
	}
	if (key.type == BTRFS_METADATA_ITEM_KEY &&
	    item_size <= sizeof(*ei)) {
		WARN_ON(item_size < sizeof(*ei));
		return 1;
	}

	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		*ptr = (unsigned long)(bi + 1);
	} else {
		*ptr = (unsigned long)(ei + 1);
	}
	*end = (unsigned long)ei + item_size;
	return 0;
}
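/*
 * Illustrative usage (not compiled): callers iterate an extent item's
 * inline references through the [ptr, end) window filled in above, exactly
 * as build_backref_tree() does below:
 *
 *	ret = find_inline_backref(leaf, slot, &ptr, &end);
 *	while (!ret && ptr < end) {
 *		struct btrfs_extent_inline_ref *iref;
 *		int type;
 *
 *		iref = (struct btrfs_extent_inline_ref *)ptr;
 *		type = btrfs_get_extent_inline_ref_type(leaf, iref,
 *							BTRFS_REF_TYPE_BLOCK);
 *		// ... handle TREE_BLOCK_REF / SHARED_BLOCK_REF ...
 *		ptr += btrfs_extent_inline_ref_size(type);
 *	}
 */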
/*
 * build a backref tree for a given tree block. the root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * the basic idea of this function is to check backrefs of a given block
 * to find upper level blocks that reference the block, and then check
 * backrefs of these upper level blocks recursively. the recursion stops
 * when a tree root is reached or the backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs
 * for all upper level blocks that directly/indirectly reference the
 * block are also cached.
 */
static noinline_for_stack
struct backref_node *build_backref_tree(struct reloc_control *rc,
					struct btrfs_key *node_key,
					int level, u64 bytenr)
{
	struct backref_cache *cache = &rc->backref_cache;
	struct btrfs_path *path1; /* For searching extent root */
	struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
	struct extent_buffer *eb;
	struct btrfs_root *root;
	struct backref_node *cur;
	struct backref_node *upper;
	struct backref_node *lower;
	struct backref_node *node = NULL;
	struct backref_node *exist = NULL;
	struct backref_edge *edge;
	struct rb_node *rb_node;
	struct btrfs_key key;
	unsigned long end;
	unsigned long ptr;
	LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
	LIST_HEAD(useless);
	int cowonly;
	int ret;
	int err = 0;
	bool need_check = true;

	path1 = btrfs_alloc_path();
	path2 = btrfs_alloc_path();
	if (!path1 || !path2) {
		err = -ENOMEM;
		goto out;
	}

	node = alloc_backref_node(cache);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->bytenr = bytenr;
	node->level = level;
	node->lowest = 1;
	cur = node;
again:
	end = 0;
	ptr = 0;
	key.objectid = cur->bytenr;
	key.type = BTRFS_METADATA_ITEM_KEY;
	key.offset = (u64)-1;

	path1->search_commit_root = 1;
	path1->skip_locking = 1;
	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
				0, 0);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	ASSERT(ret);
	ASSERT(path1->slots[0]);

	path1->slots[0]--;

	WARN_ON(cur->checked);
	if (!list_empty(&cur->upper)) {
		/*
		 * the backref was added previously when processing
		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
		 */
		ASSERT(list_is_singular(&cur->upper));
		edge = list_entry(cur->upper.next, struct backref_edge,
				  list[LOWER]);
		ASSERT(list_empty(&edge->list[UPPER]));
		exist = edge->node[UPPER];
		/*
		 * add the upper level block to the pending list if we need
		 * to check its backrefs
		 */
		if (!exist->checked)
			list_add_tail(&edge->list[UPPER], &list);
	} else {
		exist = NULL;
	}

	while (1) {
		cond_resched();
		eb = path1->nodes[0];

		if (ptr >= end) {
			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
				ret = btrfs_next_leaf(rc->extent_root, path1);
				if (ret < 0) {
					err = ret;
					goto out;
				}
				if (ret > 0)
					break;
				eb = path1->nodes[0];
			}

			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
			if (key.objectid != cur->bytenr) {
				WARN_ON(exist);
				break;
			}

			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
			    key.type == BTRFS_METADATA_ITEM_KEY) {
				ret = find_inline_backref(eb, path1->slots[0],
							  &ptr, &end);
				if (ret)
					goto next;
			}
		}

		if (ptr < end) {
			/* update key for inline backref */
			struct btrfs_extent_inline_ref *iref;
			int type;
			iref = (struct btrfs_extent_inline_ref *)ptr;
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID) {
				err = -EUCLEAN;
				goto out;
			}
			key.type = type;
			key.offset = btrfs_extent_inline_ref_offset(eb, iref);

			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
		}

		/*
		 * Parent node found and matches current inline ref, no need to
		 * rebuild this node for this inline ref.
		 */
		if (exist &&
		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
		      exist->owner == key.offset) ||
		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
		      exist->bytenr == key.offset))) {
			exist = NULL;
			goto next;
		}

		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
			if (key.objectid == key.offset) {
				/*
				 * Only root blocks of reloc trees use a
				 * backref pointing to itself.
				 */
				root = find_reloc_root(rc, cur->bytenr);
				ASSERT(root);
				cur->root = root;
				break;
			}

			edge = alloc_backref_edge(cache);
			if (!edge) {
				err = -ENOMEM;
				goto out;
			}
			rb_node = tree_search(&cache->rb_root, key.offset);
			if (!rb_node) {
				upper = alloc_backref_node(cache);
				if (!upper) {
					free_backref_edge(cache, edge);
					err = -ENOMEM;
					goto out;
				}
				upper->bytenr = key.offset;
				upper->level = cur->level + 1;
				/*
				 * backrefs for the upper level block aren't
				 * cached, add the block to the pending list
				 */
				list_add_tail(&edge->list[UPPER], &list);
			} else {
				upper = rb_entry(rb_node, struct backref_node,
						 rb_node);
				ASSERT(upper->checked);
				INIT_LIST_HEAD(&edge->list[UPPER]);
			}
			list_add_tail(&edge->list[LOWER], &cur->upper);
			edge->node[LOWER] = cur;
			edge->node[UPPER] = upper;

			goto next;
		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
			err = -EINVAL;
			btrfs_print_v0_err(rc->extent_root->fs_info);
			btrfs_handle_fs_error(rc->extent_root->fs_info, err,
					      NULL);
			goto out;
		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
			goto next;
		}

		/*
		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
		 * means the root objectid. We need to search the tree to get
		 * its parent bytenr.
		 */
		root = read_fs_root(rc->extent_root->fs_info, key.offset);
		if (IS_ERR(root)) {
			err = PTR_ERR(root);
			goto out;
		}

		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
			cur->cowonly = 1;

		if (btrfs_root_level(&root->root_item) == cur->level) {
			/* tree root */
			ASSERT(btrfs_root_bytenr(&root->root_item) ==
			       cur->bytenr);
			if (should_ignore_root(root)) {
				btrfs_put_root(root);
				list_add(&cur->list, &useless);
			} else {
				cur->root = root;
			}
			break;
		}

		level = cur->level + 1;

		/* Search the tree to find parent blocks referring to the block. */
		path2->search_commit_root = 1;
		path2->skip_locking = 1;
		path2->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
		path2->lowest_level = 0;
		if (ret < 0) {
			btrfs_put_root(root);
			err = ret;
			goto out;
		}
		if (ret > 0 && path2->slots[level] > 0)
			path2->slots[level]--;

		eb = path2->nodes[level];
		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
		    cur->bytenr) {
			btrfs_err(root->fs_info,
	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
				  cur->bytenr, level - 1,
				  root->root_key.objectid,
				  node_key->objectid, node_key->type,
				  node_key->offset);
			btrfs_put_root(root);
			err = -ENOENT;
			goto out;
		}
		lower = cur;
		need_check = true;

		/* Add all nodes and edges in the path */
		for (; level < BTRFS_MAX_LEVEL; level++) {
			if (!path2->nodes[level]) {
				ASSERT(btrfs_root_bytenr(&root->root_item) ==
				       lower->bytenr);
				if (should_ignore_root(root)) {
					btrfs_put_root(root);
					list_add(&lower->list, &useless);
				} else {
					lower->root = root;
				}
				break;
			}

			edge = alloc_backref_edge(cache);
			if (!edge) {
				btrfs_put_root(root);
				err = -ENOMEM;
				goto out;
			}

			eb = path2->nodes[level];
			rb_node = tree_search(&cache->rb_root, eb->start);
			if (!rb_node) {
				upper = alloc_backref_node(cache);
				if (!upper) {
					btrfs_put_root(root);
					free_backref_edge(cache, edge);
					err = -ENOMEM;
					goto out;
				}
				upper->bytenr = eb->start;
				upper->owner = btrfs_header_owner(eb);
				upper->level = lower->level + 1;
				if (!test_bit(BTRFS_ROOT_REF_COWS,
					      &root->state))
					upper->cowonly = 1;

				/*
				 * if we know the block isn't shared
				 * we can avoid checking its backrefs.
				 */
				if (btrfs_block_can_be_shared(root, eb))
					upper->checked = 0;
				else
					upper->checked = 1;

				/*
				 * add the block to the pending list if we
				 * need to check its backrefs, we only do this
				 * once while walking up a tree as we will
				 * catch anything else later on.
				 */
				if (!upper->checked && need_check) {
					need_check = false;
					list_add_tail(&edge->list[UPPER],
						      &list);
				} else {
					if (upper->checked)
						need_check = true;
					INIT_LIST_HEAD(&edge->list[UPPER]);
				}
			} else {
				upper = rb_entry(rb_node, struct backref_node,
						 rb_node);
				ASSERT(upper->checked);
				INIT_LIST_HEAD(&edge->list[UPPER]);
				if (!upper->owner)
					upper->owner = btrfs_header_owner(eb);
			}
			list_add_tail(&edge->list[LOWER], &lower->upper);
			edge->node[LOWER] = lower;
			edge->node[UPPER] = upper;

			if (rb_node) {
				btrfs_put_root(root);
				break;
			}
			lower = upper;
			upper = NULL;
		}
		btrfs_release_path(path2);
next:
		if (ptr < end) {
			ptr += btrfs_extent_inline_ref_size(key.type);
			if (ptr >= end) {
				WARN_ON(ptr > end);
				ptr = 0;
				end = 0;
			}
		}
		if (ptr >= end)
			path1->slots[0]++;
	}
	btrfs_release_path(path1);

	cur->checked = 1;
	WARN_ON(exist);

	/* the pending list isn't empty, take the first block to process */
	if (!list_empty(&list)) {
		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		cur = edge->node[UPPER];
		goto again;
	}

	/*
	 * everything went well, connect backref nodes and insert backref nodes
	 * into the cache.
	 */
	ASSERT(node->checked);
	cowonly = node->cowonly;
	if (!cowonly) {
		rb_node = tree_insert(&cache->rb_root, node->bytenr,
				      &node->rb_node);
		if (rb_node)
			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
		list_add_tail(&node->lower, &cache->leaves);
	}

	list_for_each_entry(edge, &node->upper, list[LOWER])
		list_add_tail(&edge->list[UPPER], &list);

	while (!list_empty(&list)) {
		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		upper = edge->node[UPPER];
		if (upper->detached) {
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			free_backref_edge(cache, edge);
			if (list_empty(&lower->upper))
				list_add(&lower->list, &useless);
			continue;
		}

		if (!RB_EMPTY_NODE(&upper->rb_node)) {
			if (upper->lowest) {
				list_del_init(&upper->lower);
				upper->lowest = 0;
			}

			list_add_tail(&edge->list[UPPER], &upper->lower);
			continue;
		}

		if (!upper->checked) {
			/*
			 * Still want to blow up for developers since this is a
			 * logic bug.
			 */
			ASSERT(0);
			err = -EINVAL;
			goto out;
		}
		if (cowonly != upper->cowonly) {
			ASSERT(0);
			err = -EINVAL;
			goto out;
		}

		if (!cowonly) {
			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
					      &upper->rb_node);
			if (rb_node)
				backref_tree_panic(rb_node, -EEXIST,
						   upper->bytenr);
		}

		list_add_tail(&edge->list[UPPER], &upper->lower);

		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER], &list);
	}
	/*
	 * process useless backref nodes. backref nodes for tree leaves
	 * are deleted from the cache. backref nodes for upper level
	 * tree blocks are left in the cache to avoid unnecessary backref
	 * lookup.
	 */
	while (!list_empty(&useless)) {
		upper = list_entry(useless.next, struct backref_node, list);
		list_del_init(&upper->list);
		ASSERT(list_empty(&upper->upper));
		if (upper == node)
			node = NULL;
		if (upper->lowest) {
			list_del_init(&upper->lower);
			upper->lowest = 0;
		}
		while (!list_empty(&upper->lower)) {
			edge = list_entry(upper->lower.next,
					  struct backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			free_backref_edge(cache, edge);

			if (list_empty(&lower->upper))
				list_add(&lower->list, &useless);
		}
		__mark_block_processed(rc, upper);
		if (upper->level > 0) {
			list_add(&upper->list, &cache->detached);
			upper->detached = 1;
		} else {
			rb_erase(&upper->rb_node, &cache->rb_root);
			free_backref_node(cache, upper);
		}
	}
out:
	btrfs_free_path(path1);
	btrfs_free_path(path2);
	if (err) {
		while (!list_empty(&useless)) {
			lower = list_entry(useless.next,
					   struct backref_node, list);
			list_del_init(&lower->list);
		}
		while (!list_empty(&list)) {
			edge = list_first_entry(&list, struct backref_edge,
						list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			upper = edge->node[UPPER];
			free_backref_edge(cache, edge);

			/*
			 * Lower is no longer linked to any upper backref nodes
			 * and isn't in the cache, we can free it ourselves.
			 */
			if (list_empty(&lower->upper) &&
			    RB_EMPTY_NODE(&lower->rb_node))
				list_add(&lower->list, &useless);

			if (!RB_EMPTY_NODE(&upper->rb_node))
				continue;

			/* Add this guy's upper edges to the list to process */
			list_for_each_entry(edge, &upper->upper, list[LOWER])
				list_add_tail(&edge->list[UPPER], &list);
			if (list_empty(&upper->upper))
				list_add(&upper->list, &useless);
		}

		while (!list_empty(&useless)) {
			lower = list_entry(useless.next,
					   struct backref_node, list);
			list_del_init(&lower->list);
			if (lower == node)
				node = NULL;
			free_backref_node(cache, lower);
		}

		remove_backref_node(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	return node;
}

/*
 * helper to add a backref node for the newly created snapshot.
 * the backref node is created by cloning the backref node that
 * corresponds to the root of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct backref_cache *cache = &rc->backref_cache;
	struct backref_node *node = NULL;
	struct backref_node *new_node;
	struct backref_edge *edge;
	struct backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = tree_search(&cache->rb_root,
				      reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = alloc_backref_node(cache);
	if (!new_node)
		return -ENOMEM;

	new_node->bytenr = dest->node->start;
	new_node->level = node->level;
	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = alloc_backref_edge(cache);
			if (!new_edge)
				goto fail;

			new_edge->node[UPPER] = new_node;
			new_edge->node[LOWER] = edge->node[LOWER];
			list_add_tail(&new_edge->list[UPPER],
				      &new_node->lower);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
			      &new_node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		free_backref_edge(cache, new_edge);
	}
	free_backref_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
			      node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_panic(fs_info, -EEXIST,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}
/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = tree_search(&rc->reloc_root_tree.rb_root,
				      root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		if (!node)
			return;
		BUG_ON((struct btrfs_root *)node->data != root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There are a
	 * lot of places where the pattern is to splice the rc->reloc_roots,
	 * process the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
			      root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
			      node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
	return 0;
}
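/*
 * Illustrative mapping lifecycle (not compiled): the rb-tree in
 * rc->reloc_root_tree is keyed by the reloc root's *commit root* bytenr,
 * so the helpers above are used roughly like this:
 *
 *	__add_reloc_root(reloc_root);		// on creation, keyed by
 *						// commit_root->start
 *	root = find_reloc_root(rc, bytenr);	// lookups while caching
 *						// backrefs
 *	__update_reloc_root(reloc_root);	// re-key to node->start once
 *						// the root node was COWed
 *	__del_reloc_root(reloc_root);		// when the reloc tree dies
 *
 * btrfs_update_reloc_root() below performs the re-key whenever
 * reloc_root->commit_root != reloc_root->node.
 */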
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	BUG_ON(!root_item);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root so that ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) both when the relocation root is
		 * created inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
	}

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		root_item->drop_level = 0;
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	BUG_ON(ret);
	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	BUG_ON(IS_ERR(reloc_root));
	set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
}

/*
 * create a reloc tree for a given fs tree. the reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for
	 * the reloc root. This means that we have to do this for the entire
	 * life of the reloc root, regardless of which stage of the relocation
	 * we are in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		goto out;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	BUG_ON(ret);
	btrfs_put_root(reloc_root);
out:
	return 0;
}

/*
 * helper to find the first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}
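/*
 * Illustrative iteration (not compiled): scan all in-memory inodes of a
 * subvolume in ascending inode-number order, the way
 * invalidate_extent_cache() below does; min_objectid is a hypothetical
 * starting point:
 *
 *	u64 objectid = min_objectid;
 *	struct inode *inode;
 *
 *	while ((inode = find_next_inode(root, objectid))) {
 *		objectid = btrfs_ino(BTRFS_I(inode)) + 1;
 *		// ... work on the inode ...
 *		iput(inode);
 *	}
 */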
static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group)
{
	if (bytenr >= block_group->start &&
	    bytenr < block_group->start + block_group->length)
		return 1;
	return 0;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
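/*
 * Illustrative note (not compiled): the data reloc inode's file offsets
 * mirror the old bytenrs shifted by index_cnt, so mapping an old extent to
 * its relocated copy is a plain file-extent lookup:
 *
 *	u64 new_bytenr;
 *	int ret = get_new_location(rc->data_inode, &new_bytenr,
 *				   old_bytenr, num_bytes);
 *	if (ret == 0)
 *		// new_bytenr is where the data now lives
 *
 * replace_file_extents() below relies on exactly this to rewrite file
 * extent items in place.
 */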
/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_block_group(bytenr, rc->block_group))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for
		 * readpage to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end);
				if (!ret)
					continue;

				btrfs_drop_extent_cache(BTRFS_I(inode),
						key.offset, end, 1);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	btrfs_set_lock_blocking_write(eb);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
		BUG_ON(ret);
	}
	btrfs_set_lock_blocking_write(eb);

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		struct btrfs_key first_key;

		level = btrfs_header_level(parent);
		BUG_ON(level < lowest_level);

		ret = btrfs_bin_search(parent, &key, level, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(parent, &first_key, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
					     level - 1, &first_key);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			} else if (!extent_buffer_uptodate(eb)) {
				ret = -EIO;
				free_extent_buffer(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb);
				BUG_ON(ret);
			}
			btrfs_set_lock_blocking_write(eb);

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		path->lowest_level = 0;
		BUG_ON(ret);

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find the next relocated block in the reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}
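/*
 * Illustrative walking pattern (not compiled, simplified): the merge loop
 * later in this file pairs the two walkers to visit every subtree of the
 * reloc tree that was modified since last_snapshot, feeding each one to
 * replace_path():
 *
 *	while (1) {
 *		ret = walk_down_reloc_tree(reloc_root, path, &level);
 *		if (ret)	// nothing newer than last_snapshot left
 *			break;
 *		ret = replace_path(trans, rc, root, reloc_root, path,
 *				   &next_key, level, max_level);
 *		...
 *		ret = walk_up_reloc_tree(reloc_root, path, &level);
 *		if (ret)
 *			break;
 *	}
 */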
2135 struct btrfs_fs_info *fs_info = root->fs_info; 2136 struct extent_buffer *eb = NULL; 2137 int i; 2138 u64 bytenr; 2139 u64 ptr_gen = 0; 2140 u64 last_snapshot; 2141 u32 nritems; 2142 2143 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2144 2145 for (i = *level; i > 0; i--) { 2146 struct btrfs_key first_key; 2147 2148 eb = path->nodes[i]; 2149 nritems = btrfs_header_nritems(eb); 2150 while (path->slots[i] < nritems) { 2151 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); 2152 if (ptr_gen > last_snapshot) 2153 break; 2154 path->slots[i]++; 2155 } 2156 if (path->slots[i] >= nritems) { 2157 if (i == *level) 2158 break; 2159 *level = i + 1; 2160 return 0; 2161 } 2162 if (i == 1) { 2163 *level = i; 2164 return 0; 2165 } 2166 2167 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2168 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); 2169 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, 2170 &first_key); 2171 if (IS_ERR(eb)) { 2172 return PTR_ERR(eb); 2173 } else if (!extent_buffer_uptodate(eb)) { 2174 free_extent_buffer(eb); 2175 return -EIO; 2176 } 2177 BUG_ON(btrfs_header_level(eb) != i - 1); 2178 path->nodes[i - 1] = eb; 2179 path->slots[i - 1] = 0; 2180 } 2181 return 1; 2182 } 2183 2184 /* 2185 * invalidate extent cache for file extents whose key in range of 2186 * [min_key, max_key) 2187 */ 2188 static int invalidate_extent_cache(struct btrfs_root *root, 2189 struct btrfs_key *min_key, 2190 struct btrfs_key *max_key) 2191 { 2192 struct btrfs_fs_info *fs_info = root->fs_info; 2193 struct inode *inode = NULL; 2194 u64 objectid; 2195 u64 start, end; 2196 u64 ino; 2197 2198 objectid = min_key->objectid; 2199 while (1) { 2200 cond_resched(); 2201 iput(inode); 2202 2203 if (objectid > max_key->objectid) 2204 break; 2205 2206 inode = find_next_inode(root, objectid); 2207 if (!inode) 2208 break; 2209 ino = btrfs_ino(BTRFS_I(inode)); 2210 2211 if (ino > max_key->objectid) { 2212 iput(inode); 2213 break; 2214 } 2215 2216 objectid = ino + 1; 2217 if (!S_ISREG(inode->i_mode)) 2218 continue; 2219 2220 if (unlikely(min_key->objectid == ino)) { 2221 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 2222 continue; 2223 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 2224 start = 0; 2225 else { 2226 start = min_key->offset; 2227 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); 2228 } 2229 } else { 2230 start = 0; 2231 } 2232 2233 if (unlikely(max_key->objectid == ino)) { 2234 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 2235 continue; 2236 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 2237 end = (u64)-1; 2238 } else { 2239 if (max_key->offset == 0) 2240 continue; 2241 end = max_key->offset; 2242 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 2243 end--; 2244 } 2245 } else { 2246 end = (u64)-1; 2247 } 2248 2249 /* the lock_extent waits for readpage to complete */ 2250 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2251 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); 2252 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2253 } 2254 return 0; 2255 } 2256 2257 static int find_next_key(struct btrfs_path *path, int level, 2258 struct btrfs_key *key) 2259 2260 { 2261 while (level < BTRFS_MAX_LEVEL) { 2262 if (!path->nodes[level]) 2263 break; 2264 if (path->slots[level] + 1 < 2265 btrfs_header_nritems(path->nodes[level])) { 2266 btrfs_node_key_to_cpu(path->nodes[level], key, 2267 path->slots[level] + 1); 2268 return 0; 2269 } 2270 level++; 2271 } 2272 return 1; 2273 } 2274 2275 /* 2276 * Insert current subvolume into reloc_control::dirty_subvol_roots 2277 */ 2278 static void 
insert_dirty_subvol(struct btrfs_trans_handle *trans, 2279 struct reloc_control *rc, 2280 struct btrfs_root *root) 2281 { 2282 struct btrfs_root *reloc_root = root->reloc_root; 2283 struct btrfs_root_item *reloc_root_item; 2284 2285 /* @root must be a subvolume tree root with a valid reloc tree */ 2286 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 2287 ASSERT(reloc_root); 2288 2289 reloc_root_item = &reloc_root->root_item; 2290 memset(&reloc_root_item->drop_progress, 0, 2291 sizeof(reloc_root_item->drop_progress)); 2292 reloc_root_item->drop_level = 0; 2293 btrfs_set_root_refs(reloc_root_item, 0); 2294 btrfs_update_reloc_root(trans, root); 2295 2296 if (list_empty(&root->reloc_dirty_list)) { 2297 btrfs_grab_root(root); 2298 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 2299 } 2300 } 2301 2302 static int clean_dirty_subvols(struct reloc_control *rc) 2303 { 2304 struct btrfs_root *root; 2305 struct btrfs_root *next; 2306 int ret = 0; 2307 int ret2; 2308 2309 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 2310 reloc_dirty_list) { 2311 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 2312 /* Merged subvolume, cleanup its reloc root */ 2313 struct btrfs_root *reloc_root = root->reloc_root; 2314 2315 list_del_init(&root->reloc_dirty_list); 2316 root->reloc_root = NULL; 2317 /* 2318 * Need barrier to ensure clear_bit() only happens after 2319 * root->reloc_root = NULL. Pairs with have_reloc_root. 2320 */ 2321 smp_wmb(); 2322 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 2323 if (reloc_root) { 2324 /* 2325 * btrfs_drop_snapshot drops our ref we hold for 2326 * ->reloc_root. If it fails however we must 2327 * drop the ref ourselves. 2328 */ 2329 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); 2330 if (ret2 < 0) { 2331 btrfs_put_root(reloc_root); 2332 if (!ret) 2333 ret = ret2; 2334 } 2335 } 2336 btrfs_put_root(root); 2337 } else { 2338 /* Orphan reloc tree, just clean it up */ 2339 ret2 = btrfs_drop_snapshot(root, 0, 1); 2340 if (ret2 < 0) { 2341 btrfs_put_root(root); 2342 if (!ret) 2343 ret = ret2; 2344 } 2345 } 2346 } 2347 return ret; 2348 } 2349 2350 /* 2351 * merge the relocated tree blocks in reloc tree with corresponding 2352 * fs tree. 
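* this is done by repeatedly walking down the reloc tree to the lowest relocated block and swapping it into the fs tree via replace_path(), saving progress in the root item's drop_progress so an interrupted merge can be resumed.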
2353 */ 2354 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 2355 struct btrfs_root *root) 2356 { 2357 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2358 struct btrfs_key key; 2359 struct btrfs_key next_key; 2360 struct btrfs_trans_handle *trans = NULL; 2361 struct btrfs_root *reloc_root; 2362 struct btrfs_root_item *root_item; 2363 struct btrfs_path *path; 2364 struct extent_buffer *leaf; 2365 int level; 2366 int max_level; 2367 int replaced = 0; 2368 int ret; 2369 int err = 0; 2370 u32 min_reserved; 2371 2372 path = btrfs_alloc_path(); 2373 if (!path) 2374 return -ENOMEM; 2375 path->reada = READA_FORWARD; 2376 2377 reloc_root = root->reloc_root; 2378 root_item = &reloc_root->root_item; 2379 2380 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 2381 level = btrfs_root_level(root_item); 2382 atomic_inc(&reloc_root->node->refs); 2383 path->nodes[level] = reloc_root->node; 2384 path->slots[level] = 0; 2385 } else { 2386 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 2387 2388 level = root_item->drop_level; 2389 BUG_ON(level == 0); 2390 path->lowest_level = level; 2391 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 2392 path->lowest_level = 0; 2393 if (ret < 0) { 2394 btrfs_free_path(path); 2395 return ret; 2396 } 2397 2398 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 2399 path->slots[level]); 2400 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 2401 2402 btrfs_unlock_up_safe(path, 0); 2403 } 2404 2405 min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2406 memset(&next_key, 0, sizeof(next_key)); 2407 2408 while (1) { 2409 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 2410 BTRFS_RESERVE_FLUSH_ALL); 2411 if (ret) { 2412 err = ret; 2413 goto out; 2414 } 2415 trans = btrfs_start_transaction(root, 0); 2416 if (IS_ERR(trans)) { 2417 err = PTR_ERR(trans); 2418 trans = NULL; 2419 goto out; 2420 } 2421 2422 /* 2423 * At this point we no longer have a reloc_control, so we can't 2424 * depend on btrfs_init_reloc_root to update our last_trans. 2425 * 2426 * But that's ok, we started the trans handle on our 2427 * corresponding fs_root, which means it's been added to the 2428 * dirty list. At commit time we'll still call 2429 * btrfs_update_reloc_root() and update our root item 2430 * appropriately. 2431 */ 2432 reloc_root->last_trans = trans->transid; 2433 trans->block_rsv = rc->block_rsv; 2434 2435 replaced = 0; 2436 max_level = level; 2437 2438 ret = walk_down_reloc_tree(reloc_root, path, &level); 2439 if (ret < 0) { 2440 err = ret; 2441 goto out; 2442 } 2443 if (ret > 0) 2444 break; 2445 2446 if (!find_next_key(path, level, &key) && 2447 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2448 ret = 0; 2449 } else { 2450 ret = replace_path(trans, rc, root, reloc_root, path, 2451 &next_key, level, max_level); 2452 } 2453 if (ret < 0) { 2454 err = ret; 2455 goto out; 2456 } 2457 2458 if (ret > 0) { 2459 level = ret; 2460 btrfs_node_key_to_cpu(path->nodes[level], &key, 2461 path->slots[level]); 2462 replaced = 1; 2463 } 2464 2465 ret = walk_up_reloc_tree(reloc_root, path, &level); 2466 if (ret > 0) 2467 break; 2468 2469 BUG_ON(level == 0); 2470 /* 2471 * save the merging progress in the drop_progress. 2472 * this is OK since root refs == 1 in this case. 
2473 */ 2474 btrfs_node_key(path->nodes[level], &root_item->drop_progress, 2475 path->slots[level]); 2476 root_item->drop_level = level; 2477 2478 btrfs_end_transaction_throttle(trans); 2479 trans = NULL; 2480 2481 btrfs_btree_balance_dirty(fs_info); 2482 2483 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2484 invalidate_extent_cache(root, &key, &next_key); 2485 } 2486 2487 /* 2488 * handle the case where only one block in the fs tree needs to be 2489 * relocated and that block is the tree root. 2490 */ 2491 leaf = btrfs_lock_root_node(root); 2492 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); 2493 btrfs_tree_unlock(leaf); 2494 free_extent_buffer(leaf); 2495 if (ret < 0) 2496 err = ret; 2497 out: 2498 btrfs_free_path(path); 2499 2500 if (err == 0) 2501 insert_dirty_subvol(trans, rc, root); 2502 2503 if (trans) 2504 btrfs_end_transaction_throttle(trans); 2505 2506 btrfs_btree_balance_dirty(fs_info); 2507 2508 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2509 invalidate_extent_cache(root, &key, &next_key); 2510 2511 return err; 2512 } 2513 2514 static noinline_for_stack 2515 int prepare_to_merge(struct reloc_control *rc, int err) 2516 { 2517 struct btrfs_root *root = rc->extent_root; 2518 struct btrfs_fs_info *fs_info = root->fs_info; 2519 struct btrfs_root *reloc_root; 2520 struct btrfs_trans_handle *trans; 2521 LIST_HEAD(reloc_roots); 2522 u64 num_bytes = 0; 2523 int ret; 2524 2525 mutex_lock(&fs_info->reloc_mutex); 2526 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2527 rc->merging_rsv_size += rc->nodes_relocated * 2; 2528 mutex_unlock(&fs_info->reloc_mutex); 2529 2530 again: 2531 if (!err) { 2532 num_bytes = rc->merging_rsv_size; 2533 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2534 BTRFS_RESERVE_FLUSH_ALL); 2535 if (ret) 2536 err = ret; 2537 } 2538 2539 trans = btrfs_join_transaction(rc->extent_root); 2540 if (IS_ERR(trans)) { 2541 if (!err) 2542 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2543 num_bytes, NULL); 2544 return PTR_ERR(trans); 2545 } 2546 2547 if (!err) { 2548 if (num_bytes != rc->merging_rsv_size) { 2549 btrfs_end_transaction(trans); 2550 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2551 num_bytes, NULL); 2552 goto again; 2553 } 2554 } 2555 2556 rc->merge_reloc_tree = 1; 2557 2558 while (!list_empty(&rc->reloc_roots)) { 2559 reloc_root = list_entry(rc->reloc_roots.next, 2560 struct btrfs_root, root_list); 2561 list_del_init(&reloc_root->root_list); 2562 2563 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2564 BUG_ON(IS_ERR(root)); 2565 BUG_ON(root->reloc_root != reloc_root); 2566 2567 /* 2568 * set reference count to 1, so btrfs_recover_relocation 2569 * knows it should resume merging 2570 */ 2571 if (!err) 2572 btrfs_set_root_refs(&reloc_root->root_item, 1); 2573 btrfs_update_reloc_root(trans, root); 2574 2575 list_add(&reloc_root->root_list, &reloc_roots); 2576 btrfs_put_root(root); 2577 } 2578 2579 list_splice(&reloc_roots, &rc->reloc_roots); 2580 2581 if (!err) 2582 btrfs_commit_transaction(trans); 2583 else 2584 btrfs_end_transaction(trans); 2585 return err; 2586 } 2587 2588 static noinline_for_stack 2589 void free_reloc_roots(struct list_head *list) 2590 { 2591 struct btrfs_root *reloc_root; 2592 2593 while (!list_empty(list)) { 2594 reloc_root = list_entry(list->next, struct btrfs_root, 2595 root_list); 2596 __del_reloc_root(reloc_root); 2597 } 2598 } 2599 2600 static noinline_for_stack 2601 void merge_reloc_roots(struct reloc_control *rc) 2602 { 2603 struct btrfs_fs_info *fs_info =
rc->extent_root->fs_info; 2604 struct btrfs_root *root; 2605 struct btrfs_root *reloc_root; 2606 LIST_HEAD(reloc_roots); 2607 int found = 0; 2608 int ret = 0; 2609 again: 2610 root = rc->extent_root; 2611 2612 /* 2613 * this serializes us with btrfs_record_root_in_transaction, 2614 * we have to make sure nobody is in the middle of 2615 * adding their roots to the list while we are 2616 * doing this splice 2617 */ 2618 mutex_lock(&fs_info->reloc_mutex); 2619 list_splice_init(&rc->reloc_roots, &reloc_roots); 2620 mutex_unlock(&fs_info->reloc_mutex); 2621 2622 while (!list_empty(&reloc_roots)) { 2623 found = 1; 2624 reloc_root = list_entry(reloc_roots.next, 2625 struct btrfs_root, root_list); 2626 2627 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 2628 root = read_fs_root(fs_info, 2629 reloc_root->root_key.offset); 2630 BUG_ON(IS_ERR(root)); 2631 BUG_ON(root->reloc_root != reloc_root); 2632 2633 ret = merge_reloc_root(rc, root); 2634 btrfs_put_root(root); 2635 if (ret) { 2636 if (list_empty(&reloc_root->root_list)) 2637 list_add_tail(&reloc_root->root_list, 2638 &reloc_roots); 2639 goto out; 2640 } 2641 } else { 2642 list_del_init(&reloc_root->root_list); 2643 /* Don't forget to queue this reloc root for cleanup */ 2644 list_add_tail(&reloc_root->reloc_dirty_list, 2645 &rc->dirty_subvol_roots); 2646 } 2647 } 2648 2649 if (found) { 2650 found = 0; 2651 goto again; 2652 } 2653 out: 2654 if (ret) { 2655 btrfs_handle_fs_error(fs_info, ret, NULL); 2656 if (!list_empty(&reloc_roots)) 2657 free_reloc_roots(&reloc_roots); 2658 2659 /* new reloc root may be added */ 2660 mutex_lock(&fs_info->reloc_mutex); 2661 list_splice_init(&rc->reloc_roots, &reloc_roots); 2662 mutex_unlock(&fs_info->reloc_mutex); 2663 if (!list_empty(&reloc_roots)) 2664 free_reloc_roots(&reloc_roots); 2665 } 2666 2667 /* 2668 * We used to have 2669 * 2670 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2671 * 2672 * here, but it's wrong. If we fail to start the transaction in 2673 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 2674 * have actually been removed from the reloc_root_tree rb tree. This is 2675 * fine because we're bailing here, and we hold a reference on the root 2676 * for the list that holds it, so these roots will be cleaned up when we 2677 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 2678 * will be cleaned up on unmount. 2679 * 2680 * The remaining nodes will be cleaned up by free_reloc_control. 
2681 */ 2682 } 2683 2684 static void free_block_list(struct rb_root *blocks) 2685 { 2686 struct tree_block *block; 2687 struct rb_node *rb_node; 2688 while ((rb_node = rb_first(blocks))) { 2689 block = rb_entry(rb_node, struct tree_block, rb_node); 2690 rb_erase(rb_node, blocks); 2691 kfree(block); 2692 } 2693 } 2694 2695 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2696 struct btrfs_root *reloc_root) 2697 { 2698 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2699 struct btrfs_root *root; 2700 int ret; 2701 2702 if (reloc_root->last_trans == trans->transid) 2703 return 0; 2704 2705 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2706 BUG_ON(IS_ERR(root)); 2707 BUG_ON(root->reloc_root != reloc_root); 2708 ret = btrfs_record_root_in_trans(trans, root); 2709 btrfs_put_root(root); 2710 2711 return ret; 2712 } 2713 2714 static noinline_for_stack 2715 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2716 struct reloc_control *rc, 2717 struct backref_node *node, 2718 struct backref_edge *edges[]) 2719 { 2720 struct backref_node *next; 2721 struct btrfs_root *root; 2722 int index = 0; 2723 2724 next = node; 2725 while (1) { 2726 cond_resched(); 2727 next = walk_up_backref(next, edges, &index); 2728 root = next->root; 2729 BUG_ON(!root); 2730 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); 2731 2732 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2733 record_reloc_root_in_trans(trans, root); 2734 break; 2735 } 2736 2737 btrfs_record_root_in_trans(trans, root); 2738 root = root->reloc_root; 2739 2740 if (next->new_bytenr != root->node->start) { 2741 BUG_ON(next->new_bytenr); 2742 BUG_ON(!list_empty(&next->list)); 2743 next->new_bytenr = root->node->start; 2744 btrfs_put_root(next->root); 2745 next->root = btrfs_grab_root(root); 2746 ASSERT(next->root); 2747 list_add_tail(&next->list, 2748 &rc->backref_cache.changed); 2749 __mark_block_processed(rc, next); 2750 break; 2751 } 2752 2753 WARN_ON(1); 2754 root = NULL; 2755 next = walk_down_backref(edges, &index); 2756 if (!next || next->level <= node->level) 2757 break; 2758 } 2759 if (!root) 2760 return NULL; 2761 2762 next = node; 2763 /* setup backref node path for btrfs_reloc_cow_block */ 2764 while (1) { 2765 rc->backref_cache.path[next->level] = next; 2766 if (--index < 0) 2767 break; 2768 next = edges[index]->node[UPPER]; 2769 } 2770 return root; 2771 } 2772 2773 /* 2774 * select a tree root for relocation. return NULL if the block 2775 * is reference counted. we should use do_relocation() in this 2776 * case. return a tree root pointer if the block isn't reference 2777 * counted. return -ENOENT if the block is root of reloc tree. 
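* the -ENOENT case is handled by the caller (relocate_tree_block), which simply marks the block processed and moves on.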
2778 */ 2779 static noinline_for_stack 2780 struct btrfs_root *select_one_root(struct backref_node *node) 2781 { 2782 struct backref_node *next; 2783 struct btrfs_root *root; 2784 struct btrfs_root *fs_root = NULL; 2785 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2786 int index = 0; 2787 2788 next = node; 2789 while (1) { 2790 cond_resched(); 2791 next = walk_up_backref(next, edges, &index); 2792 root = next->root; 2793 BUG_ON(!root); 2794 2795 /* no other choice for a non-reference counted tree */ 2796 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 2797 return root; 2798 2799 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2800 fs_root = root; 2801 2802 if (next != node) 2803 return NULL; 2804 2805 next = walk_down_backref(edges, &index); 2806 if (!next || next->level <= node->level) 2807 break; 2808 } 2809 2810 if (!fs_root) 2811 return ERR_PTR(-ENOENT); 2812 return fs_root; 2813 } 2814 2815 static noinline_for_stack 2816 u64 calcu_metadata_size(struct reloc_control *rc, 2817 struct backref_node *node, int reserve) 2818 { 2819 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2820 struct backref_node *next = node; 2821 struct backref_edge *edge; 2822 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2823 u64 num_bytes = 0; 2824 int index = 0; 2825 2826 BUG_ON(reserve && node->processed); 2827 2828 while (next) { 2829 cond_resched(); 2830 while (1) { 2831 if (next->processed && (reserve || next != node)) 2832 break; 2833 2834 num_bytes += fs_info->nodesize; 2835 2836 if (list_empty(&next->upper)) 2837 break; 2838 2839 edge = list_entry(next->upper.next, 2840 struct backref_edge, list[LOWER]); 2841 edges[index++] = edge; 2842 next = edge->node[UPPER]; 2843 } 2844 next = walk_down_backref(edges, &index); 2845 } 2846 return num_bytes; 2847 } 2848 2849 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2850 struct reloc_control *rc, 2851 struct backref_node *node) 2852 { 2853 struct btrfs_root *root = rc->extent_root; 2854 struct btrfs_fs_info *fs_info = root->fs_info; 2855 u64 num_bytes; 2856 int ret; 2857 u64 tmp; 2858 2859 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2860 2861 trans->block_rsv = rc->block_rsv; 2862 rc->reserved_bytes += num_bytes; 2863 2864 /* 2865 * We are under a transaction here so we can only do limited flushing. 2866 * If we get an enospc just kick back -EAGAIN so we know to drop the 2867 * transaction and try to refill when we can flush all the things. 2868 */ 2869 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, 2870 BTRFS_RESERVE_FLUSH_LIMIT); 2871 if (ret) { 2872 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2873 while (tmp <= rc->reserved_bytes) 2874 tmp <<= 1; 2875 /* 2876 * only one thread can access block_rsv at this point, 2877 * so we don't need to hold a lock to protect it. 2878 * expand the reservation here so there is enough 2879 * space for relocation, and return early in the 2880 * ENOSPC case. 2881 */ 2882 rc->block_rsv->size = tmp + fs_info->nodesize * 2883 RELOCATION_RESERVED_NODES; 2884 return -EAGAIN; 2885 } 2886 2887 return 0; 2888 } 2889 2890 /* 2891 * relocate a block tree, and then update pointers in upper level 2892 * blocks that reference the block to point to the new location. 2893 * 2894 * if called by link_to_upper, the block has already been relocated. 2895 * in that case this function just updates pointers.
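* @key is the first key of the block being relocated; it is used to locate the block's slot in each upper level block.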
2896 */ 2897 static int do_relocation(struct btrfs_trans_handle *trans, 2898 struct reloc_control *rc, 2899 struct backref_node *node, 2900 struct btrfs_key *key, 2901 struct btrfs_path *path, int lowest) 2902 { 2903 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2904 struct backref_node *upper; 2905 struct backref_edge *edge; 2906 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2907 struct btrfs_root *root; 2908 struct extent_buffer *eb; 2909 u32 blocksize; 2910 u64 bytenr; 2911 u64 generation; 2912 int slot; 2913 int ret; 2914 int err = 0; 2915 2916 BUG_ON(lowest && node->eb); 2917 2918 path->lowest_level = node->level + 1; 2919 rc->backref_cache.path[node->level] = node; 2920 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2921 struct btrfs_key first_key; 2922 struct btrfs_ref ref = { 0 }; 2923 2924 cond_resched(); 2925 2926 upper = edge->node[UPPER]; 2927 root = select_reloc_root(trans, rc, upper, edges); 2928 BUG_ON(!root); 2929 2930 if (upper->eb && !upper->locked) { 2931 if (!lowest) { 2932 ret = btrfs_bin_search(upper->eb, key, 2933 upper->level, &slot); 2934 if (ret < 0) { 2935 err = ret; 2936 goto next; 2937 } 2938 BUG_ON(ret); 2939 bytenr = btrfs_node_blockptr(upper->eb, slot); 2940 if (node->eb->start == bytenr) 2941 goto next; 2942 } 2943 drop_node_buffer(upper); 2944 } 2945 2946 if (!upper->eb) { 2947 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2948 if (ret) { 2949 if (ret < 0) 2950 err = ret; 2951 else 2952 err = -ENOENT; 2953 2954 btrfs_release_path(path); 2955 break; 2956 } 2957 2958 if (!upper->eb) { 2959 upper->eb = path->nodes[upper->level]; 2960 path->nodes[upper->level] = NULL; 2961 } else { 2962 BUG_ON(upper->eb != path->nodes[upper->level]); 2963 } 2964 2965 upper->locked = 1; 2966 path->locks[upper->level] = 0; 2967 2968 slot = path->slots[upper->level]; 2969 btrfs_release_path(path); 2970 } else { 2971 ret = btrfs_bin_search(upper->eb, key, upper->level, 2972 &slot); 2973 if (ret < 0) { 2974 err = ret; 2975 goto next; 2976 } 2977 BUG_ON(ret); 2978 } 2979 2980 bytenr = btrfs_node_blockptr(upper->eb, slot); 2981 if (lowest) { 2982 if (bytenr != node->bytenr) { 2983 btrfs_err(root->fs_info, 2984 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2985 bytenr, node->bytenr, slot, 2986 upper->eb->start); 2987 err = -EIO; 2988 goto next; 2989 } 2990 } else { 2991 if (node->eb->start == bytenr) 2992 goto next; 2993 } 2994 2995 blocksize = root->fs_info->nodesize; 2996 generation = btrfs_node_ptr_generation(upper->eb, slot); 2997 btrfs_node_key_to_cpu(upper->eb, &first_key, slot); 2998 eb = read_tree_block(fs_info, bytenr, generation, 2999 upper->level - 1, &first_key); 3000 if (IS_ERR(eb)) { 3001 err = PTR_ERR(eb); 3002 goto next; 3003 } else if (!extent_buffer_uptodate(eb)) { 3004 free_extent_buffer(eb); 3005 err = -EIO; 3006 goto next; 3007 } 3008 btrfs_tree_lock(eb); 3009 btrfs_set_lock_blocking_write(eb); 3010 3011 if (!node->eb) { 3012 ret = btrfs_cow_block(trans, root, eb, upper->eb, 3013 slot, &eb); 3014 btrfs_tree_unlock(eb); 3015 free_extent_buffer(eb); 3016 if (ret < 0) { 3017 err = ret; 3018 goto next; 3019 } 3020 BUG_ON(node->eb != eb); 3021 } else { 3022 btrfs_set_node_blockptr(upper->eb, slot, 3023 node->eb->start); 3024 btrfs_set_node_ptr_generation(upper->eb, slot, 3025 trans->transid); 3026 btrfs_mark_buffer_dirty(upper->eb); 3027 3028 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 3029 node->eb->start, blocksize, 3030 upper->eb->start); 3031 ref.real_root = root->root_key.objectid; 3032 
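/* the block was relocated earlier (node->eb is already set), so only the pointer in the upper block was updated above: take a reference on the new location for the upper block's owner, then drop the subtree at the old location */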
btrfs_init_tree_ref(&ref, node->level, 3033 btrfs_header_owner(upper->eb)); 3034 ret = btrfs_inc_extent_ref(trans, &ref); 3035 BUG_ON(ret); 3036 3037 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 3038 BUG_ON(ret); 3039 } 3040 next: 3041 if (!upper->pending) 3042 drop_node_buffer(upper); 3043 else 3044 unlock_node_buffer(upper); 3045 if (err) 3046 break; 3047 } 3048 3049 if (!err && node->pending) { 3050 drop_node_buffer(node); 3051 list_move_tail(&node->list, &rc->backref_cache.changed); 3052 node->pending = 0; 3053 } 3054 3055 path->lowest_level = 0; 3056 BUG_ON(err == -ENOSPC); 3057 return err; 3058 } 3059 3060 static int link_to_upper(struct btrfs_trans_handle *trans, 3061 struct reloc_control *rc, 3062 struct backref_node *node, 3063 struct btrfs_path *path) 3064 { 3065 struct btrfs_key key; 3066 3067 btrfs_node_key_to_cpu(node->eb, &key, 0); 3068 return do_relocation(trans, rc, node, &key, path, 0); 3069 } 3070 3071 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 3072 struct reloc_control *rc, 3073 struct btrfs_path *path, int err) 3074 { 3075 LIST_HEAD(list); 3076 struct backref_cache *cache = &rc->backref_cache; 3077 struct backref_node *node; 3078 int level; 3079 int ret; 3080 3081 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 3082 while (!list_empty(&cache->pending[level])) { 3083 node = list_entry(cache->pending[level].next, 3084 struct backref_node, list); 3085 list_move_tail(&node->list, &list); 3086 BUG_ON(!node->pending); 3087 3088 if (!err) { 3089 ret = link_to_upper(trans, rc, node, path); 3090 if (ret < 0) 3091 err = ret; 3092 } 3093 } 3094 list_splice_init(&list, &cache->pending[level]); 3095 } 3096 return err; 3097 } 3098 3099 static void mark_block_processed(struct reloc_control *rc, 3100 u64 bytenr, u32 blocksize) 3101 { 3102 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, 3103 EXTENT_DIRTY); 3104 } 3105 3106 static void __mark_block_processed(struct reloc_control *rc, 3107 struct backref_node *node) 3108 { 3109 u32 blocksize; 3110 if (node->level == 0 || 3111 in_block_group(node->bytenr, rc->block_group)) { 3112 blocksize = rc->extent_root->fs_info->nodesize; 3113 mark_block_processed(rc, node->bytenr, blocksize); 3114 } 3115 node->processed = 1; 3116 } 3117 3118 /* 3119 * mark a block and all blocks directly/indirectly reference the block 3120 * as processed. 
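* processed blocks are recorded in rc->processed_blocks so that find_next_extent() can skip them.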
3121 */ 3122 static void update_processed_blocks(struct reloc_control *rc, 3123 struct backref_node *node) 3124 { 3125 struct backref_node *next = node; 3126 struct backref_edge *edge; 3127 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 3128 int index = 0; 3129 3130 while (next) { 3131 cond_resched(); 3132 while (1) { 3133 if (next->processed) 3134 break; 3135 3136 __mark_block_processed(rc, next); 3137 3138 if (list_empty(&next->upper)) 3139 break; 3140 3141 edge = list_entry(next->upper.next, 3142 struct backref_edge, list[LOWER]); 3143 edges[index++] = edge; 3144 next = edge->node[UPPER]; 3145 } 3146 next = walk_down_backref(edges, &index); 3147 } 3148 } 3149 3150 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 3151 { 3152 u32 blocksize = rc->extent_root->fs_info->nodesize; 3153 3154 if (test_range_bit(&rc->processed_blocks, bytenr, 3155 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 3156 return 1; 3157 return 0; 3158 } 3159 3160 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 3161 struct tree_block *block) 3162 { 3163 struct extent_buffer *eb; 3164 3165 eb = read_tree_block(fs_info, block->bytenr, block->key.offset, 3166 block->level, NULL); 3167 if (IS_ERR(eb)) { 3168 return PTR_ERR(eb); 3169 } else if (!extent_buffer_uptodate(eb)) { 3170 free_extent_buffer(eb); 3171 return -EIO; 3172 } 3173 if (block->level == 0) 3174 btrfs_item_key_to_cpu(eb, &block->key, 0); 3175 else 3176 btrfs_node_key_to_cpu(eb, &block->key, 0); 3177 free_extent_buffer(eb); 3178 block->key_ready = 1; 3179 return 0; 3180 } 3181 3182 /* 3183 * helper function to relocate a tree block 3184 */ 3185 static int relocate_tree_block(struct btrfs_trans_handle *trans, 3186 struct reloc_control *rc, 3187 struct backref_node *node, 3188 struct btrfs_key *key, 3189 struct btrfs_path *path) 3190 { 3191 struct btrfs_root *root; 3192 int ret = 0; 3193 3194 if (!node) 3195 return 0; 3196 3197 /* 3198 * If we fail here we want to drop our backref_node because we are going 3199 * to start over and regenerate the tree for it. 
3200 */ 3201 ret = reserve_metadata_space(trans, rc, node); 3202 if (ret) 3203 goto out; 3204 3205 BUG_ON(node->processed); 3206 root = select_one_root(node); 3207 if (root == ERR_PTR(-ENOENT)) { 3208 update_processed_blocks(rc, node); 3209 goto out; 3210 } 3211 3212 if (root) { 3213 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3214 BUG_ON(node->new_bytenr); 3215 BUG_ON(!list_empty(&node->list)); 3216 btrfs_record_root_in_trans(trans, root); 3217 root = root->reloc_root; 3218 node->new_bytenr = root->node->start; 3219 btrfs_put_root(node->root); 3220 node->root = btrfs_grab_root(root); 3221 ASSERT(node->root); 3222 list_add_tail(&node->list, &rc->backref_cache.changed); 3223 } else { 3224 path->lowest_level = node->level; 3225 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 3226 btrfs_release_path(path); 3227 if (ret > 0) 3228 ret = 0; 3229 } 3230 if (!ret) 3231 update_processed_blocks(rc, node); 3232 } else { 3233 ret = do_relocation(trans, rc, node, key, path, 1); 3234 } 3235 out: 3236 if (ret || node->level == 0 || node->cowonly) 3237 remove_backref_node(&rc->backref_cache, node); 3238 return ret; 3239 } 3240 3241 /* 3242 * relocate a list of blocks 3243 */ 3244 static noinline_for_stack 3245 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 3246 struct reloc_control *rc, struct rb_root *blocks) 3247 { 3248 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3249 struct backref_node *node; 3250 struct btrfs_path *path; 3251 struct tree_block *block; 3252 struct tree_block *next; 3253 int ret; 3254 int err = 0; 3255 3256 path = btrfs_alloc_path(); 3257 if (!path) { 3258 err = -ENOMEM; 3259 goto out_free_blocks; 3260 } 3261 3262 /* Kick in readahead for tree blocks with missing keys */ 3263 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3264 if (!block->key_ready) 3265 readahead_tree_block(fs_info, block->bytenr); 3266 } 3267 3268 /* Get first keys */ 3269 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3270 if (!block->key_ready) { 3271 err = get_tree_block_key(fs_info, block); 3272 if (err) 3273 goto out_free_path; 3274 } 3275 } 3276 3277 /* Do tree relocation */ 3278 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3279 node = build_backref_tree(rc, &block->key, 3280 block->level, block->bytenr); 3281 if (IS_ERR(node)) { 3282 err = PTR_ERR(node); 3283 goto out; 3284 } 3285 3286 ret = relocate_tree_block(trans, rc, node, &block->key, 3287 path); 3288 if (ret < 0) { 3289 err = ret; 3290 break; 3291 } 3292 } 3293 out: 3294 err = finish_pending_nodes(trans, rc, path, err); 3295 3296 out_free_path: 3297 btrfs_free_path(path); 3298 out_free_blocks: 3299 free_block_list(blocks); 3300 return err; 3301 } 3302 3303 static noinline_for_stack 3304 int prealloc_file_extent_cluster(struct inode *inode, 3305 struct file_extent_cluster *cluster) 3306 { 3307 u64 alloc_hint = 0; 3308 u64 start; 3309 u64 end; 3310 u64 offset = BTRFS_I(inode)->index_cnt; 3311 u64 num_bytes; 3312 int nr = 0; 3313 int ret = 0; 3314 u64 prealloc_start = cluster->start - offset; 3315 u64 prealloc_end = cluster->end - offset; 3316 u64 cur_offset; 3317 struct extent_changeset *data_reserved = NULL; 3318 3319 BUG_ON(cluster->start != cluster->boundary[0]); 3320 inode_lock(inode); 3321 3322 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, 3323 prealloc_end + 1 - prealloc_start); 3324 if (ret) 3325 goto out; 3326 3327 cur_offset = prealloc_start; 3328 while (nr < cluster->nr) { 3329 start = cluster->boundary[nr] - 
offset; 3330 if (nr + 1 < cluster->nr) 3331 end = cluster->boundary[nr + 1] - 1 - offset; 3332 else 3333 end = cluster->end - offset; 3334 3335 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3336 num_bytes = end + 1 - start; 3337 if (cur_offset < start) 3338 btrfs_free_reserved_data_space(inode, data_reserved, 3339 cur_offset, start - cur_offset); 3340 ret = btrfs_prealloc_file_range(inode, 0, start, 3341 num_bytes, num_bytes, 3342 end + 1, &alloc_hint); 3343 cur_offset = end + 1; 3344 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3345 if (ret) 3346 break; 3347 nr++; 3348 } 3349 if (cur_offset < prealloc_end) 3350 btrfs_free_reserved_data_space(inode, data_reserved, 3351 cur_offset, prealloc_end + 1 - cur_offset); 3352 out: 3353 inode_unlock(inode); 3354 extent_changeset_free(data_reserved); 3355 return ret; 3356 } 3357 3358 static noinline_for_stack 3359 int setup_extent_mapping(struct inode *inode, u64 start, u64 end, 3360 u64 block_start) 3361 { 3362 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3363 struct extent_map *em; 3364 int ret = 0; 3365 3366 em = alloc_extent_map(); 3367 if (!em) 3368 return -ENOMEM; 3369 3370 em->start = start; 3371 em->len = end + 1 - start; 3372 em->block_len = em->len; 3373 em->block_start = block_start; 3374 set_bit(EXTENT_FLAG_PINNED, &em->flags); 3375 3376 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3377 while (1) { 3378 write_lock(&em_tree->lock); 3379 ret = add_extent_mapping(em_tree, em, 0); 3380 write_unlock(&em_tree->lock); 3381 if (ret != -EEXIST) { 3382 free_extent_map(em); 3383 break; 3384 } 3385 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 3386 } 3387 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3388 return ret; 3389 } 3390 3391 /* 3392 * Allow error injection to test balance cancellation 3393 */ 3394 int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) 3395 { 3396 return atomic_read(&fs_info->balance_cancel_req); 3397 } 3398 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 3399 3400 static int relocate_file_extent_cluster(struct inode *inode, 3401 struct file_extent_cluster *cluster) 3402 { 3403 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3404 u64 page_start; 3405 u64 page_end; 3406 u64 offset = BTRFS_I(inode)->index_cnt; 3407 unsigned long index; 3408 unsigned long last_index; 3409 struct page *page; 3410 struct file_ra_state *ra; 3411 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 3412 int nr = 0; 3413 int ret = 0; 3414 3415 if (!cluster->nr) 3416 return 0; 3417 3418 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3419 if (!ra) 3420 return -ENOMEM; 3421 3422 ret = prealloc_file_extent_cluster(inode, cluster); 3423 if (ret) 3424 goto out; 3425 3426 file_ra_state_init(ra, inode->i_mapping); 3427 3428 ret = setup_extent_mapping(inode, cluster->start - offset, 3429 cluster->end - offset, cluster->start); 3430 if (ret) 3431 goto out; 3432 3433 index = (cluster->start - offset) >> PAGE_SHIFT; 3434 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3435 while (index <= last_index) { 3436 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3437 PAGE_SIZE); 3438 if (ret) 3439 goto out; 3440 3441 page = find_lock_page(inode->i_mapping, index); 3442 if (!page) { 3443 page_cache_sync_readahead(inode->i_mapping, 3444 ra, NULL, index, 3445 last_index + 1 - index); 3446 page = find_or_create_page(inode->i_mapping, index, 3447 mask); 3448 if (!page) { 3449 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3450 PAGE_SIZE, true); 3451 
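/* also undo the outstanding extent accounting taken together with the metadata reservation above before bailing out */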
btrfs_delalloc_release_extents(BTRFS_I(inode), 3452 PAGE_SIZE); 3453 ret = -ENOMEM; 3454 goto out; 3455 } 3456 } 3457 3458 if (PageReadahead(page)) { 3459 page_cache_async_readahead(inode->i_mapping, 3460 ra, NULL, page, index, 3461 last_index + 1 - index); 3462 } 3463 3464 if (!PageUptodate(page)) { 3465 btrfs_readpage(NULL, page); 3466 lock_page(page); 3467 if (!PageUptodate(page)) { 3468 unlock_page(page); 3469 put_page(page); 3470 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3471 PAGE_SIZE, true); 3472 btrfs_delalloc_release_extents(BTRFS_I(inode), 3473 PAGE_SIZE); 3474 ret = -EIO; 3475 goto out; 3476 } 3477 } 3478 3479 page_start = page_offset(page); 3480 page_end = page_start + PAGE_SIZE - 1; 3481 3482 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3483 3484 set_page_extent_mapped(page); 3485 3486 if (nr < cluster->nr && 3487 page_start + offset == cluster->boundary[nr]) { 3488 set_extent_bits(&BTRFS_I(inode)->io_tree, 3489 page_start, page_end, 3490 EXTENT_BOUNDARY); 3491 nr++; 3492 } 3493 3494 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 3495 NULL); 3496 if (ret) { 3497 unlock_page(page); 3498 put_page(page); 3499 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3500 PAGE_SIZE, true); 3501 btrfs_delalloc_release_extents(BTRFS_I(inode), 3502 PAGE_SIZE); 3503 3504 clear_extent_bits(&BTRFS_I(inode)->io_tree, 3505 page_start, page_end, 3506 EXTENT_LOCKED | EXTENT_BOUNDARY); 3507 goto out; 3508 3509 } 3510 set_page_dirty(page); 3511 3512 unlock_extent(&BTRFS_I(inode)->io_tree, 3513 page_start, page_end); 3514 unlock_page(page); 3515 put_page(page); 3516 3517 index++; 3518 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 3519 balance_dirty_pages_ratelimited(inode->i_mapping); 3520 btrfs_throttle(fs_info); 3521 if (btrfs_should_cancel_balance(fs_info)) { 3522 ret = -ECANCELED; 3523 goto out; 3524 } 3525 } 3526 WARN_ON(nr != cluster->nr); 3527 out: 3528 kfree(ra); 3529 return ret; 3530 } 3531 3532 static noinline_for_stack 3533 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3534 struct file_extent_cluster *cluster) 3535 { 3536 int ret; 3537 3538 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3539 ret = relocate_file_extent_cluster(inode, cluster); 3540 if (ret) 3541 return ret; 3542 cluster->nr = 0; 3543 } 3544 3545 if (!cluster->nr) 3546 cluster->start = extent_key->objectid; 3547 else 3548 BUG_ON(cluster->nr >= MAX_EXTENTS); 3549 cluster->end = extent_key->objectid + extent_key->offset - 1; 3550 cluster->boundary[cluster->nr] = extent_key->objectid; 3551 cluster->nr++; 3552 3553 if (cluster->nr >= MAX_EXTENTS) { 3554 ret = relocate_file_extent_cluster(inode, cluster); 3555 if (ret) 3556 return ret; 3557 cluster->nr = 0; 3558 } 3559 return 0; 3560 } 3561 3562 /* 3563 * helper to add a tree block to the list. 
3564 * the major work is getting the generation and level of the block 3565 */ 3566 static int add_tree_block(struct reloc_control *rc, 3567 struct btrfs_key *extent_key, 3568 struct btrfs_path *path, 3569 struct rb_root *blocks) 3570 { 3571 struct extent_buffer *eb; 3572 struct btrfs_extent_item *ei; 3573 struct btrfs_tree_block_info *bi; 3574 struct tree_block *block; 3575 struct rb_node *rb_node; 3576 u32 item_size; 3577 int level = -1; 3578 u64 generation; 3579 3580 eb = path->nodes[0]; 3581 item_size = btrfs_item_size_nr(eb, path->slots[0]); 3582 3583 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3584 item_size >= sizeof(*ei) + sizeof(*bi)) { 3585 ei = btrfs_item_ptr(eb, path->slots[0], 3586 struct btrfs_extent_item); 3587 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3588 bi = (struct btrfs_tree_block_info *)(ei + 1); 3589 level = btrfs_tree_block_level(eb, bi); 3590 } else { 3591 level = (int)extent_key->offset; 3592 } 3593 generation = btrfs_extent_generation(eb, ei); 3594 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3595 btrfs_print_v0_err(eb->fs_info); 3596 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3597 return -EINVAL; 3598 } else { 3599 BUG(); 3600 } 3601 3602 btrfs_release_path(path); 3603 3604 BUG_ON(level == -1); 3605 3606 block = kmalloc(sizeof(*block), GFP_NOFS); 3607 if (!block) 3608 return -ENOMEM; 3609 3610 block->bytenr = extent_key->objectid; 3611 block->key.objectid = rc->extent_root->fs_info->nodesize; 3612 block->key.offset = generation; 3613 block->level = level; 3614 block->key_ready = 0; 3615 3616 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); 3617 if (rb_node) 3618 backref_tree_panic(rb_node, -EEXIST, block->bytenr); 3619 3620 return 0; 3621 } 3622 3623 /* 3624 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3625 */ 3626 static int __add_tree_block(struct reloc_control *rc, 3627 u64 bytenr, u32 blocksize, 3628 struct rb_root *blocks) 3629 { 3630 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3631 struct btrfs_path *path; 3632 struct btrfs_key key; 3633 int ret; 3634 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3635 3636 if (tree_block_processed(bytenr, rc)) 3637 return 0; 3638 3639 if (tree_search(blocks, bytenr)) 3640 return 0; 3641 3642 path = btrfs_alloc_path(); 3643 if (!path) 3644 return -ENOMEM; 3645 again: 3646 key.objectid = bytenr; 3647 if (skinny) { 3648 key.type = BTRFS_METADATA_ITEM_KEY; 3649 key.offset = (u64)-1; 3650 } else { 3651 key.type = BTRFS_EXTENT_ITEM_KEY; 3652 key.offset = blocksize; 3653 } 3654 3655 path->search_commit_root = 1; 3656 path->skip_locking = 1; 3657 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3658 if (ret < 0) 3659 goto out; 3660 3661 if (ret > 0 && skinny) { 3662 if (path->slots[0]) { 3663 path->slots[0]--; 3664 btrfs_item_key_to_cpu(path->nodes[0], &key, 3665 path->slots[0]); 3666 if (key.objectid == bytenr && 3667 (key.type == BTRFS_METADATA_ITEM_KEY || 3668 (key.type == BTRFS_EXTENT_ITEM_KEY && 3669 key.offset == blocksize))) 3670 ret = 0; 3671 } 3672 3673 if (ret) { 3674 skinny = false; 3675 btrfs_release_path(path); 3676 goto again; 3677 } 3678 } 3679 if (ret) { 3680 ASSERT(ret == 1); 3681 btrfs_print_leaf(path->nodes[0]); 3682 btrfs_err(fs_info, 3683 "tree block extent item (%llu) is not found in extent tree", 3684 bytenr); 3685 WARN_ON(1); 3686 ret = -EINVAL; 3687 goto out; 3688 } 3689 3690 ret = add_tree_block(rc, &key, path, blocks); 3691 out: 3692 btrfs_free_path(path); 3693 return 
ret; 3694 } 3695 3696 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3697 struct btrfs_block_group *block_group, 3698 struct inode *inode, 3699 u64 ino) 3700 { 3701 struct btrfs_key key; 3702 struct btrfs_root *root = fs_info->tree_root; 3703 struct btrfs_trans_handle *trans; 3704 int ret = 0; 3705 3706 if (inode) 3707 goto truncate; 3708 3709 key.objectid = ino; 3710 key.type = BTRFS_INODE_ITEM_KEY; 3711 key.offset = 0; 3712 3713 inode = btrfs_iget(fs_info->sb, &key, root); 3714 if (IS_ERR(inode)) 3715 return -ENOENT; 3716 3717 truncate: 3718 ret = btrfs_check_trunc_cache_free_space(fs_info, 3719 &fs_info->global_block_rsv); 3720 if (ret) 3721 goto out; 3722 3723 trans = btrfs_join_transaction(root); 3724 if (IS_ERR(trans)) { 3725 ret = PTR_ERR(trans); 3726 goto out; 3727 } 3728 3729 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3730 3731 btrfs_end_transaction(trans); 3732 btrfs_btree_balance_dirty(fs_info); 3733 out: 3734 iput(inode); 3735 return ret; 3736 } 3737 3738 /* 3739 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3740 * cache inode, to avoid free space cache data extent blocking data relocation. 3741 */ 3742 static int delete_v1_space_cache(struct extent_buffer *leaf, 3743 struct btrfs_block_group *block_group, 3744 u64 data_bytenr) 3745 { 3746 u64 space_cache_ino; 3747 struct btrfs_file_extent_item *ei; 3748 struct btrfs_key key; 3749 bool found = false; 3750 int i; 3751 int ret; 3752 3753 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3754 return 0; 3755 3756 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3757 btrfs_item_key_to_cpu(leaf, &key, i); 3758 if (key.type != BTRFS_EXTENT_DATA_KEY) 3759 continue; 3760 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3761 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG && 3762 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3763 found = true; 3764 space_cache_ino = key.objectid; 3765 break; 3766 } 3767 } 3768 if (!found) 3769 return -ENOENT; 3770 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3771 space_cache_ino); 3772 return ret; 3773 } 3774 3775 /* 3776 * helper to find all tree blocks that reference a given data extent 3777 */ 3778 static noinline_for_stack 3779 int add_data_references(struct reloc_control *rc, 3780 struct btrfs_key *extent_key, 3781 struct btrfs_path *path, 3782 struct rb_root *blocks) 3783 { 3784 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3785 struct ulist *leaves = NULL; 3786 struct ulist_iterator leaf_uiter; 3787 struct ulist_node *ref_node = NULL; 3788 const u32 blocksize = fs_info->nodesize; 3789 int ret = 0; 3790 3791 btrfs_release_path(path); 3792 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, 3793 0, &leaves, NULL, true); 3794 if (ret < 0) 3795 return ret; 3796 3797 ULIST_ITER_INIT(&leaf_uiter); 3798 while ((ref_node = ulist_next(leaves, &leaf_uiter))) { 3799 struct extent_buffer *eb; 3800 3801 eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); 3802 if (IS_ERR(eb)) { 3803 ret = PTR_ERR(eb); 3804 break; 3805 } 3806 ret = delete_v1_space_cache(eb, rc->block_group, 3807 extent_key->objectid); 3808 free_extent_buffer(eb); 3809 if (ret < 0) 3810 break; 3811 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3812 if (ret < 0) 3813 break; 3814 } 3815 if (ret < 0) 3816 free_block_list(blocks); 3817 ulist_free(leaves); 3818 return ret; 3819 } 3820 3821 /* 3822 * helper to find next unprocessed extent 3823 */ 3824 static 
noinline_for_stack 3825 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3826 struct btrfs_key *extent_key) 3827 { 3828 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3829 struct btrfs_key key; 3830 struct extent_buffer *leaf; 3831 u64 start, end, last; 3832 int ret; 3833 3834 last = rc->block_group->start + rc->block_group->length; 3835 while (1) { 3836 cond_resched(); 3837 if (rc->search_start >= last) { 3838 ret = 1; 3839 break; 3840 } 3841 3842 key.objectid = rc->search_start; 3843 key.type = BTRFS_EXTENT_ITEM_KEY; 3844 key.offset = 0; 3845 3846 path->search_commit_root = 1; 3847 path->skip_locking = 1; 3848 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3849 0, 0); 3850 if (ret < 0) 3851 break; 3852 next: 3853 leaf = path->nodes[0]; 3854 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3855 ret = btrfs_next_leaf(rc->extent_root, path); 3856 if (ret != 0) 3857 break; 3858 leaf = path->nodes[0]; 3859 } 3860 3861 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3862 if (key.objectid >= last) { 3863 ret = 1; 3864 break; 3865 } 3866 3867 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3868 key.type != BTRFS_METADATA_ITEM_KEY) { 3869 path->slots[0]++; 3870 goto next; 3871 } 3872 3873 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3874 key.objectid + key.offset <= rc->search_start) { 3875 path->slots[0]++; 3876 goto next; 3877 } 3878 3879 if (key.type == BTRFS_METADATA_ITEM_KEY && 3880 key.objectid + fs_info->nodesize <= 3881 rc->search_start) { 3882 path->slots[0]++; 3883 goto next; 3884 } 3885 3886 ret = find_first_extent_bit(&rc->processed_blocks, 3887 key.objectid, &start, &end, 3888 EXTENT_DIRTY, NULL); 3889 3890 if (ret == 0 && start <= key.objectid) { 3891 btrfs_release_path(path); 3892 rc->search_start = end + 1; 3893 } else { 3894 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3895 rc->search_start = key.objectid + key.offset; 3896 else 3897 rc->search_start = key.objectid + 3898 fs_info->nodesize; 3899 memcpy(extent_key, &key, sizeof(key)); 3900 return 0; 3901 } 3902 } 3903 btrfs_release_path(path); 3904 return ret; 3905 } 3906 3907 static void set_reloc_control(struct reloc_control *rc) 3908 { 3909 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3910 3911 mutex_lock(&fs_info->reloc_mutex); 3912 fs_info->reloc_ctl = rc; 3913 mutex_unlock(&fs_info->reloc_mutex); 3914 } 3915 3916 static void unset_reloc_control(struct reloc_control *rc) 3917 { 3918 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3919 3920 mutex_lock(&fs_info->reloc_mutex); 3921 fs_info->reloc_ctl = NULL; 3922 mutex_unlock(&fs_info->reloc_mutex); 3923 } 3924 3925 static int check_extent_flags(u64 flags) 3926 { 3927 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3928 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3929 return 1; 3930 if (!(flags & BTRFS_EXTENT_FLAG_DATA) && 3931 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3932 return 1; 3933 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3934 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 3935 return 1; 3936 return 0; 3937 } 3938 3939 static noinline_for_stack 3940 int prepare_to_relocate(struct reloc_control *rc) 3941 { 3942 struct btrfs_trans_handle *trans; 3943 int ret; 3944 3945 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3946 BTRFS_BLOCK_RSV_TEMP); 3947 if (!rc->block_rsv) 3948 return -ENOMEM; 3949 3950 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3951 rc->search_start = rc->block_group->start; 3952 rc->extents_found = 0; 3953 rc->nodes_relocated = 0; 3954 rc->merging_rsv_size = 0; 3955 rc->reserved_bytes = 0; 3956 
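/* size the block rsv for RELOCATION_RESERVED_NODES tree blocks up front; reserve_metadata_space() grows it later if the backref walk shows more space is needed */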
rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3957 RELOCATION_RESERVED_NODES; 3958 ret = btrfs_block_rsv_refill(rc->extent_root, 3959 rc->block_rsv, rc->block_rsv->size, 3960 BTRFS_RESERVE_FLUSH_ALL); 3961 if (ret) 3962 return ret; 3963 3964 rc->create_reloc_tree = 1; 3965 set_reloc_control(rc); 3966 3967 trans = btrfs_join_transaction(rc->extent_root); 3968 if (IS_ERR(trans)) { 3969 unset_reloc_control(rc); 3970 /* 3971 * extent tree is not a ref_cow tree and has no reloc_root to 3972 * cleanup. And callers are responsible to free the above 3973 * block rsv. 3974 */ 3975 return PTR_ERR(trans); 3976 } 3977 btrfs_commit_transaction(trans); 3978 return 0; 3979 } 3980 3981 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3982 { 3983 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3984 struct rb_root blocks = RB_ROOT; 3985 struct btrfs_key key; 3986 struct btrfs_trans_handle *trans = NULL; 3987 struct btrfs_path *path; 3988 struct btrfs_extent_item *ei; 3989 u64 flags; 3990 u32 item_size; 3991 int ret; 3992 int err = 0; 3993 int progress = 0; 3994 3995 path = btrfs_alloc_path(); 3996 if (!path) 3997 return -ENOMEM; 3998 path->reada = READA_FORWARD; 3999 4000 ret = prepare_to_relocate(rc); 4001 if (ret) { 4002 err = ret; 4003 goto out_free; 4004 } 4005 4006 while (1) { 4007 rc->reserved_bytes = 0; 4008 ret = btrfs_block_rsv_refill(rc->extent_root, 4009 rc->block_rsv, rc->block_rsv->size, 4010 BTRFS_RESERVE_FLUSH_ALL); 4011 if (ret) { 4012 err = ret; 4013 break; 4014 } 4015 progress++; 4016 trans = btrfs_start_transaction(rc->extent_root, 0); 4017 if (IS_ERR(trans)) { 4018 err = PTR_ERR(trans); 4019 trans = NULL; 4020 break; 4021 } 4022 restart: 4023 if (update_backref_cache(trans, &rc->backref_cache)) { 4024 btrfs_end_transaction(trans); 4025 trans = NULL; 4026 continue; 4027 } 4028 4029 ret = find_next_extent(rc, path, &key); 4030 if (ret < 0) 4031 err = ret; 4032 if (ret != 0) 4033 break; 4034 4035 rc->extents_found++; 4036 4037 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 4038 struct btrfs_extent_item); 4039 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 4040 if (item_size >= sizeof(*ei)) { 4041 flags = btrfs_extent_flags(path->nodes[0], ei); 4042 ret = check_extent_flags(flags); 4043 BUG_ON(ret); 4044 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 4045 err = -EINVAL; 4046 btrfs_print_v0_err(trans->fs_info); 4047 btrfs_abort_transaction(trans, err); 4048 break; 4049 } else { 4050 BUG(); 4051 } 4052 4053 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 4054 ret = add_tree_block(rc, &key, path, &blocks); 4055 } else if (rc->stage == UPDATE_DATA_PTRS && 4056 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4057 ret = add_data_references(rc, &key, path, &blocks); 4058 } else { 4059 btrfs_release_path(path); 4060 ret = 0; 4061 } 4062 if (ret < 0) { 4063 err = ret; 4064 break; 4065 } 4066 4067 if (!RB_EMPTY_ROOT(&blocks)) { 4068 ret = relocate_tree_blocks(trans, rc, &blocks); 4069 if (ret < 0) { 4070 if (ret != -EAGAIN) { 4071 err = ret; 4072 break; 4073 } 4074 rc->extents_found--; 4075 rc->search_start = key.objectid; 4076 } 4077 } 4078 4079 btrfs_end_transaction_throttle(trans); 4080 btrfs_btree_balance_dirty(fs_info); 4081 trans = NULL; 4082 4083 if (rc->stage == MOVE_DATA_EXTENTS && 4084 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4085 rc->found_file_extent = 1; 4086 ret = relocate_data_extent(rc->data_inode, 4087 &key, &rc->cluster); 4088 if (ret < 0) { 4089 err = ret; 4090 break; 4091 } 4092 } 4093 if 
(btrfs_should_cancel_balance(fs_info)) { 4094 err = -ECANCELED; 4095 break; 4096 } 4097 } 4098 if (trans && progress && err == -ENOSPC) { 4099 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 4100 if (ret == 1) { 4101 err = 0; 4102 progress = 0; 4103 goto restart; 4104 } 4105 } 4106 4107 btrfs_release_path(path); 4108 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 4109 4110 if (trans) { 4111 btrfs_end_transaction_throttle(trans); 4112 btrfs_btree_balance_dirty(fs_info); 4113 } 4114 4115 if (!err) { 4116 ret = relocate_file_extent_cluster(rc->data_inode, 4117 &rc->cluster); 4118 if (ret < 0) 4119 err = ret; 4120 } 4121 4122 rc->create_reloc_tree = 0; 4123 set_reloc_control(rc); 4124 4125 backref_cache_cleanup(&rc->backref_cache); 4126 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 4127 4128 /* 4129 * Even when the relocation is cancelled, we should still go through 4130 * prepare_to_merge() and merge_reloc_roots(). 4131 * 4132 * On error (including cancelled balance), prepare_to_merge() will 4133 * mark all reloc trees orphan, then queue them for cleanup in 4134 * merge_reloc_roots() 4135 */ 4136 err = prepare_to_merge(rc, err); 4137 4138 merge_reloc_roots(rc); 4139 4140 rc->merge_reloc_tree = 0; 4141 unset_reloc_control(rc); 4142 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 4143 4144 /* get rid of pinned extents */ 4145 trans = btrfs_join_transaction(rc->extent_root); 4146 if (IS_ERR(trans)) { 4147 err = PTR_ERR(trans); 4148 goto out_free; 4149 } 4150 btrfs_commit_transaction(trans); 4151 out_free: 4152 ret = clean_dirty_subvols(rc); 4153 if (ret < 0 && !err) 4154 err = ret; 4155 btrfs_free_block_rsv(fs_info, rc->block_rsv); 4156 btrfs_free_path(path); 4157 return err; 4158 } 4159 4160 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 4161 struct btrfs_root *root, u64 objectid) 4162 { 4163 struct btrfs_path *path; 4164 struct btrfs_inode_item *item; 4165 struct extent_buffer *leaf; 4166 int ret; 4167 4168 path = btrfs_alloc_path(); 4169 if (!path) 4170 return -ENOMEM; 4171 4172 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 4173 if (ret) 4174 goto out; 4175 4176 leaf = path->nodes[0]; 4177 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 4178 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 4179 btrfs_set_inode_generation(leaf, item, 1); 4180 btrfs_set_inode_size(leaf, item, 0); 4181 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 4182 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4183 BTRFS_INODE_PREALLOC); 4184 btrfs_mark_buffer_dirty(leaf); 4185 out: 4186 btrfs_free_path(path); 4187 return ret; 4188 } 4189 4190 /* 4191 * helper to create inode for data relocation.
4192 * the inode is in data relocation tree and its link count is 0 4193 */ 4194 static noinline_for_stack 4195 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 4196 struct btrfs_block_group *group) 4197 { 4198 struct inode *inode = NULL; 4199 struct btrfs_trans_handle *trans; 4200 struct btrfs_root *root; 4201 struct btrfs_key key; 4202 u64 objectid; 4203 int err = 0; 4204 4205 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); 4206 if (IS_ERR(root)) 4207 return ERR_CAST(root); 4208 4209 trans = btrfs_start_transaction(root, 6); 4210 if (IS_ERR(trans)) { 4211 btrfs_put_root(root); 4212 return ERR_CAST(trans); 4213 } 4214 4215 err = btrfs_find_free_objectid(root, &objectid); 4216 if (err) 4217 goto out; 4218 4219 err = __insert_orphan_inode(trans, root, objectid); 4220 BUG_ON(err); 4221 4222 key.objectid = objectid; 4223 key.type = BTRFS_INODE_ITEM_KEY; 4224 key.offset = 0; 4225 inode = btrfs_iget(fs_info->sb, &key, root); 4226 BUG_ON(IS_ERR(inode)); 4227 BTRFS_I(inode)->index_cnt = group->start; 4228 4229 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4230 out: 4231 btrfs_put_root(root); 4232 btrfs_end_transaction(trans); 4233 btrfs_btree_balance_dirty(fs_info); 4234 if (err) { 4235 if (inode) 4236 iput(inode); 4237 inode = ERR_PTR(err); 4238 } 4239 return inode; 4240 } 4241 4242 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 4243 { 4244 struct reloc_control *rc; 4245 4246 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4247 if (!rc) 4248 return NULL; 4249 4250 INIT_LIST_HEAD(&rc->reloc_roots); 4251 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 4252 backref_cache_init(&rc->backref_cache); 4253 mapping_tree_init(&rc->reloc_root_tree); 4254 extent_io_tree_init(fs_info, &rc->processed_blocks, 4255 IO_TREE_RELOC_BLOCKS, NULL); 4256 return rc; 4257 } 4258 4259 static void free_reloc_control(struct reloc_control *rc) 4260 { 4261 struct mapping_node *node, *tmp; 4262 4263 free_reloc_roots(&rc->reloc_roots); 4264 rbtree_postorder_for_each_entry_safe(node, tmp, 4265 &rc->reloc_root_tree.rb_root, rb_node) 4266 kfree(node); 4267 4268 kfree(rc); 4269 } 4270 4271 /* 4272 * Print the block group being relocated 4273 */ 4274 static void describe_relocation(struct btrfs_fs_info *fs_info, 4275 struct btrfs_block_group *block_group) 4276 { 4277 char buf[128] = {'\0'}; 4278 4279 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 4280 4281 btrfs_info(fs_info, 4282 "relocating block group %llu flags %s", 4283 block_group->start, buf); 4284 } 4285 4286 static const char *stage_to_string(int stage) 4287 { 4288 if (stage == MOVE_DATA_EXTENTS) 4289 return "move data extents"; 4290 if (stage == UPDATE_DATA_PTRS) 4291 return "update data pointers"; 4292 return "unknown"; 4293 } 4294 4295 /* 4296 * function to relocate all extents in a block group. 
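* data is relocated in two stages: MOVE_DATA_EXTENTS copies the data into the relocation inode, then UPDATE_DATA_PTRS rewrites the file extent items to point at the new copies.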

/*
 * function to relocate all extents in a block group.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(fs_info, rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX,
				 rc->block_group->start,
				 rc->block_group->length);
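
	/*
	 * What follows is the main relocation loop.  Roughly (summary added
	 * for readability; the stage names are the ones returned by
	 * stage_to_string() above): the MOVE_DATA_EXTENTS stage copies data
	 * into the relocation inode, the UPDATE_DATA_PTRS stage rewrites
	 * the file extent items to point at the new locations, and
	 * relocate_block_group() is retried until it finds no more extents
	 * in the target block group.
	 */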

	while (1) {
		int finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents. If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block. Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found, stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
	btrfs_put_block_group(rc->block_group);
	free_reloc_control(rc);
	return err;
}

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	root->root_item.drop_level = 0;
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}

/*
 * recover relocation interrupted by system crash.
 *
 * this function resumes merging reloc trees with the corresponding fs trees.
 * this is important for keeping the sharing of tree blocks
 */
int btrfs_recover_relocation(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = read_fs_root(fs_info,
					       reloc_root->root_key.offset);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);
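
	/*
	 * At this point every reloc tree left over from the interrupted
	 * relocation sits on the local reloc_roots list.  Below, the trees
	 * whose root items still have a positive ref count are re-attached
	 * to their fs roots and merged; the ones with 0 refs were already
	 * orphans at crash time (or were just marked garbage above) and are
	 * only handed to merge_reloc_roots() for cleanup.
	 */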

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	rc->extent_root = fs_info->extent_root;

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		err = __add_reloc_root(reloc_root);
		BUG_ON(err < 0); /* -ENOMEM or logic error */
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_clean;
	}
	err = btrfs_commit_transaction(trans);
out_clean:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_unset:
	unset_reloc_control(rc);
	free_reloc_control(rc);
out:
	if (!list_empty(&reloc_roots))
		free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* cleanup orphan inode in data relocation tree */
		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
		} else {
			err = btrfs_orphan_cleanup(fs_root);
			btrfs_put_root(fs_root);
		}
	}
	return err;
}

/*
 * helper to add ordered checksums for data relocation.
 *
 * cloning the checksums properly handles the nodatasum extents, and it
 * also saves the CPU time that recalculating them would take.
 */
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_sum *sums;
	struct btrfs_ordered_extent *ordered;
	int ret;
	u64 disk_bytenr;
	u64 new_bytenr;
	LIST_HEAD(list);

	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);

	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
				       disk_bytenr + len - 1, &list, 0);
	if (ret)
		goto out;
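
	/*
	 * Worked example of the rebase below (added commentary, numbers
	 * invented for illustration): if the extent originally lived at
	 * disk_bytenr = 1048576 and a csum item covers the sector at
	 * sums->bytenr = 1052672, i.e. 4096 bytes into the extent, and the
	 * ordered extent was allocated at ordered->disk_bytenr = 2097152,
	 * then
	 *
	 *	new_bytenr = 2097152 + 1052672 - 1048576 = 2101248
	 *
	 * so the csum keeps its 4096 byte offset into the extent at the
	 * new location.
	 */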

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
		sums->bytenr = new_bytenr;

		btrfs_add_ordered_sum(ordered, sums);
	}
out:
	btrfs_put_ordered_extent(ordered);
	return ret;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			__mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}
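
/*
 * Summary of the hook above (added commentary): btrfs_reloc_cow_block()
 * runs whenever a tree block is COWed while relocation is active
 * (fs_info->reloc_ctl is set).  For reloc tree blocks it points the
 * cached backref node at the new copy and queues the node on the
 * pending list; on the first COW of a leaf during the UPDATE_DATA_PTRS
 * stage it also rewrites the file extent items via
 * replace_file_extents().
 */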

/*
 * called before creating snapshot. it calculates metadata reservation
 * required for relocating tree blocks in the snapshot
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * relocation is in the stage of merging trees. the space
	 * used by merging a reloc tree is twice the size of
	 * relocated tree nodes in the worst case. half for cowing
	 * the reloc tree, half for cowing the fs tree. the space
	 * used by cowing the reloc tree will be freed after the
	 * tree is dropped. if we create a snapshot, cowing the fs
	 * tree may use more space than it frees. so we need to
	 * reserve extra space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * called after snapshot is created. migrate block reservation
 * and create reloc root for the newly created snapshot
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc = root->fs_info->reloc_ctl;
	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root,
				       new_root->root_key.objectid);
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	new_root->reloc_root = btrfs_grab_root(reloc_root);

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}