// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is utilized by resize (shrink only), profile conversion, space
 * compaction, and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents	| BG A: deleted
 *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
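 *
 * A note on terminology: a "reloc tree" is itself a btrfs root; its root_key
 * uses objectid BTRFS_TREE_RELOC_OBJECTID and carries the source tree's
 * objectid in the key offset (see create_reloc_root() below), so there is
 * one reloc tree per relocated subvolume.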
 */

#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * represents a tree block to process
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bits(&rc->processed_blocks, node->bytenr,
				node->bytenr + blocksize - 1, EXTENT_DIRTY);
	}
	node->processed = 1;
}

static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

/*
 * walk up backref nodes until we reach the node that represents the tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

static void update_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;

	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * detached nodes are used to avoid unnecessary backref
	 * lookups. a transaction commit changes the extent tree,
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * some nodes can be left in the pending list if there were
	 * errors while processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static bool reloc_root_is_dead(struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But btrfs_should_ignore_reloc_root() below is a
 * special case.
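 *
 * (A dead reloc root keeps root->reloc_root set until clean_dirty_subvols()
 * clears it, which is why reloc_root_is_dead() must be checked before the
 * pointer is used.)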
 */
static bool have_reloc_root(struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * if there is a reloc tree and it was created in a previous
	 * transaction, the backref lookup can find the reloc tree,
	 * so the backref node for the fs tree root is useless for
	 * relocation.
	 */
	return 1;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = (struct btrfs_root *)node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
				       u64 root_objectid)
{
	struct btrfs_key key;

	key.objectid = root_objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	return btrfs_get_fs_root(fs_info, &key, false);
}

/*
 * For useless nodes, do two major cleanups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphaned (no parent) during cleanup, then the
 *   child node will be cleaned up as well.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
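 *
 * The list in question is reloc_control::backref_cache.useless_node; only
 * tree root nodes (nodes with no upper edges) may appear on it, see the
 * ASSERT below.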
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					  struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block. The root of the backref tree
 * corresponds to the tree block, and the leaves of the backref tree
 * correspond to the roots of the b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when the tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
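 *
 * btrfs_backref_add_tree_node() queues newly found upper blocks on
 * cache->pending_edge, so the "recursion" is really the breadth-first loop
 * in the function body.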
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *upper;
	struct btrfs_backref_node *lower;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
						  cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		while (!list_empty(&cache->useless_node)) {
			lower = list_first_entry(&cache->useless_node,
					   struct btrfs_backref_node, list);
			list_del_init(&lower->list);
		}
		while (!list_empty(&cache->pending_edge)) {
			edge = list_first_entry(&cache->pending_edge,
					struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			upper = edge->node[UPPER];
			btrfs_backref_free_edge(cache, edge);

			/*
			 * Lower is no longer linked to any upper backref nodes
			 * and isn't in the cache, we can free it ourselves.
			 */
			if (list_empty(&lower->upper) &&
			    RB_EMPTY_NODE(&lower->rb_node))
				list_add(&lower->list, &cache->useless_node);

			if (!RB_EMPTY_NODE(&upper->rb_node))
				continue;

			/* Add this guy's upper edges to the list to process */
			list_for_each_entry(edge, &upper->upper, list[LOWER])
				list_add_tail(&edge->list[UPPER],
					      &cache->pending_edge);
			if (list_empty(&upper->upper))
				list_add(&upper->list, &cache->useless_node);
		}

		while (!list_empty(&cache->useless_node)) {
			lower = list_first_entry(&cache->useless_node,
					   struct btrfs_backref_node, list);
			list_del_init(&lower->list);
			if (lower == node)
				node = NULL;
			btrfs_backref_free_node(cache, lower);
		}

		btrfs_backref_cleanup_node(cache, node);
		ASSERT(list_empty(&cache->useless_node) &&
		       list_empty(&cache->pending_edge));
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * helper to add a backref node for the newly created snapshot.
 * the backref node is created by cloning the backref node that
 * corresponds to the root of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_panic(fs_info, -EEXIST,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
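	/*
	 * The mapping was inserted keyed on the commit root's start bytenr
	 * (see __add_reloc_root()), so the lookup below uses
	 * root->commit_root->start as well.
	 */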
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		if (!node)
			return;
		BUG_ON((struct btrfs_root *)node->data != root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	BUG_ON(!root_item);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have the RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
	}

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		root_item->drop_level = 0;
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	BUG_ON(ret);
	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	BUG_ON(IS_ERR(reloc_root));
	set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
}

/*
 * create a reloc tree for a given fs tree. a reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		goto out;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until the current relocation finishes */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	BUG_ON(ret);
	btrfs_put_root(reloc_root);
out:
	return 0;
}

/*
 * helper to find the first cached inode with an inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying blocks in the fs tree, wait for readpage
		 * to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end);
				if (!ret)
					continue;

				btrfs_drop_extent_cache(BTRFS_I(inode),
						key.offset, end, 1);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
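		/*
		 * new_bytenr is where the data reloc inode holds the
		 * relocated copy of this extent; a failure means no matching
		 * relocated copy was found.
		 */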
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
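 *
 * Note that @src must be the reloc tree and @dest the corresponding fs
 * tree; the BUG_ONs at the top of the function enforce this.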
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	btrfs_set_lock_blocking_write(eb);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
		BUG_ON(ret);
	}
	btrfs_set_lock_blocking_write(eb);

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		struct btrfs_key first_key;

		level = btrfs_header_level(parent);
		BUG_ON(level < lowest_level);

		ret = btrfs_bin_search(parent, &key, level, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(parent, &first_key, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
					     level - 1, &first_key);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			} else if (!extent_buffer_uptodate(eb)) {
				ret = -EIO;
				free_extent_buffer(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb);
				BUG_ON(ret);
			}
			btrfs_set_lock_blocking_write(eb);

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		path->lowest_level = 0;
		BUG_ON(ret);

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find the next relocated block in the reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find the relocated block of the lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
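	/*
	 * Node pointers with generation <= last_snapshot predate the reloc
	 * tree and thus were not relocated; skip them on the way down.
	 */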
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *eb = NULL;
	int i;
	u64 bytenr;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		struct btrfs_key first_key;

		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
		btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
		eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
				     &first_key);
		if (IS_ERR(eb)) {
			return PTR_ERR(eb);
		} else if (!extent_buffer_uptodate(eb)) {
			free_extent_buffer(eb);
			return -EIO;
		}
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose keys are in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   struct btrfs_key *min_key,
				   struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		cond_resched();
		iput(inode);

		if (objectid > max_key->objectid)
			break;

		inode = find_next_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(BTRFS_I(inode));

		if (ino > max_key->objectid) {
			iput(inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for readpage to complete */
		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	reloc_root_item->drop_level = 0;
	btrfs_set_root_refs(reloc_root_item, 0);
	btrfs_update_reloc_root(trans, root);

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root. If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in the reloc tree with the corresponding
 * fs tree.
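 *
 * This walks the reloc tree level by level: walk_down_reloc_tree() finds
 * the lowest relocated block, replace_path() swaps it into the fs tree,
 * and walk_up_reloc_tree() advances to the next candidate. Progress is
 * saved in the reloc root's drop_progress so the merge can resume after a
 * transaction commit.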
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int level;
	int max_level;
	int replaced = 0;
	int ret;
	int err = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
					     BTRFS_RESERVE_FLUSH_ALL);
		if (ret) {
			err = ret;
			goto out;
		}
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		reloc_root->last_trans = trans->transid;
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0) {
			err = ret;
			goto out;
		}

		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
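		 * The saved key is also where btrfs_search_slot() resumes
		 * when merge_reloc_root() is restarted, see the drop_progress
		 * branch at the top of this function.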
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		root_item->drop_level = level;

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case where only one block in the fs tree needs to be
	 * relocated and that block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
	if (ret < 0)
		err = ret;
out:
	btrfs_free_path(path);

	if (err == 0)
		insert_dirty_subvol(trans, rc, root);

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return err;
}

static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = read_fs_root(fs_info, reloc_root->root_key.offset);
		BUG_ON(IS_ERR(root));
		BUG_ON(root->reloc_root != reloc_root);

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		btrfs_update_reloc_root(trans, root);

		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root;

	while (!list_empty(list)) {
		reloc_root = list_entry(list->next, struct btrfs_root,
					root_list);
		__del_reloc_root(reloc_root);
	}
}

static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * this serializes us with btrfs_record_root_in_transaction,
	 * we have to make sure nobody is in the middle of
	 * adding their roots to the list while we are
	 * doing this splice
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			root = read_fs_root(fs_info,
					    reloc_root->root_key.offset);
			BUG_ON(IS_ERR(root));
			BUG_ON(root->reloc_root != reloc_root);

			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		if (!list_empty(&reloc_roots))
			free_reloc_roots(&reloc_roots);

		/* new reloc root may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		if (!list_empty(&reloc_roots))
			free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong. If we fail to start the transaction in
	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
	 * have actually been removed from the reloc_root_tree rb tree. This is
	 * fine because we're bailing here, and we hold a reference on the root
	 * for the list that holds it, so these roots will be cleaned up when we
	 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
	 * will be cleaned up on unmount.
	 *
	 * The remaining nodes will be cleaned up by free_reloc_control.
2023 */ 2024 } 2025 2026 static void free_block_list(struct rb_root *blocks) 2027 { 2028 struct tree_block *block; 2029 struct rb_node *rb_node; 2030 while ((rb_node = rb_first(blocks))) { 2031 block = rb_entry(rb_node, struct tree_block, rb_node); 2032 rb_erase(rb_node, blocks); 2033 kfree(block); 2034 } 2035 } 2036 2037 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2038 struct btrfs_root *reloc_root) 2039 { 2040 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2041 struct btrfs_root *root; 2042 int ret; 2043 2044 if (reloc_root->last_trans == trans->transid) 2045 return 0; 2046 2047 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2048 BUG_ON(IS_ERR(root)); 2049 BUG_ON(root->reloc_root != reloc_root); 2050 ret = btrfs_record_root_in_trans(trans, root); 2051 btrfs_put_root(root); 2052 2053 return ret; 2054 } 2055 2056 static noinline_for_stack 2057 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2058 struct reloc_control *rc, 2059 struct btrfs_backref_node *node, 2060 struct btrfs_backref_edge *edges[]) 2061 { 2062 struct btrfs_backref_node *next; 2063 struct btrfs_root *root; 2064 int index = 0; 2065 2066 next = node; 2067 while (1) { 2068 cond_resched(); 2069 next = walk_up_backref(next, edges, &index); 2070 root = next->root; 2071 BUG_ON(!root); 2072 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); 2073 2074 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2075 record_reloc_root_in_trans(trans, root); 2076 break; 2077 } 2078 2079 btrfs_record_root_in_trans(trans, root); 2080 root = root->reloc_root; 2081 2082 if (next->new_bytenr != root->node->start) { 2083 BUG_ON(next->new_bytenr); 2084 BUG_ON(!list_empty(&next->list)); 2085 next->new_bytenr = root->node->start; 2086 btrfs_put_root(next->root); 2087 next->root = btrfs_grab_root(root); 2088 ASSERT(next->root); 2089 list_add_tail(&next->list, 2090 &rc->backref_cache.changed); 2091 mark_block_processed(rc, next); 2092 break; 2093 } 2094 2095 WARN_ON(1); 2096 root = NULL; 2097 next = walk_down_backref(edges, &index); 2098 if (!next || next->level <= node->level) 2099 break; 2100 } 2101 if (!root) 2102 return NULL; 2103 2104 next = node; 2105 /* setup backref node path for btrfs_reloc_cow_block */ 2106 while (1) { 2107 rc->backref_cache.path[next->level] = next; 2108 if (--index < 0) 2109 break; 2110 next = edges[index]->node[UPPER]; 2111 } 2112 return root; 2113 } 2114 2115 /* 2116 * select a tree root for relocation. return NULL if the block 2117 * is reference counted. we should use do_relocation() in this 2118 * case. return a tree root pointer if the block isn't reference 2119 * counted. return -ENOENT if the block is root of reloc tree. 
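 *
 * summarizing the contract: a plain root pointer is returned either
 * for a block in a non-reference counted tree or when the block is
 * itself the root node of a fs tree; NULL means some reference
 * counted tree refers to the block (it may be shared, so every
 * parent must be updated through do_relocation()); ERR_PTR(-ENOENT)
 * means the block is only reachable from reloc tree roots.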
2120 */ 2121 static noinline_for_stack 2122 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2123 { 2124 struct btrfs_backref_node *next; 2125 struct btrfs_root *root; 2126 struct btrfs_root *fs_root = NULL; 2127 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2128 int index = 0; 2129 2130 next = node; 2131 while (1) { 2132 cond_resched(); 2133 next = walk_up_backref(next, edges, &index); 2134 root = next->root; 2135 BUG_ON(!root); 2136 2137 /* no other choice for a non-reference counted tree */ 2138 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 2139 return root; 2140 2141 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2142 fs_root = root; 2143 2144 if (next != node) 2145 return NULL; 2146 2147 next = walk_down_backref(edges, &index); 2148 if (!next || next->level <= node->level) 2149 break; 2150 } 2151 2152 if (!fs_root) 2153 return ERR_PTR(-ENOENT); 2154 return fs_root; 2155 } 2156 2157 static noinline_for_stack 2158 u64 calcu_metadata_size(struct reloc_control *rc, 2159 struct btrfs_backref_node *node, int reserve) 2160 { 2161 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2162 struct btrfs_backref_node *next = node; 2163 struct btrfs_backref_edge *edge; 2164 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2165 u64 num_bytes = 0; 2166 int index = 0; 2167 2168 BUG_ON(reserve && node->processed); 2169 2170 while (next) { 2171 cond_resched(); 2172 while (1) { 2173 if (next->processed && (reserve || next != node)) 2174 break; 2175 2176 num_bytes += fs_info->nodesize; 2177 2178 if (list_empty(&next->upper)) 2179 break; 2180 2181 edge = list_entry(next->upper.next, 2182 struct btrfs_backref_edge, list[LOWER]); 2183 edges[index++] = edge; 2184 next = edge->node[UPPER]; 2185 } 2186 next = walk_down_backref(edges, &index); 2187 } 2188 return num_bytes; 2189 } 2190 2191 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2192 struct reloc_control *rc, 2193 struct btrfs_backref_node *node) 2194 { 2195 struct btrfs_root *root = rc->extent_root; 2196 struct btrfs_fs_info *fs_info = root->fs_info; 2197 u64 num_bytes; 2198 int ret; 2199 u64 tmp; 2200 2201 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2202 2203 trans->block_rsv = rc->block_rsv; 2204 rc->reserved_bytes += num_bytes; 2205 2206 /* 2207 * We are under a transaction here so we can only do limited flushing. 2208 * If we get an enospc just kick back -EAGAIN so we know to drop the 2209 * transaction and try to refill when we can flush all the things. 2210 */ 2211 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, 2212 BTRFS_RESERVE_FLUSH_LIMIT); 2213 if (ret) { 2214 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2215 while (tmp <= rc->reserved_bytes) 2216 tmp <<= 1; 2217 /* 2218 * only one thread can access block_rsv at this point, 2219 * so we don't need to hold a lock to protect block_rsv. 2220 * we expand the reservation size here to allow enough 2221 * space for relocation, and we will return early in the 2222 * enospc case. 2223 */ 2224 rc->block_rsv->size = tmp + fs_info->nodesize * 2225 RELOCATION_RESERVED_NODES; 2226 return -EAGAIN; 2227 } 2228 2229 return 0; 2230 } 2231 2232 /* 2233 * relocate a block tree, and then update pointers in upper level 2234 * blocks that reference the block to point to the new location. 2235 * 2236 * if called by link_to_upper, the block has already been relocated. 2237 * in that case this function just updates pointers.
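 *
 * the two callers in this file differ only in @lowest:
 * relocate_tree_block() passes lowest == 1 to COW the block into its
 * new location, while link_to_upper() passes lowest == 0 to merely
 * repoint the parent slots at node->eb->start.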
2238 */ 2239 static int do_relocation(struct btrfs_trans_handle *trans, 2240 struct reloc_control *rc, 2241 struct btrfs_backref_node *node, 2242 struct btrfs_key *key, 2243 struct btrfs_path *path, int lowest) 2244 { 2245 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2246 struct btrfs_backref_node *upper; 2247 struct btrfs_backref_edge *edge; 2248 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2249 struct btrfs_root *root; 2250 struct extent_buffer *eb; 2251 u32 blocksize; 2252 u64 bytenr; 2253 u64 generation; 2254 int slot; 2255 int ret; 2256 int err = 0; 2257 2258 BUG_ON(lowest && node->eb); 2259 2260 path->lowest_level = node->level + 1; 2261 rc->backref_cache.path[node->level] = node; 2262 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2263 struct btrfs_key first_key; 2264 struct btrfs_ref ref = { 0 }; 2265 2266 cond_resched(); 2267 2268 upper = edge->node[UPPER]; 2269 root = select_reloc_root(trans, rc, upper, edges); 2270 BUG_ON(!root); 2271 2272 if (upper->eb && !upper->locked) { 2273 if (!lowest) { 2274 ret = btrfs_bin_search(upper->eb, key, 2275 upper->level, &slot); 2276 if (ret < 0) { 2277 err = ret; 2278 goto next; 2279 } 2280 BUG_ON(ret); 2281 bytenr = btrfs_node_blockptr(upper->eb, slot); 2282 if (node->eb->start == bytenr) 2283 goto next; 2284 } 2285 btrfs_backref_drop_node_buffer(upper); 2286 } 2287 2288 if (!upper->eb) { 2289 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2290 if (ret) { 2291 if (ret < 0) 2292 err = ret; 2293 else 2294 err = -ENOENT; 2295 2296 btrfs_release_path(path); 2297 break; 2298 } 2299 2300 if (!upper->eb) { 2301 upper->eb = path->nodes[upper->level]; 2302 path->nodes[upper->level] = NULL; 2303 } else { 2304 BUG_ON(upper->eb != path->nodes[upper->level]); 2305 } 2306 2307 upper->locked = 1; 2308 path->locks[upper->level] = 0; 2309 2310 slot = path->slots[upper->level]; 2311 btrfs_release_path(path); 2312 } else { 2313 ret = btrfs_bin_search(upper->eb, key, upper->level, 2314 &slot); 2315 if (ret < 0) { 2316 err = ret; 2317 goto next; 2318 } 2319 BUG_ON(ret); 2320 } 2321 2322 bytenr = btrfs_node_blockptr(upper->eb, slot); 2323 if (lowest) { 2324 if (bytenr != node->bytenr) { 2325 btrfs_err(root->fs_info, 2326 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2327 bytenr, node->bytenr, slot, 2328 upper->eb->start); 2329 err = -EIO; 2330 goto next; 2331 } 2332 } else { 2333 if (node->eb->start == bytenr) 2334 goto next; 2335 } 2336 2337 blocksize = root->fs_info->nodesize; 2338 generation = btrfs_node_ptr_generation(upper->eb, slot); 2339 btrfs_node_key_to_cpu(upper->eb, &first_key, slot); 2340 eb = read_tree_block(fs_info, bytenr, generation, 2341 upper->level - 1, &first_key); 2342 if (IS_ERR(eb)) { 2343 err = PTR_ERR(eb); 2344 goto next; 2345 } else if (!extent_buffer_uptodate(eb)) { 2346 free_extent_buffer(eb); 2347 err = -EIO; 2348 goto next; 2349 } 2350 btrfs_tree_lock(eb); 2351 btrfs_set_lock_blocking_write(eb); 2352 2353 if (!node->eb) { 2354 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2355 slot, &eb); 2356 btrfs_tree_unlock(eb); 2357 free_extent_buffer(eb); 2358 if (ret < 0) { 2359 err = ret; 2360 goto next; 2361 } 2362 BUG_ON(node->eb != eb); 2363 } else { 2364 btrfs_set_node_blockptr(upper->eb, slot, 2365 node->eb->start); 2366 btrfs_set_node_ptr_generation(upper->eb, slot, 2367 trans->transid); 2368 btrfs_mark_buffer_dirty(upper->eb); 2369 2370 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2371 node->eb->start, blocksize, 2372 upper->eb->start); 2373 ref.real_root 
= root->root_key.objectid; 2374 btrfs_init_tree_ref(&ref, node->level, 2375 btrfs_header_owner(upper->eb)); 2376 ret = btrfs_inc_extent_ref(trans, &ref); 2377 BUG_ON(ret); 2378 2379 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2380 BUG_ON(ret); 2381 } 2382 next: 2383 if (!upper->pending) 2384 btrfs_backref_drop_node_buffer(upper); 2385 else 2386 btrfs_backref_unlock_node_buffer(upper); 2387 if (err) 2388 break; 2389 } 2390 2391 if (!err && node->pending) { 2392 btrfs_backref_drop_node_buffer(node); 2393 list_move_tail(&node->list, &rc->backref_cache.changed); 2394 node->pending = 0; 2395 } 2396 2397 path->lowest_level = 0; 2398 BUG_ON(err == -ENOSPC); 2399 return err; 2400 } 2401 2402 static int link_to_upper(struct btrfs_trans_handle *trans, 2403 struct reloc_control *rc, 2404 struct btrfs_backref_node *node, 2405 struct btrfs_path *path) 2406 { 2407 struct btrfs_key key; 2408 2409 btrfs_node_key_to_cpu(node->eb, &key, 0); 2410 return do_relocation(trans, rc, node, &key, path, 0); 2411 } 2412 2413 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2414 struct reloc_control *rc, 2415 struct btrfs_path *path, int err) 2416 { 2417 LIST_HEAD(list); 2418 struct btrfs_backref_cache *cache = &rc->backref_cache; 2419 struct btrfs_backref_node *node; 2420 int level; 2421 int ret; 2422 2423 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2424 while (!list_empty(&cache->pending[level])) { 2425 node = list_entry(cache->pending[level].next, 2426 struct btrfs_backref_node, list); 2427 list_move_tail(&node->list, &list); 2428 BUG_ON(!node->pending); 2429 2430 if (!err) { 2431 ret = link_to_upper(trans, rc, node, path); 2432 if (ret < 0) 2433 err = ret; 2434 } 2435 } 2436 list_splice_init(&list, &cache->pending[level]); 2437 } 2438 return err; 2439 } 2440 2441 /* 2442 * mark a block and all blocks that directly/indirectly reference the block 2443 * as processed.
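 *
 * the walk below is iterative rather than recursive: edges[] serves
 * as an explicit stack bounded by BTRFS_MAX_LEVEL - 1, so marking a
 * whole reference subtree never recurses deeper than the tree height.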
2444 */ 2445 static void update_processed_blocks(struct reloc_control *rc, 2446 struct btrfs_backref_node *node) 2447 { 2448 struct btrfs_backref_node *next = node; 2449 struct btrfs_backref_edge *edge; 2450 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2451 int index = 0; 2452 2453 while (next) { 2454 cond_resched(); 2455 while (1) { 2456 if (next->processed) 2457 break; 2458 2459 mark_block_processed(rc, next); 2460 2461 if (list_empty(&next->upper)) 2462 break; 2463 2464 edge = list_entry(next->upper.next, 2465 struct btrfs_backref_edge, list[LOWER]); 2466 edges[index++] = edge; 2467 next = edge->node[UPPER]; 2468 } 2469 next = walk_down_backref(edges, &index); 2470 } 2471 } 2472 2473 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2474 { 2475 u32 blocksize = rc->extent_root->fs_info->nodesize; 2476 2477 if (test_range_bit(&rc->processed_blocks, bytenr, 2478 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2479 return 1; 2480 return 0; 2481 } 2482 2483 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2484 struct tree_block *block) 2485 { 2486 struct extent_buffer *eb; 2487 2488 eb = read_tree_block(fs_info, block->bytenr, block->key.offset, 2489 block->level, NULL); 2490 if (IS_ERR(eb)) { 2491 return PTR_ERR(eb); 2492 } else if (!extent_buffer_uptodate(eb)) { 2493 free_extent_buffer(eb); 2494 return -EIO; 2495 } 2496 if (block->level == 0) 2497 btrfs_item_key_to_cpu(eb, &block->key, 0); 2498 else 2499 btrfs_node_key_to_cpu(eb, &block->key, 0); 2500 free_extent_buffer(eb); 2501 block->key_ready = 1; 2502 return 0; 2503 } 2504 2505 /* 2506 * helper function to relocate a tree block 2507 */ 2508 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2509 struct reloc_control *rc, 2510 struct btrfs_backref_node *node, 2511 struct btrfs_key *key, 2512 struct btrfs_path *path) 2513 { 2514 struct btrfs_root *root; 2515 int ret = 0; 2516 2517 if (!node) 2518 return 0; 2519 2520 /* 2521 * If we fail here we want to drop our backref_node because we are going 2522 * to start over and regenerate the tree for it. 
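 *
 * "fail here" refers to reserve_metadata_space() returning -EAGAIN:
 * the caller then drops the transaction, refills the reservation and
 * retries this extent, so a stale backref_node must not survive into
 * the retry.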
2523 */ 2524 ret = reserve_metadata_space(trans, rc, node); 2525 if (ret) 2526 goto out; 2527 2528 BUG_ON(node->processed); 2529 root = select_one_root(node); 2530 if (root == ERR_PTR(-ENOENT)) { 2531 update_processed_blocks(rc, node); 2532 goto out; 2533 } 2534 2535 if (root) { 2536 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 2537 BUG_ON(node->new_bytenr); 2538 BUG_ON(!list_empty(&node->list)); 2539 btrfs_record_root_in_trans(trans, root); 2540 root = root->reloc_root; 2541 node->new_bytenr = root->node->start; 2542 btrfs_put_root(node->root); 2543 node->root = btrfs_grab_root(root); 2544 ASSERT(node->root); 2545 list_add_tail(&node->list, &rc->backref_cache.changed); 2546 } else { 2547 path->lowest_level = node->level; 2548 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2549 btrfs_release_path(path); 2550 if (ret > 0) 2551 ret = 0; 2552 } 2553 if (!ret) 2554 update_processed_blocks(rc, node); 2555 } else { 2556 ret = do_relocation(trans, rc, node, key, path, 1); 2557 } 2558 out: 2559 if (ret || node->level == 0 || node->cowonly) 2560 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2561 return ret; 2562 } 2563 2564 /* 2565 * relocate a list of blocks 2566 */ 2567 static noinline_for_stack 2568 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2569 struct reloc_control *rc, struct rb_root *blocks) 2570 { 2571 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2572 struct btrfs_backref_node *node; 2573 struct btrfs_path *path; 2574 struct tree_block *block; 2575 struct tree_block *next; 2576 int ret; 2577 int err = 0; 2578 2579 path = btrfs_alloc_path(); 2580 if (!path) { 2581 err = -ENOMEM; 2582 goto out_free_blocks; 2583 } 2584 2585 /* Kick in readahead for tree blocks with missing keys */ 2586 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2587 if (!block->key_ready) 2588 readahead_tree_block(fs_info, block->bytenr); 2589 } 2590 2591 /* Get first keys */ 2592 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2593 if (!block->key_ready) { 2594 err = get_tree_block_key(fs_info, block); 2595 if (err) 2596 goto out_free_path; 2597 } 2598 } 2599 2600 /* Do tree relocation */ 2601 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2602 node = build_backref_tree(rc, &block->key, 2603 block->level, block->bytenr); 2604 if (IS_ERR(node)) { 2605 err = PTR_ERR(node); 2606 goto out; 2607 } 2608 2609 ret = relocate_tree_block(trans, rc, node, &block->key, 2610 path); 2611 if (ret < 0) { 2612 err = ret; 2613 break; 2614 } 2615 } 2616 out: 2617 err = finish_pending_nodes(trans, rc, path, err); 2618 2619 out_free_path: 2620 btrfs_free_path(path); 2621 out_free_blocks: 2622 free_block_list(blocks); 2623 return err; 2624 } 2625 2626 static noinline_for_stack 2627 int prealloc_file_extent_cluster(struct inode *inode, 2628 struct file_extent_cluster *cluster) 2629 { 2630 u64 alloc_hint = 0; 2631 u64 start; 2632 u64 end; 2633 u64 offset = BTRFS_I(inode)->index_cnt; 2634 u64 num_bytes; 2635 int nr = 0; 2636 int ret = 0; 2637 u64 prealloc_start = cluster->start - offset; 2638 u64 prealloc_end = cluster->end - offset; 2639 u64 cur_offset; 2640 struct extent_changeset *data_reserved = NULL; 2641 2642 BUG_ON(cluster->start != cluster->boundary[0]); 2643 inode_lock(inode); 2644 2645 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, 2646 prealloc_end + 1 - prealloc_start); 2647 if (ret) 2648 goto out; 2649 2650 cur_offset = prealloc_start; 2651 while (nr < cluster->nr) { 2652 start = 
cluster->boundary[nr] - offset; 2653 if (nr + 1 < cluster->nr) 2654 end = cluster->boundary[nr + 1] - 1 - offset; 2655 else 2656 end = cluster->end - offset; 2657 2658 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2659 num_bytes = end + 1 - start; 2660 if (cur_offset < start) 2661 btrfs_free_reserved_data_space(inode, data_reserved, 2662 cur_offset, start - cur_offset); 2663 ret = btrfs_prealloc_file_range(inode, 0, start, 2664 num_bytes, num_bytes, 2665 end + 1, &alloc_hint); 2666 cur_offset = end + 1; 2667 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2668 if (ret) 2669 break; 2670 nr++; 2671 } 2672 if (cur_offset < prealloc_end) 2673 btrfs_free_reserved_data_space(inode, data_reserved, 2674 cur_offset, prealloc_end + 1 - cur_offset); 2675 out: 2676 inode_unlock(inode); 2677 extent_changeset_free(data_reserved); 2678 return ret; 2679 } 2680 2681 static noinline_for_stack 2682 int setup_extent_mapping(struct inode *inode, u64 start, u64 end, 2683 u64 block_start) 2684 { 2685 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2686 struct extent_map *em; 2687 int ret = 0; 2688 2689 em = alloc_extent_map(); 2690 if (!em) 2691 return -ENOMEM; 2692 2693 em->start = start; 2694 em->len = end + 1 - start; 2695 em->block_len = em->len; 2696 em->block_start = block_start; 2697 set_bit(EXTENT_FLAG_PINNED, &em->flags); 2698 2699 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2700 while (1) { 2701 write_lock(&em_tree->lock); 2702 ret = add_extent_mapping(em_tree, em, 0); 2703 write_unlock(&em_tree->lock); 2704 if (ret != -EEXIST) { 2705 free_extent_map(em); 2706 break; 2707 } 2708 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 2709 } 2710 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2711 return ret; 2712 } 2713 2714 /* 2715 * Allow error injection to test balance cancellation 2716 */ 2717 int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) 2718 { 2719 return atomic_read(&fs_info->balance_cancel_req); 2720 } 2721 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2722 2723 static int relocate_file_extent_cluster(struct inode *inode, 2724 struct file_extent_cluster *cluster) 2725 { 2726 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2727 u64 page_start; 2728 u64 page_end; 2729 u64 offset = BTRFS_I(inode)->index_cnt; 2730 unsigned long index; 2731 unsigned long last_index; 2732 struct page *page; 2733 struct file_ra_state *ra; 2734 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2735 int nr = 0; 2736 int ret = 0; 2737 2738 if (!cluster->nr) 2739 return 0; 2740 2741 ra = kzalloc(sizeof(*ra), GFP_NOFS); 2742 if (!ra) 2743 return -ENOMEM; 2744 2745 ret = prealloc_file_extent_cluster(inode, cluster); 2746 if (ret) 2747 goto out; 2748 2749 file_ra_state_init(ra, inode->i_mapping); 2750 2751 ret = setup_extent_mapping(inode, cluster->start - offset, 2752 cluster->end - offset, cluster->start); 2753 if (ret) 2754 goto out; 2755 2756 index = (cluster->start - offset) >> PAGE_SHIFT; 2757 last_index = (cluster->end - offset) >> PAGE_SHIFT; 2758 while (index <= last_index) { 2759 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2760 PAGE_SIZE); 2761 if (ret) 2762 goto out; 2763 2764 page = find_lock_page(inode->i_mapping, index); 2765 if (!page) { 2766 page_cache_sync_readahead(inode->i_mapping, 2767 ra, NULL, index, 2768 last_index + 1 - index); 2769 page = find_or_create_page(inode->i_mapping, index, 2770 mask); 2771 if (!page) { 2772 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2773 PAGE_SIZE, true); 2774 
btrfs_delalloc_release_extents(BTRFS_I(inode), 2775 PAGE_SIZE); 2776 ret = -ENOMEM; 2777 goto out; 2778 } 2779 } 2780 2781 if (PageReadahead(page)) { 2782 page_cache_async_readahead(inode->i_mapping, 2783 ra, NULL, page, index, 2784 last_index + 1 - index); 2785 } 2786 2787 if (!PageUptodate(page)) { 2788 btrfs_readpage(NULL, page); 2789 lock_page(page); 2790 if (!PageUptodate(page)) { 2791 unlock_page(page); 2792 put_page(page); 2793 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2794 PAGE_SIZE, true); 2795 btrfs_delalloc_release_extents(BTRFS_I(inode), 2796 PAGE_SIZE); 2797 ret = -EIO; 2798 goto out; 2799 } 2800 } 2801 2802 page_start = page_offset(page); 2803 page_end = page_start + PAGE_SIZE - 1; 2804 2805 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 2806 2807 set_page_extent_mapped(page); 2808 2809 if (nr < cluster->nr && 2810 page_start + offset == cluster->boundary[nr]) { 2811 set_extent_bits(&BTRFS_I(inode)->io_tree, 2812 page_start, page_end, 2813 EXTENT_BOUNDARY); 2814 nr++; 2815 } 2816 2817 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 2818 NULL); 2819 if (ret) { 2820 unlock_page(page); 2821 put_page(page); 2822 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2823 PAGE_SIZE, true); 2824 btrfs_delalloc_release_extents(BTRFS_I(inode), 2825 PAGE_SIZE); 2826 2827 clear_extent_bits(&BTRFS_I(inode)->io_tree, 2828 page_start, page_end, 2829 EXTENT_LOCKED | EXTENT_BOUNDARY); 2830 goto out; 2831 2832 } 2833 set_page_dirty(page); 2834 2835 unlock_extent(&BTRFS_I(inode)->io_tree, 2836 page_start, page_end); 2837 unlock_page(page); 2838 put_page(page); 2839 2840 index++; 2841 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); 2842 balance_dirty_pages_ratelimited(inode->i_mapping); 2843 btrfs_throttle(fs_info); 2844 if (btrfs_should_cancel_balance(fs_info)) { 2845 ret = -ECANCELED; 2846 goto out; 2847 } 2848 } 2849 WARN_ON(nr != cluster->nr); 2850 out: 2851 kfree(ra); 2852 return ret; 2853 } 2854 2855 static noinline_for_stack 2856 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 2857 struct file_extent_cluster *cluster) 2858 { 2859 int ret; 2860 2861 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 2862 ret = relocate_file_extent_cluster(inode, cluster); 2863 if (ret) 2864 return ret; 2865 cluster->nr = 0; 2866 } 2867 2868 if (!cluster->nr) 2869 cluster->start = extent_key->objectid; 2870 else 2871 BUG_ON(cluster->nr >= MAX_EXTENTS); 2872 cluster->end = extent_key->objectid + extent_key->offset - 1; 2873 cluster->boundary[cluster->nr] = extent_key->objectid; 2874 cluster->nr++; 2875 2876 if (cluster->nr >= MAX_EXTENTS) { 2877 ret = relocate_file_extent_cluster(inode, cluster); 2878 if (ret) 2879 return ret; 2880 cluster->nr = 0; 2881 } 2882 return 0; 2883 } 2884 2885 /* 2886 * helper to add a tree block to the list. 
2887 * the major work is getting the generation and level of the block 2888 */ 2889 static int add_tree_block(struct reloc_control *rc, 2890 struct btrfs_key *extent_key, 2891 struct btrfs_path *path, 2892 struct rb_root *blocks) 2893 { 2894 struct extent_buffer *eb; 2895 struct btrfs_extent_item *ei; 2896 struct btrfs_tree_block_info *bi; 2897 struct tree_block *block; 2898 struct rb_node *rb_node; 2899 u32 item_size; 2900 int level = -1; 2901 u64 generation; 2902 2903 eb = path->nodes[0]; 2904 item_size = btrfs_item_size_nr(eb, path->slots[0]); 2905 2906 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 2907 item_size >= sizeof(*ei) + sizeof(*bi)) { 2908 ei = btrfs_item_ptr(eb, path->slots[0], 2909 struct btrfs_extent_item); 2910 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 2911 bi = (struct btrfs_tree_block_info *)(ei + 1); 2912 level = btrfs_tree_block_level(eb, bi); 2913 } else { 2914 level = (int)extent_key->offset; 2915 } 2916 generation = btrfs_extent_generation(eb, ei); 2917 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 2918 btrfs_print_v0_err(eb->fs_info); 2919 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 2920 return -EINVAL; 2921 } else { 2922 BUG(); 2923 } 2924 2925 btrfs_release_path(path); 2926 2927 BUG_ON(level == -1); 2928 2929 block = kmalloc(sizeof(*block), GFP_NOFS); 2930 if (!block) 2931 return -ENOMEM; 2932 2933 block->bytenr = extent_key->objectid; 2934 block->key.objectid = rc->extent_root->fs_info->nodesize; 2935 block->key.offset = generation; 2936 block->level = level; 2937 block->key_ready = 0; 2938 2939 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 2940 if (rb_node) 2941 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 2942 -EEXIST); 2943 2944 return 0; 2945 } 2946 2947 /* 2948 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 2949 */ 2950 static int __add_tree_block(struct reloc_control *rc, 2951 u64 bytenr, u32 blocksize, 2952 struct rb_root *blocks) 2953 { 2954 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2955 struct btrfs_path *path; 2956 struct btrfs_key key; 2957 int ret; 2958 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 2959 2960 if (tree_block_processed(bytenr, rc)) 2961 return 0; 2962 2963 if (rb_simple_search(blocks, bytenr)) 2964 return 0; 2965 2966 path = btrfs_alloc_path(); 2967 if (!path) 2968 return -ENOMEM; 2969 again: 2970 key.objectid = bytenr; 2971 if (skinny) { 2972 key.type = BTRFS_METADATA_ITEM_KEY; 2973 key.offset = (u64)-1; 2974 } else { 2975 key.type = BTRFS_EXTENT_ITEM_KEY; 2976 key.offset = blocksize; 2977 } 2978 2979 path->search_commit_root = 1; 2980 path->skip_locking = 1; 2981 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 2982 if (ret < 0) 2983 goto out; 2984 2985 if (ret > 0 && skinny) { 2986 if (path->slots[0]) { 2987 path->slots[0]--; 2988 btrfs_item_key_to_cpu(path->nodes[0], &key, 2989 path->slots[0]); 2990 if (key.objectid == bytenr && 2991 (key.type == BTRFS_METADATA_ITEM_KEY || 2992 (key.type == BTRFS_EXTENT_ITEM_KEY && 2993 key.offset == blocksize))) 2994 ret = 0; 2995 } 2996 2997 if (ret) { 2998 skinny = false; 2999 btrfs_release_path(path); 3000 goto again; 3001 } 3002 } 3003 if (ret) { 3004 ASSERT(ret == 1); 3005 btrfs_print_leaf(path->nodes[0]); 3006 btrfs_err(fs_info, 3007 "tree block extent item (%llu) is not found in extent tree", 3008 bytenr); 3009 WARN_ON(1); 3010 ret = -EINVAL; 3011 goto out; 3012 } 3013 3014 ret = add_tree_block(rc, &key, path, blocks); 3015 out: 3016 
btrfs_free_path(path); 3017 return ret; 3018 } 3019 3020 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3021 struct btrfs_block_group *block_group, 3022 struct inode *inode, 3023 u64 ino) 3024 { 3025 struct btrfs_key key; 3026 struct btrfs_root *root = fs_info->tree_root; 3027 struct btrfs_trans_handle *trans; 3028 int ret = 0; 3029 3030 if (inode) 3031 goto truncate; 3032 3033 key.objectid = ino; 3034 key.type = BTRFS_INODE_ITEM_KEY; 3035 key.offset = 0; 3036 3037 inode = btrfs_iget(fs_info->sb, &key, root); 3038 if (IS_ERR(inode)) 3039 return -ENOENT; 3040 3041 truncate: 3042 ret = btrfs_check_trunc_cache_free_space(fs_info, 3043 &fs_info->global_block_rsv); 3044 if (ret) 3045 goto out; 3046 3047 trans = btrfs_join_transaction(root); 3048 if (IS_ERR(trans)) { 3049 ret = PTR_ERR(trans); 3050 goto out; 3051 } 3052 3053 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3054 3055 btrfs_end_transaction(trans); 3056 btrfs_btree_balance_dirty(fs_info); 3057 out: 3058 iput(inode); 3059 return ret; 3060 } 3061 3062 /* 3063 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3064 * cache inode, to avoid free space cache data extent blocking data relocation. 3065 */ 3066 static int delete_v1_space_cache(struct extent_buffer *leaf, 3067 struct btrfs_block_group *block_group, 3068 u64 data_bytenr) 3069 { 3070 u64 space_cache_ino; 3071 struct btrfs_file_extent_item *ei; 3072 struct btrfs_key key; 3073 bool found = false; 3074 int i; 3075 int ret; 3076 3077 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3078 return 0; 3079 3080 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3081 btrfs_item_key_to_cpu(leaf, &key, i); 3082 if (key.type != BTRFS_EXTENT_DATA_KEY) 3083 continue; 3084 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3085 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_REG && 3086 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3087 found = true; 3088 space_cache_ino = key.objectid; 3089 break; 3090 } 3091 } 3092 if (!found) 3093 return -ENOENT; 3094 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3095 space_cache_ino); 3096 return ret; 3097 } 3098 3099 /* 3100 * helper to find all tree blocks that reference a given data extent 3101 */ 3102 static noinline_for_stack 3103 int add_data_references(struct reloc_control *rc, 3104 struct btrfs_key *extent_key, 3105 struct btrfs_path *path, 3106 struct rb_root *blocks) 3107 { 3108 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3109 struct ulist *leaves = NULL; 3110 struct ulist_iterator leaf_uiter; 3111 struct ulist_node *ref_node = NULL; 3112 const u32 blocksize = fs_info->nodesize; 3113 int ret = 0; 3114 3115 btrfs_release_path(path); 3116 ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, 3117 0, &leaves, NULL, true); 3118 if (ret < 0) 3119 return ret; 3120 3121 ULIST_ITER_INIT(&leaf_uiter); 3122 while ((ref_node = ulist_next(leaves, &leaf_uiter))) { 3123 struct extent_buffer *eb; 3124 3125 eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL); 3126 if (IS_ERR(eb)) { 3127 ret = PTR_ERR(eb); 3128 break; 3129 } 3130 ret = delete_v1_space_cache(eb, rc->block_group, 3131 extent_key->objectid); 3132 free_extent_buffer(eb); 3133 if (ret < 0) 3134 break; 3135 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3136 if (ret < 0) 3137 break; 3138 } 3139 if (ret < 0) 3140 free_block_list(blocks); 3141 ulist_free(leaves); 3142 return ret; 3143 } 3144 3145 /* 3146 * helper to find next unprocessed 
extent 3147 */ 3148 static noinline_for_stack 3149 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3150 struct btrfs_key *extent_key) 3151 { 3152 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3153 struct btrfs_key key; 3154 struct extent_buffer *leaf; 3155 u64 start, end, last; 3156 int ret; 3157 3158 last = rc->block_group->start + rc->block_group->length; 3159 while (1) { 3160 cond_resched(); 3161 if (rc->search_start >= last) { 3162 ret = 1; 3163 break; 3164 } 3165 3166 key.objectid = rc->search_start; 3167 key.type = BTRFS_EXTENT_ITEM_KEY; 3168 key.offset = 0; 3169 3170 path->search_commit_root = 1; 3171 path->skip_locking = 1; 3172 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3173 0, 0); 3174 if (ret < 0) 3175 break; 3176 next: 3177 leaf = path->nodes[0]; 3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3179 ret = btrfs_next_leaf(rc->extent_root, path); 3180 if (ret != 0) 3181 break; 3182 leaf = path->nodes[0]; 3183 } 3184 3185 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3186 if (key.objectid >= last) { 3187 ret = 1; 3188 break; 3189 } 3190 3191 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3192 key.type != BTRFS_METADATA_ITEM_KEY) { 3193 path->slots[0]++; 3194 goto next; 3195 } 3196 3197 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3198 key.objectid + key.offset <= rc->search_start) { 3199 path->slots[0]++; 3200 goto next; 3201 } 3202 3203 if (key.type == BTRFS_METADATA_ITEM_KEY && 3204 key.objectid + fs_info->nodesize <= 3205 rc->search_start) { 3206 path->slots[0]++; 3207 goto next; 3208 } 3209 3210 ret = find_first_extent_bit(&rc->processed_blocks, 3211 key.objectid, &start, &end, 3212 EXTENT_DIRTY, NULL); 3213 3214 if (ret == 0 && start <= key.objectid) { 3215 btrfs_release_path(path); 3216 rc->search_start = end + 1; 3217 } else { 3218 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3219 rc->search_start = key.objectid + key.offset; 3220 else 3221 rc->search_start = key.objectid + 3222 fs_info->nodesize; 3223 memcpy(extent_key, &key, sizeof(key)); 3224 return 0; 3225 } 3226 } 3227 btrfs_release_path(path); 3228 return ret; 3229 } 3230 3231 static void set_reloc_control(struct reloc_control *rc) 3232 { 3233 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3234 3235 mutex_lock(&fs_info->reloc_mutex); 3236 fs_info->reloc_ctl = rc; 3237 mutex_unlock(&fs_info->reloc_mutex); 3238 } 3239 3240 static void unset_reloc_control(struct reloc_control *rc) 3241 { 3242 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3243 3244 mutex_lock(&fs_info->reloc_mutex); 3245 fs_info->reloc_ctl = NULL; 3246 mutex_unlock(&fs_info->reloc_mutex); 3247 } 3248 3249 static int check_extent_flags(u64 flags) 3250 { 3251 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3252 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3253 return 1; 3254 if (!(flags & BTRFS_EXTENT_FLAG_DATA) && 3255 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3256 return 1; 3257 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3258 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 3259 return 1; 3260 return 0; 3261 } 3262 3263 static noinline_for_stack 3264 int prepare_to_relocate(struct reloc_control *rc) 3265 { 3266 struct btrfs_trans_handle *trans; 3267 int ret; 3268 3269 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3270 BTRFS_BLOCK_RSV_TEMP); 3271 if (!rc->block_rsv) 3272 return -ENOMEM; 3273 3274 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3275 rc->search_start = rc->block_group->start; 3276 rc->extents_found = 0; 3277 rc->nodes_relocated = 0; 3278 rc->merging_rsv_size = 0; 3279 
rc->reserved_bytes = 0; 3280 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3281 RELOCATION_RESERVED_NODES; 3282 ret = btrfs_block_rsv_refill(rc->extent_root, 3283 rc->block_rsv, rc->block_rsv->size, 3284 BTRFS_RESERVE_FLUSH_ALL); 3285 if (ret) 3286 return ret; 3287 3288 rc->create_reloc_tree = 1; 3289 set_reloc_control(rc); 3290 3291 trans = btrfs_join_transaction(rc->extent_root); 3292 if (IS_ERR(trans)) { 3293 unset_reloc_control(rc); 3294 /* 3295 * extent tree is not a ref_cow tree and has no reloc_root to 3296 * cleanup. And callers are responsible to free the above 3297 * block rsv. 3298 */ 3299 return PTR_ERR(trans); 3300 } 3301 btrfs_commit_transaction(trans); 3302 return 0; 3303 } 3304 3305 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3306 { 3307 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3308 struct rb_root blocks = RB_ROOT; 3309 struct btrfs_key key; 3310 struct btrfs_trans_handle *trans = NULL; 3311 struct btrfs_path *path; 3312 struct btrfs_extent_item *ei; 3313 u64 flags; 3314 u32 item_size; 3315 int ret; 3316 int err = 0; 3317 int progress = 0; 3318 3319 path = btrfs_alloc_path(); 3320 if (!path) 3321 return -ENOMEM; 3322 path->reada = READA_FORWARD; 3323 3324 ret = prepare_to_relocate(rc); 3325 if (ret) { 3326 err = ret; 3327 goto out_free; 3328 } 3329 3330 while (1) { 3331 rc->reserved_bytes = 0; 3332 ret = btrfs_block_rsv_refill(rc->extent_root, 3333 rc->block_rsv, rc->block_rsv->size, 3334 BTRFS_RESERVE_FLUSH_ALL); 3335 if (ret) { 3336 err = ret; 3337 break; 3338 } 3339 progress++; 3340 trans = btrfs_start_transaction(rc->extent_root, 0); 3341 if (IS_ERR(trans)) { 3342 err = PTR_ERR(trans); 3343 trans = NULL; 3344 break; 3345 } 3346 restart: 3347 if (update_backref_cache(trans, &rc->backref_cache)) { 3348 btrfs_end_transaction(trans); 3349 trans = NULL; 3350 continue; 3351 } 3352 3353 ret = find_next_extent(rc, path, &key); 3354 if (ret < 0) 3355 err = ret; 3356 if (ret != 0) 3357 break; 3358 3359 rc->extents_found++; 3360 3361 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3362 struct btrfs_extent_item); 3363 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 3364 if (item_size >= sizeof(*ei)) { 3365 flags = btrfs_extent_flags(path->nodes[0], ei); 3366 ret = check_extent_flags(flags); 3367 BUG_ON(ret); 3368 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3369 err = -EINVAL; 3370 btrfs_print_v0_err(trans->fs_info); 3371 btrfs_abort_transaction(trans, err); 3372 break; 3373 } else { 3374 BUG(); 3375 } 3376 3377 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3378 ret = add_tree_block(rc, &key, path, &blocks); 3379 } else if (rc->stage == UPDATE_DATA_PTRS && 3380 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3381 ret = add_data_references(rc, &key, path, &blocks); 3382 } else { 3383 btrfs_release_path(path); 3384 ret = 0; 3385 } 3386 if (ret < 0) { 3387 err = ret; 3388 break; 3389 } 3390 3391 if (!RB_EMPTY_ROOT(&blocks)) { 3392 ret = relocate_tree_blocks(trans, rc, &blocks); 3393 if (ret < 0) { 3394 if (ret != -EAGAIN) { 3395 err = ret; 3396 break; 3397 } 3398 rc->extents_found--; 3399 rc->search_start = key.objectid; 3400 } 3401 } 3402 3403 btrfs_end_transaction_throttle(trans); 3404 btrfs_btree_balance_dirty(fs_info); 3405 trans = NULL; 3406 3407 if (rc->stage == MOVE_DATA_EXTENTS && 3408 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3409 rc->found_file_extent = 1; 3410 ret = relocate_data_extent(rc->data_inode, 3411 &key, &rc->cluster); 3412 if (ret < 0) { 3413 err = ret; 3414 break; 3415 } 
3416 } 3417 if (btrfs_should_cancel_balance(fs_info)) { 3418 err = -ECANCELED; 3419 break; 3420 } 3421 } 3422 if (trans && progress && err == -ENOSPC) { 3423 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3424 if (ret == 1) { 3425 err = 0; 3426 progress = 0; 3427 goto restart; 3428 } 3429 } 3430 3431 btrfs_release_path(path); 3432 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 3433 3434 if (trans) { 3435 btrfs_end_transaction_throttle(trans); 3436 btrfs_btree_balance_dirty(fs_info); 3437 } 3438 3439 if (!err) { 3440 ret = relocate_file_extent_cluster(rc->data_inode, 3441 &rc->cluster); 3442 if (ret < 0) 3443 err = ret; 3444 } 3445 3446 rc->create_reloc_tree = 0; 3447 set_reloc_control(rc); 3448 3449 btrfs_backref_release_cache(&rc->backref_cache); 3450 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3451 3452 /* 3453 * Even in the case when the relocation is cancelled, we should all go 3454 * through prepare_to_merge() and merge_reloc_roots(). 3455 * 3456 * For error (including cancelled balance), prepare_to_merge() will 3457 * mark all reloc trees orphan, then queue them for cleanup in 3458 * merge_reloc_roots() 3459 */ 3460 err = prepare_to_merge(rc, err); 3461 3462 merge_reloc_roots(rc); 3463 3464 rc->merge_reloc_tree = 0; 3465 unset_reloc_control(rc); 3466 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3467 3468 /* get rid of pinned extents */ 3469 trans = btrfs_join_transaction(rc->extent_root); 3470 if (IS_ERR(trans)) { 3471 err = PTR_ERR(trans); 3472 goto out_free; 3473 } 3474 btrfs_commit_transaction(trans); 3475 out_free: 3476 ret = clean_dirty_subvols(rc); 3477 if (ret < 0 && !err) 3478 err = ret; 3479 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3480 btrfs_free_path(path); 3481 return err; 3482 } 3483 3484 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3485 struct btrfs_root *root, u64 objectid) 3486 { 3487 struct btrfs_path *path; 3488 struct btrfs_inode_item *item; 3489 struct extent_buffer *leaf; 3490 int ret; 3491 3492 path = btrfs_alloc_path(); 3493 if (!path) 3494 return -ENOMEM; 3495 3496 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3497 if (ret) 3498 goto out; 3499 3500 leaf = path->nodes[0]; 3501 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3502 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3503 btrfs_set_inode_generation(leaf, item, 1); 3504 btrfs_set_inode_size(leaf, item, 0); 3505 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3506 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3507 BTRFS_INODE_PREALLOC); 3508 btrfs_mark_buffer_dirty(leaf); 3509 out: 3510 btrfs_free_path(path); 3511 return ret; 3512 } 3513 3514 /* 3515 * helper to create inode for data relocation. 
3516 * the inode is in data relocation tree and its link count is 0 3517 */ 3518 static noinline_for_stack 3519 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3520 struct btrfs_block_group *group) 3521 { 3522 struct inode *inode = NULL; 3523 struct btrfs_trans_handle *trans; 3524 struct btrfs_root *root; 3525 struct btrfs_key key; 3526 u64 objectid; 3527 int err = 0; 3528 3529 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); 3530 if (IS_ERR(root)) 3531 return ERR_CAST(root); 3532 3533 trans = btrfs_start_transaction(root, 6); 3534 if (IS_ERR(trans)) { 3535 btrfs_put_root(root); 3536 return ERR_CAST(trans); 3537 } 3538 3539 err = btrfs_find_free_objectid(root, &objectid); 3540 if (err) 3541 goto out; 3542 3543 err = __insert_orphan_inode(trans, root, objectid); 3544 BUG_ON(err); 3545 3546 key.objectid = objectid; 3547 key.type = BTRFS_INODE_ITEM_KEY; 3548 key.offset = 0; 3549 inode = btrfs_iget(fs_info->sb, &key, root); 3550 BUG_ON(IS_ERR(inode)); 3551 BTRFS_I(inode)->index_cnt = group->start; 3552 3553 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 3554 out: 3555 btrfs_put_root(root); 3556 btrfs_end_transaction(trans); 3557 btrfs_btree_balance_dirty(fs_info); 3558 if (err) { 3559 if (inode) 3560 iput(inode); 3561 inode = ERR_PTR(err); 3562 } 3563 return inode; 3564 } 3565 3566 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3567 { 3568 struct reloc_control *rc; 3569 3570 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3571 if (!rc) 3572 return NULL; 3573 3574 INIT_LIST_HEAD(&rc->reloc_roots); 3575 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3576 btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); 3577 mapping_tree_init(&rc->reloc_root_tree); 3578 extent_io_tree_init(fs_info, &rc->processed_blocks, 3579 IO_TREE_RELOC_BLOCKS, NULL); 3580 return rc; 3581 } 3582 3583 static void free_reloc_control(struct reloc_control *rc) 3584 { 3585 struct mapping_node *node, *tmp; 3586 3587 free_reloc_roots(&rc->reloc_roots); 3588 rbtree_postorder_for_each_entry_safe(node, tmp, 3589 &rc->reloc_root_tree.rb_root, rb_node) 3590 kfree(node); 3591 3592 kfree(rc); 3593 } 3594 3595 /* 3596 * Print the block group being relocated 3597 */ 3598 static void describe_relocation(struct btrfs_fs_info *fs_info, 3599 struct btrfs_block_group *block_group) 3600 { 3601 char buf[128] = {'\0'}; 3602 3603 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3604 3605 btrfs_info(fs_info, 3606 "relocating block group %llu flags %s", 3607 block_group->start, buf); 3608 } 3609 3610 static const char *stage_to_string(int stage) 3611 { 3612 if (stage == MOVE_DATA_EXTENTS) 3613 return "move data extents"; 3614 if (stage == UPDATE_DATA_PTRS) 3615 return "update data pointers"; 3616 return "unknown"; 3617 } 3618 3619 /* 3620 * function to relocate all extents in a block group. 
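 *
 * roughly: mark the group read-only, drop its v1 free space cache,
 * create the data reloc inode, then call relocate_block_group() in a
 * loop until no more extents are found, switching the stage from
 * MOVE_DATA_EXTENTS to UPDATE_DATA_PTRS once the data has been copied.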
3621 */ 3622 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 3623 { 3624 struct btrfs_block_group *bg; 3625 struct btrfs_root *extent_root = fs_info->extent_root; 3626 struct reloc_control *rc; 3627 struct inode *inode; 3628 struct btrfs_path *path; 3629 int ret; 3630 int rw = 0; 3631 int err = 0; 3632 3633 bg = btrfs_lookup_block_group(fs_info, group_start); 3634 if (!bg) 3635 return -ENOENT; 3636 3637 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3638 btrfs_put_block_group(bg); 3639 return -ETXTBSY; 3640 } 3641 3642 rc = alloc_reloc_control(fs_info); 3643 if (!rc) { 3644 btrfs_put_block_group(bg); 3645 return -ENOMEM; 3646 } 3647 3648 rc->extent_root = extent_root; 3649 rc->block_group = bg; 3650 3651 ret = btrfs_inc_block_group_ro(rc->block_group, true); 3652 if (ret) { 3653 err = ret; 3654 goto out; 3655 } 3656 rw = 1; 3657 3658 path = btrfs_alloc_path(); 3659 if (!path) { 3660 err = -ENOMEM; 3661 goto out; 3662 } 3663 3664 inode = lookup_free_space_inode(rc->block_group, path); 3665 btrfs_free_path(path); 3666 3667 if (!IS_ERR(inode)) 3668 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 3669 else 3670 ret = PTR_ERR(inode); 3671 3672 if (ret && ret != -ENOENT) { 3673 err = ret; 3674 goto out; 3675 } 3676 3677 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3678 if (IS_ERR(rc->data_inode)) { 3679 err = PTR_ERR(rc->data_inode); 3680 rc->data_inode = NULL; 3681 goto out; 3682 } 3683 3684 describe_relocation(fs_info, rc->block_group); 3685 3686 btrfs_wait_block_group_reservations(rc->block_group); 3687 btrfs_wait_nocow_writers(rc->block_group); 3688 btrfs_wait_ordered_roots(fs_info, U64_MAX, 3689 rc->block_group->start, 3690 rc->block_group->length); 3691 3692 while (1) { 3693 int finishes_stage; 3694 3695 mutex_lock(&fs_info->cleaner_mutex); 3696 ret = relocate_block_group(rc); 3697 mutex_unlock(&fs_info->cleaner_mutex); 3698 if (ret < 0) 3699 err = ret; 3700 3701 finishes_stage = rc->stage; 3702 /* 3703 * We may have gotten ENOSPC after we already dirtied some 3704 * extents. If writeout happens while we're relocating a 3705 * different block group we could end up hitting the 3706 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 3707 * btrfs_reloc_cow_block. Make sure we write everything out 3708 * properly so we don't trip over this problem, and then break 3709 * out of the loop if we hit an error. 
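 *
 * (the BUG_ON referenced above lives in btrfs_reloc_cow_block(),
 * which refuses to COW data reloc tree blocks once the stage has
 * moved to UPDATE_DATA_PTRS.)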
3710 */ 3711 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 3712 ret = btrfs_wait_ordered_range(rc->data_inode, 0, 3713 (u64)-1); 3714 if (ret) 3715 err = ret; 3716 invalidate_mapping_pages(rc->data_inode->i_mapping, 3717 0, -1); 3718 rc->stage = UPDATE_DATA_PTRS; 3719 } 3720 3721 if (err < 0) 3722 goto out; 3723 3724 if (rc->extents_found == 0) 3725 break; 3726 3727 btrfs_info(fs_info, "found %llu extents, stage: %s", 3728 rc->extents_found, stage_to_string(finishes_stage)); 3729 } 3730 3731 WARN_ON(rc->block_group->pinned > 0); 3732 WARN_ON(rc->block_group->reserved > 0); 3733 WARN_ON(rc->block_group->used > 0); 3734 out: 3735 if (err && rw) 3736 btrfs_dec_block_group_ro(rc->block_group); 3737 iput(rc->data_inode); 3738 btrfs_put_block_group(rc->block_group); 3739 free_reloc_control(rc); 3740 return err; 3741 } 3742 3743 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) 3744 { 3745 struct btrfs_fs_info *fs_info = root->fs_info; 3746 struct btrfs_trans_handle *trans; 3747 int ret, err; 3748 3749 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3750 if (IS_ERR(trans)) 3751 return PTR_ERR(trans); 3752 3753 memset(&root->root_item.drop_progress, 0, 3754 sizeof(root->root_item.drop_progress)); 3755 root->root_item.drop_level = 0; 3756 btrfs_set_root_refs(&root->root_item, 0); 3757 ret = btrfs_update_root(trans, fs_info->tree_root, 3758 &root->root_key, &root->root_item); 3759 3760 err = btrfs_end_transaction(trans); 3761 if (err) 3762 return err; 3763 return ret; 3764 } 3765 3766 /* 3767 * recover relocation interrupted by system crash. 3768 * 3769 * this function resumes merging reloc trees with corresponding fs trees. 3770 * this is important for keeping the sharing of tree blocks 3771 */ 3772 int btrfs_recover_relocation(struct btrfs_root *root) 3773 { 3774 struct btrfs_fs_info *fs_info = root->fs_info; 3775 LIST_HEAD(reloc_roots); 3776 struct btrfs_key key; 3777 struct btrfs_root *fs_root; 3778 struct btrfs_root *reloc_root; 3779 struct btrfs_path *path; 3780 struct extent_buffer *leaf; 3781 struct reloc_control *rc = NULL; 3782 struct btrfs_trans_handle *trans; 3783 int ret; 3784 int err = 0; 3785 3786 path = btrfs_alloc_path(); 3787 if (!path) 3788 return -ENOMEM; 3789 path->reada = READA_BACK; 3790 3791 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 3792 key.type = BTRFS_ROOT_ITEM_KEY; 3793 key.offset = (u64)-1; 3794 3795 while (1) { 3796 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, 3797 path, 0, 0); 3798 if (ret < 0) { 3799 err = ret; 3800 goto out; 3801 } 3802 if (ret > 0) { 3803 if (path->slots[0] == 0) 3804 break; 3805 path->slots[0]--; 3806 } 3807 leaf = path->nodes[0]; 3808 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3809 btrfs_release_path(path); 3810 3811 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || 3812 key.type != BTRFS_ROOT_ITEM_KEY) 3813 break; 3814 3815 reloc_root = btrfs_read_tree_root(root, &key); 3816 if (IS_ERR(reloc_root)) { 3817 err = PTR_ERR(reloc_root); 3818 goto out; 3819 } 3820 3821 set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state); 3822 list_add(&reloc_root->root_list, &reloc_roots); 3823 3824 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 3825 fs_root = read_fs_root(fs_info, 3826 reloc_root->root_key.offset); 3827 if (IS_ERR(fs_root)) { 3828 ret = PTR_ERR(fs_root); 3829 if (ret != -ENOENT) { 3830 err = ret; 3831 goto out; 3832 } 3833 ret = mark_garbage_root(reloc_root); 3834 if (ret < 0) { 3835 err = ret; 3836 goto out; 3837 } 3838 } else { 3839 btrfs_put_root(fs_root); 3840 } 3841 } 3842 3843 if 
(key.offset == 0) 3844 break; 3845 3846 key.offset--; 3847 } 3848 btrfs_release_path(path); 3849 3850 if (list_empty(&reloc_roots)) 3851 goto out; 3852 3853 rc = alloc_reloc_control(fs_info); 3854 if (!rc) { 3855 err = -ENOMEM; 3856 goto out; 3857 } 3858 3859 rc->extent_root = fs_info->extent_root; 3860 3861 set_reloc_control(rc); 3862 3863 trans = btrfs_join_transaction(rc->extent_root); 3864 if (IS_ERR(trans)) { 3865 err = PTR_ERR(trans); 3866 goto out_unset; 3867 } 3868 3869 rc->merge_reloc_tree = 1; 3870 3871 while (!list_empty(&reloc_roots)) { 3872 reloc_root = list_entry(reloc_roots.next, 3873 struct btrfs_root, root_list); 3874 list_del(&reloc_root->root_list); 3875 3876 if (btrfs_root_refs(&reloc_root->root_item) == 0) { 3877 list_add_tail(&reloc_root->root_list, 3878 &rc->reloc_roots); 3879 continue; 3880 } 3881 3882 fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); 3883 if (IS_ERR(fs_root)) { 3884 err = PTR_ERR(fs_root); 3885 list_add_tail(&reloc_root->root_list, &reloc_roots); 3886 btrfs_end_transaction(trans); 3887 goto out_unset; 3888 } 3889 3890 err = __add_reloc_root(reloc_root); 3891 BUG_ON(err < 0); /* -ENOMEM or logic error */ 3892 fs_root->reloc_root = btrfs_grab_root(reloc_root); 3893 btrfs_put_root(fs_root); 3894 } 3895 3896 err = btrfs_commit_transaction(trans); 3897 if (err) 3898 goto out_unset; 3899 3900 merge_reloc_roots(rc); 3901 3902 unset_reloc_control(rc); 3903 3904 trans = btrfs_join_transaction(rc->extent_root); 3905 if (IS_ERR(trans)) { 3906 err = PTR_ERR(trans); 3907 goto out_clean; 3908 } 3909 err = btrfs_commit_transaction(trans); 3910 out_clean: 3911 ret = clean_dirty_subvols(rc); 3912 if (ret < 0 && !err) 3913 err = ret; 3914 out_unset: 3915 unset_reloc_control(rc); 3916 free_reloc_control(rc); 3917 out: 3918 if (!list_empty(&reloc_roots)) 3919 free_reloc_roots(&reloc_roots); 3920 3921 btrfs_free_path(path); 3922 3923 if (err == 0) { 3924 /* cleanup orphan inode in data relocation tree */ 3925 fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); 3926 if (IS_ERR(fs_root)) { 3927 err = PTR_ERR(fs_root); 3928 } else { 3929 err = btrfs_orphan_cleanup(fs_root); 3930 btrfs_put_root(fs_root); 3931 } 3932 } 3933 return err; 3934 } 3935 3936 /* 3937 * helper to add ordered checksum for data relocation. 3938 * 3939 * cloning checksum properly handles the nodatasum extents. 3940 * it also saves CPU time to re-calculate the checksum. 3941 */ 3942 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 3943 { 3944 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3945 struct btrfs_ordered_sum *sums; 3946 struct btrfs_ordered_extent *ordered; 3947 int ret; 3948 u64 disk_bytenr; 3949 u64 new_bytenr; 3950 LIST_HEAD(list); 3951 3952 ordered = btrfs_lookup_ordered_extent(inode, file_pos); 3953 BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); 3954 3955 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; 3956 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, 3957 disk_bytenr + len - 1, &list, 0); 3958 if (ret) 3959 goto out; 3960 3961 while (!list_empty(&list)) { 3962 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 3963 list_del_init(&sums->list); 3964 3965 /* 3966 * We need to offset the new_bytenr based on where the csum is. 3967 * We need to do this because we will read in entire prealloc 3968 * extents but we may have written to say the middle of the 3969 * prealloc extent, so we need to make sure the csum goes with 3970 * the right disk offset. 
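 * For instance, if this csum entry starts 256K into the ordered
 * extent (sums->bytenr == disk_bytenr + 256K), the rewritten
 * sums->bytenr becomes ordered->disk_bytenr + 256K, keeping the same
 * offset within the relocated extent.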
3971 * 3972 * We can do this because the data reloc inode refers strictly 3973 * to the on disk bytes, so we don't have to worry about 3974 * disk_len vs real len like with real inodes since it's all 3975 * disk length. 3976 */ 3977 new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; 3978 sums->bytenr = new_bytenr; 3979 3980 btrfs_add_ordered_sum(ordered, sums); 3981 } 3982 out: 3983 btrfs_put_ordered_extent(ordered); 3984 return ret; 3985 } 3986 3987 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 3988 struct btrfs_root *root, struct extent_buffer *buf, 3989 struct extent_buffer *cow) 3990 { 3991 struct btrfs_fs_info *fs_info = root->fs_info; 3992 struct reloc_control *rc; 3993 struct btrfs_backref_node *node; 3994 int first_cow = 0; 3995 int level; 3996 int ret = 0; 3997 3998 rc = fs_info->reloc_ctl; 3999 if (!rc) 4000 return 0; 4001 4002 BUG_ON(rc->stage == UPDATE_DATA_PTRS && 4003 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); 4004 4005 level = btrfs_header_level(buf); 4006 if (btrfs_header_generation(buf) <= 4007 btrfs_root_last_snapshot(&root->root_item)) 4008 first_cow = 1; 4009 4010 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && 4011 rc->create_reloc_tree) { 4012 WARN_ON(!first_cow && level == 0); 4013 4014 node = rc->backref_cache.path[level]; 4015 BUG_ON(node->bytenr != buf->start && 4016 node->new_bytenr != buf->start); 4017 4018 btrfs_backref_drop_node_buffer(node); 4019 atomic_inc(&cow->refs); 4020 node->eb = cow; 4021 node->new_bytenr = cow->start; 4022 4023 if (!node->pending) { 4024 list_move_tail(&node->list, 4025 &rc->backref_cache.pending[level]); 4026 node->pending = 1; 4027 } 4028 4029 if (first_cow) 4030 mark_block_processed(rc, node); 4031 4032 if (first_cow && level > 0) 4033 rc->nodes_relocated += buf->len; 4034 } 4035 4036 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) 4037 ret = replace_file_extents(trans, rc, root, cow); 4038 return ret; 4039 } 4040 4041 /* 4042 * called before creating snapshot. it calculates metadata reservation 4043 * required for relocating tree blocks in the snapshot 4044 */ 4045 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, 4046 u64 *bytes_to_reserve) 4047 { 4048 struct btrfs_root *root = pending->root; 4049 struct reloc_control *rc = root->fs_info->reloc_ctl; 4050 4051 if (!rc || !have_reloc_root(root)) 4052 return; 4053 4054 if (!rc->merge_reloc_tree) 4055 return; 4056 4057 root = root->reloc_root; 4058 BUG_ON(btrfs_root_refs(&root->root_item) == 0); 4059 /* 4060 * relocation is in the stage of merging trees. the space 4061 * used by merging a reloc tree is twice the size of 4062 * relocated tree nodes in the worst case. half for cowing 4063 * the reloc tree, half for cowing the fs tree. the space 4064 * used by cowing the reloc tree will be freed after the 4065 * tree is dropped. if we create snapshot, cowing the fs 4066 * tree may use more space than it frees. so we need 4067 * reserve extra space. 4068 */ 4069 *bytes_to_reserve += rc->nodes_relocated; 4070 } 4071 4072 /* 4073 * called after snapshot is created. migrate block reservation 4074 * and create reloc root for the newly created snapshot 4075 * 4076 * This is similar to btrfs_init_reloc_root(), we come out of here with two 4077 * references held on the reloc_root, one for root->reloc_root and one for 4078 * rc->reloc_roots. 
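 *
 * (the same pairing as btrfs_init_reloc_root(): the rc->reloc_roots
 * entry is dropped when the reloc root is merged or deleted, the
 * root->reloc_root reference when the snapshot root itself is freed.)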
4079 */ 4080 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, 4081 struct btrfs_pending_snapshot *pending) 4082 { 4083 struct btrfs_root *root = pending->root; 4084 struct btrfs_root *reloc_root; 4085 struct btrfs_root *new_root; 4086 struct reloc_control *rc = root->fs_info->reloc_ctl; 4087 int ret; 4088 4089 if (!rc || !have_reloc_root(root)) 4090 return 0; 4091 4092 rc = root->fs_info->reloc_ctl; 4093 rc->merging_rsv_size += rc->nodes_relocated; 4094 4095 if (rc->merge_reloc_tree) { 4096 ret = btrfs_block_rsv_migrate(&pending->block_rsv, 4097 rc->block_rsv, 4098 rc->nodes_relocated, true); 4099 if (ret) 4100 return ret; 4101 } 4102 4103 new_root = pending->snap; 4104 reloc_root = create_reloc_root(trans, root->reloc_root, 4105 new_root->root_key.objectid); 4106 if (IS_ERR(reloc_root)) 4107 return PTR_ERR(reloc_root); 4108 4109 ret = __add_reloc_root(reloc_root); 4110 BUG_ON(ret < 0); 4111 new_root->reloc_root = btrfs_grab_root(reloc_root); 4112 4113 if (rc->create_reloc_tree) 4114 ret = clone_backref_node(trans, rc, root, reloc_root); 4115 return ret; 4116 } 4117
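
/*
 * Overview of the entry points above, roughly: the balance and shrink
 * code pick a chunk and call btrfs_relocate_block_group(), mount time
 * recovery calls btrfs_recover_relocation() to resume an interrupted
 * merge, and btrfs_reloc_cow_block(), btrfs_reloc_pre_snapshot() and
 * btrfs_reloc_post_snapshot() are hooks called from the COW and
 * snapshot creation paths.
 */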