1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2009 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include "ctree.h" 13 #include "disk-io.h" 14 #include "transaction.h" 15 #include "volumes.h" 16 #include "locking.h" 17 #include "btrfs_inode.h" 18 #include "async-thread.h" 19 #include "free-space-cache.h" 20 #include "inode-map.h" 21 #include "qgroup.h" 22 #include "print-tree.h" 23 24 /* 25 * backref_node, mapping_node and tree_block start with this 26 */ 27 struct tree_entry { 28 struct rb_node rb_node; 29 u64 bytenr; 30 }; 31 32 /* 33 * present a tree block in the backref cache 34 */ 35 struct backref_node { 36 struct rb_node rb_node; 37 u64 bytenr; 38 39 u64 new_bytenr; 40 /* objectid of tree block owner, can be not uptodate */ 41 u64 owner; 42 /* link to pending, changed or detached list */ 43 struct list_head list; 44 /* list of upper level blocks reference this block */ 45 struct list_head upper; 46 /* list of child blocks in the cache */ 47 struct list_head lower; 48 /* NULL if this node is not tree root */ 49 struct btrfs_root *root; 50 /* extent buffer got by COW the block */ 51 struct extent_buffer *eb; 52 /* level of tree block */ 53 unsigned int level:8; 54 /* is the block in non-reference counted tree */ 55 unsigned int cowonly:1; 56 /* 1 if no child node in the cache */ 57 unsigned int lowest:1; 58 /* is the extent buffer locked */ 59 unsigned int locked:1; 60 /* has the block been processed */ 61 unsigned int processed:1; 62 /* have backrefs of this block been checked */ 63 unsigned int checked:1; 64 /* 65 * 1 if corresponding block has been cowed but some upper 66 * level block pointers may not point to the new location 67 */ 68 unsigned int pending:1; 69 /* 70 * 1 if the backref node isn't connected to any other 71 * backref node. 72 */ 73 unsigned int detached:1; 74 }; 75 76 /* 77 * present a block pointer in the backref cache 78 */ 79 struct backref_edge { 80 struct list_head list[2]; 81 struct backref_node *node[2]; 82 }; 83 84 #define LOWER 0 85 #define UPPER 1 86 #define RELOCATION_RESERVED_NODES 256 87 88 struct backref_cache { 89 /* red black tree of all backref nodes in the cache */ 90 struct rb_root rb_root; 91 /* for passing backref nodes to btrfs_reloc_cow_block */ 92 struct backref_node *path[BTRFS_MAX_LEVEL]; 93 /* 94 * list of blocks that have been cowed but some block 95 * pointers in upper level blocks may not reflect the 96 * new location 97 */ 98 struct list_head pending[BTRFS_MAX_LEVEL]; 99 /* list of backref nodes with no child node */ 100 struct list_head leaves; 101 /* list of blocks that have been cowed in current transaction */ 102 struct list_head changed; 103 /* list of detached backref node. 
*/ 104 struct list_head detached; 105 106 u64 last_trans; 107 108 int nr_nodes; 109 int nr_edges; 110 }; 111 112 /* 113 * map address of tree root to tree 114 */ 115 struct mapping_node { 116 struct rb_node rb_node; 117 u64 bytenr; 118 void *data; 119 }; 120 121 struct mapping_tree { 122 struct rb_root rb_root; 123 spinlock_t lock; 124 }; 125 126 /* 127 * present a tree block to process 128 */ 129 struct tree_block { 130 struct rb_node rb_node; 131 u64 bytenr; 132 struct btrfs_key key; 133 unsigned int level:8; 134 unsigned int key_ready:1; 135 }; 136 137 #define MAX_EXTENTS 128 138 139 struct file_extent_cluster { 140 u64 start; 141 u64 end; 142 u64 boundary[MAX_EXTENTS]; 143 unsigned int nr; 144 }; 145 146 struct reloc_control { 147 /* block group to relocate */ 148 struct btrfs_block_group_cache *block_group; 149 /* extent tree */ 150 struct btrfs_root *extent_root; 151 /* inode for moving data */ 152 struct inode *data_inode; 153 154 struct btrfs_block_rsv *block_rsv; 155 156 struct backref_cache backref_cache; 157 158 struct file_extent_cluster cluster; 159 /* tree blocks have been processed */ 160 struct extent_io_tree processed_blocks; 161 /* map start of tree root to corresponding reloc tree */ 162 struct mapping_tree reloc_root_tree; 163 /* list of reloc trees */ 164 struct list_head reloc_roots; 165 /* list of subvolume trees that get relocated */ 166 struct list_head dirty_subvol_roots; 167 /* size of metadata reservation for merging reloc trees */ 168 u64 merging_rsv_size; 169 /* size of relocated tree nodes */ 170 u64 nodes_relocated; 171 /* reserved size for block group relocation*/ 172 u64 reserved_bytes; 173 174 u64 search_start; 175 u64 extents_found; 176 177 unsigned int stage:8; 178 unsigned int create_reloc_tree:1; 179 unsigned int merge_reloc_tree:1; 180 unsigned int found_file_extent:1; 181 }; 182 183 /* stages of data relocation */ 184 #define MOVE_DATA_EXTENTS 0 185 #define UPDATE_DATA_PTRS 1 186 187 static void remove_backref_node(struct backref_cache *cache, 188 struct backref_node *node); 189 static void __mark_block_processed(struct reloc_control *rc, 190 struct backref_node *node); 191 192 static void mapping_tree_init(struct mapping_tree *tree) 193 { 194 tree->rb_root = RB_ROOT; 195 spin_lock_init(&tree->lock); 196 } 197 198 static void backref_cache_init(struct backref_cache *cache) 199 { 200 int i; 201 cache->rb_root = RB_ROOT; 202 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 203 INIT_LIST_HEAD(&cache->pending[i]); 204 INIT_LIST_HEAD(&cache->changed); 205 INIT_LIST_HEAD(&cache->detached); 206 INIT_LIST_HEAD(&cache->leaves); 207 } 208 209 static void backref_cache_cleanup(struct backref_cache *cache) 210 { 211 struct backref_node *node; 212 int i; 213 214 while (!list_empty(&cache->detached)) { 215 node = list_entry(cache->detached.next, 216 struct backref_node, list); 217 remove_backref_node(cache, node); 218 } 219 220 while (!list_empty(&cache->leaves)) { 221 node = list_entry(cache->leaves.next, 222 struct backref_node, lower); 223 remove_backref_node(cache, node); 224 } 225 226 cache->last_trans = 0; 227 228 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 229 ASSERT(list_empty(&cache->pending[i])); 230 ASSERT(list_empty(&cache->changed)); 231 ASSERT(list_empty(&cache->detached)); 232 ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); 233 ASSERT(!cache->nr_nodes); 234 ASSERT(!cache->nr_edges); 235 } 236 237 static struct backref_node *alloc_backref_node(struct backref_cache *cache) 238 { 239 struct backref_node *node; 240 241 node = kzalloc(sizeof(*node), GFP_NOFS); 242 if (node) { 
243 INIT_LIST_HEAD(&node->list); 244 INIT_LIST_HEAD(&node->upper); 245 INIT_LIST_HEAD(&node->lower); 246 RB_CLEAR_NODE(&node->rb_node); 247 cache->nr_nodes++; 248 } 249 return node; 250 } 251 252 static void free_backref_node(struct backref_cache *cache, 253 struct backref_node *node) 254 { 255 if (node) { 256 cache->nr_nodes--; 257 kfree(node); 258 } 259 } 260 261 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) 262 { 263 struct backref_edge *edge; 264 265 edge = kzalloc(sizeof(*edge), GFP_NOFS); 266 if (edge) 267 cache->nr_edges++; 268 return edge; 269 } 270 271 static void free_backref_edge(struct backref_cache *cache, 272 struct backref_edge *edge) 273 { 274 if (edge) { 275 cache->nr_edges--; 276 kfree(edge); 277 } 278 } 279 280 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 281 struct rb_node *node) 282 { 283 struct rb_node **p = &root->rb_node; 284 struct rb_node *parent = NULL; 285 struct tree_entry *entry; 286 287 while (*p) { 288 parent = *p; 289 entry = rb_entry(parent, struct tree_entry, rb_node); 290 291 if (bytenr < entry->bytenr) 292 p = &(*p)->rb_left; 293 else if (bytenr > entry->bytenr) 294 p = &(*p)->rb_right; 295 else 296 return parent; 297 } 298 299 rb_link_node(node, parent, p); 300 rb_insert_color(node, root); 301 return NULL; 302 } 303 304 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) 305 { 306 struct rb_node *n = root->rb_node; 307 struct tree_entry *entry; 308 309 while (n) { 310 entry = rb_entry(n, struct tree_entry, rb_node); 311 312 if (bytenr < entry->bytenr) 313 n = n->rb_left; 314 else if (bytenr > entry->bytenr) 315 n = n->rb_right; 316 else 317 return n; 318 } 319 return NULL; 320 } 321 322 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) 323 { 324 325 struct btrfs_fs_info *fs_info = NULL; 326 struct backref_node *bnode = rb_entry(rb_node, struct backref_node, 327 rb_node); 328 if (bnode->root) 329 fs_info = bnode->root->fs_info; 330 btrfs_panic(fs_info, errno, 331 "Inconsistency in backref cache found at offset %llu", 332 bytenr); 333 } 334 335 /* 336 * walk up backref nodes until reach node presents tree root 337 */ 338 static struct backref_node *walk_up_backref(struct backref_node *node, 339 struct backref_edge *edges[], 340 int *index) 341 { 342 struct backref_edge *edge; 343 int idx = *index; 344 345 while (!list_empty(&node->upper)) { 346 edge = list_entry(node->upper.next, 347 struct backref_edge, list[LOWER]); 348 edges[idx++] = edge; 349 node = edge->node[UPPER]; 350 } 351 BUG_ON(node->detached); 352 *index = idx; 353 return node; 354 } 355 356 /* 357 * walk down backref nodes to find start of next reference path 358 */ 359 static struct backref_node *walk_down_backref(struct backref_edge *edges[], 360 int *index) 361 { 362 struct backref_edge *edge; 363 struct backref_node *lower; 364 int idx = *index; 365 366 while (idx > 0) { 367 edge = edges[idx - 1]; 368 lower = edge->node[LOWER]; 369 if (list_is_last(&edge->list[LOWER], &lower->upper)) { 370 idx--; 371 continue; 372 } 373 edge = list_entry(edge->list[LOWER].next, 374 struct backref_edge, list[LOWER]); 375 edges[idx - 1] = edge; 376 *index = idx; 377 return edge->node[UPPER]; 378 } 379 *index = 0; 380 return NULL; 381 } 382 383 static void unlock_node_buffer(struct backref_node *node) 384 { 385 if (node->locked) { 386 btrfs_tree_unlock(node->eb); 387 node->locked = 0; 388 } 389 } 390 391 static void drop_node_buffer(struct backref_node *node) 392 { 393 if (node->eb) { 394 
unlock_node_buffer(node); 395 free_extent_buffer(node->eb); 396 node->eb = NULL; 397 } 398 } 399 400 static void drop_backref_node(struct backref_cache *tree, 401 struct backref_node *node) 402 { 403 BUG_ON(!list_empty(&node->upper)); 404 405 drop_node_buffer(node); 406 list_del(&node->list); 407 list_del(&node->lower); 408 if (!RB_EMPTY_NODE(&node->rb_node)) 409 rb_erase(&node->rb_node, &tree->rb_root); 410 free_backref_node(tree, node); 411 } 412 413 /* 414 * remove a backref node from the backref cache 415 */ 416 static void remove_backref_node(struct backref_cache *cache, 417 struct backref_node *node) 418 { 419 struct backref_node *upper; 420 struct backref_edge *edge; 421 422 if (!node) 423 return; 424 425 BUG_ON(!node->lowest && !node->detached); 426 while (!list_empty(&node->upper)) { 427 edge = list_entry(node->upper.next, struct backref_edge, 428 list[LOWER]); 429 upper = edge->node[UPPER]; 430 list_del(&edge->list[LOWER]); 431 list_del(&edge->list[UPPER]); 432 free_backref_edge(cache, edge); 433 434 if (RB_EMPTY_NODE(&upper->rb_node)) { 435 BUG_ON(!list_empty(&node->upper)); 436 drop_backref_node(cache, node); 437 node = upper; 438 node->lowest = 1; 439 continue; 440 } 441 /* 442 * add the node to leaf node list if no other 443 * child block cached. 444 */ 445 if (list_empty(&upper->lower)) { 446 list_add_tail(&upper->lower, &cache->leaves); 447 upper->lowest = 1; 448 } 449 } 450 451 drop_backref_node(cache, node); 452 } 453 454 static void update_backref_node(struct backref_cache *cache, 455 struct backref_node *node, u64 bytenr) 456 { 457 struct rb_node *rb_node; 458 rb_erase(&node->rb_node, &cache->rb_root); 459 node->bytenr = bytenr; 460 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 461 if (rb_node) 462 backref_tree_panic(rb_node, -EEXIST, bytenr); 463 } 464 465 /* 466 * update backref cache after a transaction commit 467 */ 468 static int update_backref_cache(struct btrfs_trans_handle *trans, 469 struct backref_cache *cache) 470 { 471 struct backref_node *node; 472 int level = 0; 473 474 if (cache->last_trans == 0) { 475 cache->last_trans = trans->transid; 476 return 0; 477 } 478 479 if (cache->last_trans == trans->transid) 480 return 0; 481 482 /* 483 * detached nodes are used to avoid unnecessary backref 484 * lookup. transaction commit changes the extent tree. 485 * so the detached nodes are no longer useful. 486 */ 487 while (!list_empty(&cache->detached)) { 488 node = list_entry(cache->detached.next, 489 struct backref_node, list); 490 remove_backref_node(cache, node); 491 } 492 493 while (!list_empty(&cache->changed)) { 494 node = list_entry(cache->changed.next, 495 struct backref_node, list); 496 list_del_init(&node->list); 497 BUG_ON(node->pending); 498 update_backref_node(cache, node, node->new_bytenr); 499 } 500 501 /* 502 * some nodes can be left in the pending list if there were 503 * errors during processing the pending nodes. 
504 */ 505 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 506 list_for_each_entry(node, &cache->pending[level], list) { 507 BUG_ON(!node->pending); 508 if (node->bytenr == node->new_bytenr) 509 continue; 510 update_backref_node(cache, node, node->new_bytenr); 511 } 512 } 513 514 cache->last_trans = 0; 515 return 1; 516 } 517 518 519 static int should_ignore_root(struct btrfs_root *root) 520 { 521 struct btrfs_root *reloc_root; 522 523 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 524 return 0; 525 526 reloc_root = root->reloc_root; 527 if (!reloc_root) 528 return 0; 529 530 if (btrfs_root_last_snapshot(&reloc_root->root_item) == 531 root->fs_info->running_transaction->transid - 1) 532 return 0; 533 /* 534 * if there is reloc tree and it was created in previous 535 * transaction backref lookup can find the reloc tree, 536 * so backref node for the fs tree root is useless for 537 * relocation. 538 */ 539 return 1; 540 } 541 /* 542 * find reloc tree by address of tree root 543 */ 544 static struct btrfs_root *find_reloc_root(struct reloc_control *rc, 545 u64 bytenr) 546 { 547 struct rb_node *rb_node; 548 struct mapping_node *node; 549 struct btrfs_root *root = NULL; 550 551 spin_lock(&rc->reloc_root_tree.lock); 552 rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); 553 if (rb_node) { 554 node = rb_entry(rb_node, struct mapping_node, rb_node); 555 root = (struct btrfs_root *)node->data; 556 } 557 spin_unlock(&rc->reloc_root_tree.lock); 558 return root; 559 } 560 561 static int is_cowonly_root(u64 root_objectid) 562 { 563 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || 564 root_objectid == BTRFS_EXTENT_TREE_OBJECTID || 565 root_objectid == BTRFS_CHUNK_TREE_OBJECTID || 566 root_objectid == BTRFS_DEV_TREE_OBJECTID || 567 root_objectid == BTRFS_TREE_LOG_OBJECTID || 568 root_objectid == BTRFS_CSUM_TREE_OBJECTID || 569 root_objectid == BTRFS_UUID_TREE_OBJECTID || 570 root_objectid == BTRFS_QUOTA_TREE_OBJECTID || 571 root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) 572 return 1; 573 return 0; 574 } 575 576 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, 577 u64 root_objectid) 578 { 579 struct btrfs_key key; 580 581 key.objectid = root_objectid; 582 key.type = BTRFS_ROOT_ITEM_KEY; 583 if (is_cowonly_root(root_objectid)) 584 key.offset = 0; 585 else 586 key.offset = (u64)-1; 587 588 return btrfs_get_fs_root(fs_info, &key, false); 589 } 590 591 static noinline_for_stack 592 int find_inline_backref(struct extent_buffer *leaf, int slot, 593 unsigned long *ptr, unsigned long *end) 594 { 595 struct btrfs_key key; 596 struct btrfs_extent_item *ei; 597 struct btrfs_tree_block_info *bi; 598 u32 item_size; 599 600 btrfs_item_key_to_cpu(leaf, &key, slot); 601 602 item_size = btrfs_item_size_nr(leaf, slot); 603 if (item_size < sizeof(*ei)) { 604 btrfs_print_v0_err(leaf->fs_info); 605 btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL); 606 return 1; 607 } 608 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 609 WARN_ON(!(btrfs_extent_flags(leaf, ei) & 610 BTRFS_EXTENT_FLAG_TREE_BLOCK)); 611 612 if (key.type == BTRFS_EXTENT_ITEM_KEY && 613 item_size <= sizeof(*ei) + sizeof(*bi)) { 614 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); 615 return 1; 616 } 617 if (key.type == BTRFS_METADATA_ITEM_KEY && 618 item_size <= sizeof(*ei)) { 619 WARN_ON(item_size < sizeof(*ei)); 620 return 1; 621 } 622 623 if (key.type == BTRFS_EXTENT_ITEM_KEY) { 624 bi = (struct btrfs_tree_block_info *)(ei + 1); 625 *ptr = (unsigned long)(bi + 1); 626 } else { 627 *ptr = (unsigned long)(ei + 
1); 628 } 629 *end = (unsigned long)ei + item_size; 630 return 0; 631 } 632 633 /* 634 * build backref tree for a given tree block. root of the backref tree 635 * corresponds the tree block, leaves of the backref tree correspond 636 * roots of b-trees that reference the tree block. 637 * 638 * the basic idea of this function is check backrefs of a given block 639 * to find upper level blocks that reference the block, and then check 640 * backrefs of these upper level blocks recursively. the recursion stop 641 * when tree root is reached or backrefs for the block is cached. 642 * 643 * NOTE: if we find backrefs for a block are cached, we know backrefs 644 * for all upper level blocks that directly/indirectly reference the 645 * block are also cached. 646 */ 647 static noinline_for_stack 648 struct backref_node *build_backref_tree(struct reloc_control *rc, 649 struct btrfs_key *node_key, 650 int level, u64 bytenr) 651 { 652 struct backref_cache *cache = &rc->backref_cache; 653 struct btrfs_path *path1; /* For searching extent root */ 654 struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */ 655 struct extent_buffer *eb; 656 struct btrfs_root *root; 657 struct backref_node *cur; 658 struct backref_node *upper; 659 struct backref_node *lower; 660 struct backref_node *node = NULL; 661 struct backref_node *exist = NULL; 662 struct backref_edge *edge; 663 struct rb_node *rb_node; 664 struct btrfs_key key; 665 unsigned long end; 666 unsigned long ptr; 667 LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */ 668 LIST_HEAD(useless); 669 int cowonly; 670 int ret; 671 int err = 0; 672 bool need_check = true; 673 674 path1 = btrfs_alloc_path(); 675 path2 = btrfs_alloc_path(); 676 if (!path1 || !path2) { 677 err = -ENOMEM; 678 goto out; 679 } 680 path1->reada = READA_FORWARD; 681 path2->reada = READA_FORWARD; 682 683 node = alloc_backref_node(cache); 684 if (!node) { 685 err = -ENOMEM; 686 goto out; 687 } 688 689 node->bytenr = bytenr; 690 node->level = level; 691 node->lowest = 1; 692 cur = node; 693 again: 694 end = 0; 695 ptr = 0; 696 key.objectid = cur->bytenr; 697 key.type = BTRFS_METADATA_ITEM_KEY; 698 key.offset = (u64)-1; 699 700 path1->search_commit_root = 1; 701 path1->skip_locking = 1; 702 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, 703 0, 0); 704 if (ret < 0) { 705 err = ret; 706 goto out; 707 } 708 ASSERT(ret); 709 ASSERT(path1->slots[0]); 710 711 path1->slots[0]--; 712 713 WARN_ON(cur->checked); 714 if (!list_empty(&cur->upper)) { 715 /* 716 * the backref was added previously when processing 717 * backref of type BTRFS_TREE_BLOCK_REF_KEY 718 */ 719 ASSERT(list_is_singular(&cur->upper)); 720 edge = list_entry(cur->upper.next, struct backref_edge, 721 list[LOWER]); 722 ASSERT(list_empty(&edge->list[UPPER])); 723 exist = edge->node[UPPER]; 724 /* 725 * add the upper level block to pending list if we need 726 * check its backrefs 727 */ 728 if (!exist->checked) 729 list_add_tail(&edge->list[UPPER], &list); 730 } else { 731 exist = NULL; 732 } 733 734 while (1) { 735 cond_resched(); 736 eb = path1->nodes[0]; 737 738 if (ptr >= end) { 739 if (path1->slots[0] >= btrfs_header_nritems(eb)) { 740 ret = btrfs_next_leaf(rc->extent_root, path1); 741 if (ret < 0) { 742 err = ret; 743 goto out; 744 } 745 if (ret > 0) 746 break; 747 eb = path1->nodes[0]; 748 } 749 750 btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); 751 if (key.objectid != cur->bytenr) { 752 WARN_ON(exist); 753 break; 754 } 755 756 if (key.type == BTRFS_EXTENT_ITEM_KEY || 757 key.type 
== BTRFS_METADATA_ITEM_KEY) { 758 ret = find_inline_backref(eb, path1->slots[0], 759 &ptr, &end); 760 if (ret) 761 goto next; 762 } 763 } 764 765 if (ptr < end) { 766 /* update key for inline back ref */ 767 struct btrfs_extent_inline_ref *iref; 768 int type; 769 iref = (struct btrfs_extent_inline_ref *)ptr; 770 type = btrfs_get_extent_inline_ref_type(eb, iref, 771 BTRFS_REF_TYPE_BLOCK); 772 if (type == BTRFS_REF_TYPE_INVALID) { 773 err = -EUCLEAN; 774 goto out; 775 } 776 key.type = type; 777 key.offset = btrfs_extent_inline_ref_offset(eb, iref); 778 779 WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && 780 key.type != BTRFS_SHARED_BLOCK_REF_KEY); 781 } 782 783 /* 784 * Parent node found and matches current inline ref, no need to 785 * rebuild this node for this inline ref. 786 */ 787 if (exist && 788 ((key.type == BTRFS_TREE_BLOCK_REF_KEY && 789 exist->owner == key.offset) || 790 (key.type == BTRFS_SHARED_BLOCK_REF_KEY && 791 exist->bytenr == key.offset))) { 792 exist = NULL; 793 goto next; 794 } 795 796 /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ 797 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { 798 if (key.objectid == key.offset) { 799 /* 800 * Only root blocks of reloc trees use backref 801 * pointing to itself. 802 */ 803 root = find_reloc_root(rc, cur->bytenr); 804 ASSERT(root); 805 cur->root = root; 806 break; 807 } 808 809 edge = alloc_backref_edge(cache); 810 if (!edge) { 811 err = -ENOMEM; 812 goto out; 813 } 814 rb_node = tree_search(&cache->rb_root, key.offset); 815 if (!rb_node) { 816 upper = alloc_backref_node(cache); 817 if (!upper) { 818 free_backref_edge(cache, edge); 819 err = -ENOMEM; 820 goto out; 821 } 822 upper->bytenr = key.offset; 823 upper->level = cur->level + 1; 824 /* 825 * backrefs for the upper level block isn't 826 * cached, add the block to pending list 827 */ 828 list_add_tail(&edge->list[UPPER], &list); 829 } else { 830 upper = rb_entry(rb_node, struct backref_node, 831 rb_node); 832 ASSERT(upper->checked); 833 INIT_LIST_HEAD(&edge->list[UPPER]); 834 } 835 list_add_tail(&edge->list[LOWER], &cur->upper); 836 edge->node[LOWER] = cur; 837 edge->node[UPPER] = upper; 838 839 goto next; 840 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 841 err = -EINVAL; 842 btrfs_print_v0_err(rc->extent_root->fs_info); 843 btrfs_handle_fs_error(rc->extent_root->fs_info, err, 844 NULL); 845 goto out; 846 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 847 goto next; 848 } 849 850 /* 851 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset 852 * means the root objectid. We need to search the tree to get 853 * its parent bytenr. 854 */ 855 root = read_fs_root(rc->extent_root->fs_info, key.offset); 856 if (IS_ERR(root)) { 857 err = PTR_ERR(root); 858 goto out; 859 } 860 861 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 862 cur->cowonly = 1; 863 864 if (btrfs_root_level(&root->root_item) == cur->level) { 865 /* tree root */ 866 ASSERT(btrfs_root_bytenr(&root->root_item) == 867 cur->bytenr); 868 if (should_ignore_root(root)) 869 list_add(&cur->list, &useless); 870 else 871 cur->root = root; 872 break; 873 } 874 875 level = cur->level + 1; 876 877 /* Search the tree to find parent blocks referring the block. 
*/ 878 path2->search_commit_root = 1; 879 path2->skip_locking = 1; 880 path2->lowest_level = level; 881 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); 882 path2->lowest_level = 0; 883 if (ret < 0) { 884 err = ret; 885 goto out; 886 } 887 if (ret > 0 && path2->slots[level] > 0) 888 path2->slots[level]--; 889 890 eb = path2->nodes[level]; 891 if (btrfs_node_blockptr(eb, path2->slots[level]) != 892 cur->bytenr) { 893 btrfs_err(root->fs_info, 894 "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", 895 cur->bytenr, level - 1, 896 root->root_key.objectid, 897 node_key->objectid, node_key->type, 898 node_key->offset); 899 err = -ENOENT; 900 goto out; 901 } 902 lower = cur; 903 need_check = true; 904 905 /* Add all nodes and edges in the path */ 906 for (; level < BTRFS_MAX_LEVEL; level++) { 907 if (!path2->nodes[level]) { 908 ASSERT(btrfs_root_bytenr(&root->root_item) == 909 lower->bytenr); 910 if (should_ignore_root(root)) 911 list_add(&lower->list, &useless); 912 else 913 lower->root = root; 914 break; 915 } 916 917 edge = alloc_backref_edge(cache); 918 if (!edge) { 919 err = -ENOMEM; 920 goto out; 921 } 922 923 eb = path2->nodes[level]; 924 rb_node = tree_search(&cache->rb_root, eb->start); 925 if (!rb_node) { 926 upper = alloc_backref_node(cache); 927 if (!upper) { 928 free_backref_edge(cache, edge); 929 err = -ENOMEM; 930 goto out; 931 } 932 upper->bytenr = eb->start; 933 upper->owner = btrfs_header_owner(eb); 934 upper->level = lower->level + 1; 935 if (!test_bit(BTRFS_ROOT_REF_COWS, 936 &root->state)) 937 upper->cowonly = 1; 938 939 /* 940 * if we know the block isn't shared 941 * we can void checking its backrefs. 942 */ 943 if (btrfs_block_can_be_shared(root, eb)) 944 upper->checked = 0; 945 else 946 upper->checked = 1; 947 948 /* 949 * add the block to pending list if we 950 * need check its backrefs, we only do this once 951 * while walking up a tree as we will catch 952 * anything else later on. 953 */ 954 if (!upper->checked && need_check) { 955 need_check = false; 956 list_add_tail(&edge->list[UPPER], 957 &list); 958 } else { 959 if (upper->checked) 960 need_check = true; 961 INIT_LIST_HEAD(&edge->list[UPPER]); 962 } 963 } else { 964 upper = rb_entry(rb_node, struct backref_node, 965 rb_node); 966 ASSERT(upper->checked); 967 INIT_LIST_HEAD(&edge->list[UPPER]); 968 if (!upper->owner) 969 upper->owner = btrfs_header_owner(eb); 970 } 971 list_add_tail(&edge->list[LOWER], &lower->upper); 972 edge->node[LOWER] = lower; 973 edge->node[UPPER] = upper; 974 975 if (rb_node) 976 break; 977 lower = upper; 978 upper = NULL; 979 } 980 btrfs_release_path(path2); 981 next: 982 if (ptr < end) { 983 ptr += btrfs_extent_inline_ref_size(key.type); 984 if (ptr >= end) { 985 WARN_ON(ptr > end); 986 ptr = 0; 987 end = 0; 988 } 989 } 990 if (ptr >= end) 991 path1->slots[0]++; 992 } 993 btrfs_release_path(path1); 994 995 cur->checked = 1; 996 WARN_ON(exist); 997 998 /* the pending list isn't empty, take the first block to process */ 999 if (!list_empty(&list)) { 1000 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1001 list_del_init(&edge->list[UPPER]); 1002 cur = edge->node[UPPER]; 1003 goto again; 1004 } 1005 1006 /* 1007 * everything goes well, connect backref nodes and insert backref nodes 1008 * into the cache. 
1009 */ 1010 ASSERT(node->checked); 1011 cowonly = node->cowonly; 1012 if (!cowonly) { 1013 rb_node = tree_insert(&cache->rb_root, node->bytenr, 1014 &node->rb_node); 1015 if (rb_node) 1016 backref_tree_panic(rb_node, -EEXIST, node->bytenr); 1017 list_add_tail(&node->lower, &cache->leaves); 1018 } 1019 1020 list_for_each_entry(edge, &node->upper, list[LOWER]) 1021 list_add_tail(&edge->list[UPPER], &list); 1022 1023 while (!list_empty(&list)) { 1024 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1025 list_del_init(&edge->list[UPPER]); 1026 upper = edge->node[UPPER]; 1027 if (upper->detached) { 1028 list_del(&edge->list[LOWER]); 1029 lower = edge->node[LOWER]; 1030 free_backref_edge(cache, edge); 1031 if (list_empty(&lower->upper)) 1032 list_add(&lower->list, &useless); 1033 continue; 1034 } 1035 1036 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1037 if (upper->lowest) { 1038 list_del_init(&upper->lower); 1039 upper->lowest = 0; 1040 } 1041 1042 list_add_tail(&edge->list[UPPER], &upper->lower); 1043 continue; 1044 } 1045 1046 if (!upper->checked) { 1047 /* 1048 * Still want to blow up for developers since this is a 1049 * logic bug. 1050 */ 1051 ASSERT(0); 1052 err = -EINVAL; 1053 goto out; 1054 } 1055 if (cowonly != upper->cowonly) { 1056 ASSERT(0); 1057 err = -EINVAL; 1058 goto out; 1059 } 1060 1061 if (!cowonly) { 1062 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1063 &upper->rb_node); 1064 if (rb_node) 1065 backref_tree_panic(rb_node, -EEXIST, 1066 upper->bytenr); 1067 } 1068 1069 list_add_tail(&edge->list[UPPER], &upper->lower); 1070 1071 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1072 list_add_tail(&edge->list[UPPER], &list); 1073 } 1074 /* 1075 * process useless backref nodes. backref nodes for tree leaves 1076 * are deleted from the cache. backref nodes for upper level 1077 * tree blocks are left in the cache to avoid unnecessary backref 1078 * lookup. 1079 */ 1080 while (!list_empty(&useless)) { 1081 upper = list_entry(useless.next, struct backref_node, list); 1082 list_del_init(&upper->list); 1083 ASSERT(list_empty(&upper->upper)); 1084 if (upper == node) 1085 node = NULL; 1086 if (upper->lowest) { 1087 list_del_init(&upper->lower); 1088 upper->lowest = 0; 1089 } 1090 while (!list_empty(&upper->lower)) { 1091 edge = list_entry(upper->lower.next, 1092 struct backref_edge, list[UPPER]); 1093 list_del(&edge->list[UPPER]); 1094 list_del(&edge->list[LOWER]); 1095 lower = edge->node[LOWER]; 1096 free_backref_edge(cache, edge); 1097 1098 if (list_empty(&lower->upper)) 1099 list_add(&lower->list, &useless); 1100 } 1101 __mark_block_processed(rc, upper); 1102 if (upper->level > 0) { 1103 list_add(&upper->list, &cache->detached); 1104 upper->detached = 1; 1105 } else { 1106 rb_erase(&upper->rb_node, &cache->rb_root); 1107 free_backref_node(cache, upper); 1108 } 1109 } 1110 out: 1111 btrfs_free_path(path1); 1112 btrfs_free_path(path2); 1113 if (err) { 1114 while (!list_empty(&useless)) { 1115 lower = list_entry(useless.next, 1116 struct backref_node, list); 1117 list_del_init(&lower->list); 1118 } 1119 while (!list_empty(&list)) { 1120 edge = list_first_entry(&list, struct backref_edge, 1121 list[UPPER]); 1122 list_del(&edge->list[UPPER]); 1123 list_del(&edge->list[LOWER]); 1124 lower = edge->node[LOWER]; 1125 upper = edge->node[UPPER]; 1126 free_backref_edge(cache, edge); 1127 1128 /* 1129 * Lower is no longer linked to any upper backref nodes 1130 * and isn't in the cache, we can free it ourselves. 
1131 */ 1132 if (list_empty(&lower->upper) && 1133 RB_EMPTY_NODE(&lower->rb_node)) 1134 list_add(&lower->list, &useless); 1135 1136 if (!RB_EMPTY_NODE(&upper->rb_node)) 1137 continue; 1138 1139 /* Add this guy's upper edges to the list to process */ 1140 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1141 list_add_tail(&edge->list[UPPER], &list); 1142 if (list_empty(&upper->upper)) 1143 list_add(&upper->list, &useless); 1144 } 1145 1146 while (!list_empty(&useless)) { 1147 lower = list_entry(useless.next, 1148 struct backref_node, list); 1149 list_del_init(&lower->list); 1150 if (lower == node) 1151 node = NULL; 1152 free_backref_node(cache, lower); 1153 } 1154 1155 free_backref_node(cache, node); 1156 return ERR_PTR(err); 1157 } 1158 ASSERT(!node || !node->detached); 1159 return node; 1160 } 1161 1162 /* 1163 * helper to add backref node for the newly created snapshot. 1164 * the backref node is created by cloning backref node that 1165 * corresponds to root of source tree 1166 */ 1167 static int clone_backref_node(struct btrfs_trans_handle *trans, 1168 struct reloc_control *rc, 1169 struct btrfs_root *src, 1170 struct btrfs_root *dest) 1171 { 1172 struct btrfs_root *reloc_root = src->reloc_root; 1173 struct backref_cache *cache = &rc->backref_cache; 1174 struct backref_node *node = NULL; 1175 struct backref_node *new_node; 1176 struct backref_edge *edge; 1177 struct backref_edge *new_edge; 1178 struct rb_node *rb_node; 1179 1180 if (cache->last_trans > 0) 1181 update_backref_cache(trans, cache); 1182 1183 rb_node = tree_search(&cache->rb_root, src->commit_root->start); 1184 if (rb_node) { 1185 node = rb_entry(rb_node, struct backref_node, rb_node); 1186 if (node->detached) 1187 node = NULL; 1188 else 1189 BUG_ON(node->new_bytenr != reloc_root->node->start); 1190 } 1191 1192 if (!node) { 1193 rb_node = tree_search(&cache->rb_root, 1194 reloc_root->commit_root->start); 1195 if (rb_node) { 1196 node = rb_entry(rb_node, struct backref_node, 1197 rb_node); 1198 BUG_ON(node->detached); 1199 } 1200 } 1201 1202 if (!node) 1203 return 0; 1204 1205 new_node = alloc_backref_node(cache); 1206 if (!new_node) 1207 return -ENOMEM; 1208 1209 new_node->bytenr = dest->node->start; 1210 new_node->level = node->level; 1211 new_node->lowest = node->lowest; 1212 new_node->checked = 1; 1213 new_node->root = dest; 1214 1215 if (!node->lowest) { 1216 list_for_each_entry(edge, &node->lower, list[UPPER]) { 1217 new_edge = alloc_backref_edge(cache); 1218 if (!new_edge) 1219 goto fail; 1220 1221 new_edge->node[UPPER] = new_node; 1222 new_edge->node[LOWER] = edge->node[LOWER]; 1223 list_add_tail(&new_edge->list[UPPER], 1224 &new_node->lower); 1225 } 1226 } else { 1227 list_add_tail(&new_node->lower, &cache->leaves); 1228 } 1229 1230 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1231 &new_node->rb_node); 1232 if (rb_node) 1233 backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); 1234 1235 if (!new_node->lowest) { 1236 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { 1237 list_add_tail(&new_edge->list[LOWER], 1238 &new_edge->node[LOWER]->upper); 1239 } 1240 } 1241 return 0; 1242 fail: 1243 while (!list_empty(&new_node->lower)) { 1244 new_edge = list_entry(new_node->lower.next, 1245 struct backref_edge, list[UPPER]); 1246 list_del(&new_edge->list[UPPER]); 1247 free_backref_edge(cache, new_edge); 1248 } 1249 free_backref_node(cache, new_node); 1250 return -ENOMEM; 1251 } 1252 1253 /* 1254 * helper to add 'address of tree root -> reloc tree' mapping 1255 */ 1256 static int __must_check 
__add_reloc_root(struct btrfs_root *root) 1257 { 1258 struct btrfs_fs_info *fs_info = root->fs_info; 1259 struct rb_node *rb_node; 1260 struct mapping_node *node; 1261 struct reloc_control *rc = fs_info->reloc_ctl; 1262 1263 node = kmalloc(sizeof(*node), GFP_NOFS); 1264 if (!node) 1265 return -ENOMEM; 1266 1267 node->bytenr = root->node->start; 1268 node->data = root; 1269 1270 spin_lock(&rc->reloc_root_tree.lock); 1271 rb_node = tree_insert(&rc->reloc_root_tree.rb_root, 1272 node->bytenr, &node->rb_node); 1273 spin_unlock(&rc->reloc_root_tree.lock); 1274 if (rb_node) { 1275 btrfs_panic(fs_info, -EEXIST, 1276 "Duplicate root found for start=%llu while inserting into relocation tree", 1277 node->bytenr); 1278 } 1279 1280 list_add_tail(&root->root_list, &rc->reloc_roots); 1281 return 0; 1282 } 1283 1284 /* 1285 * helper to delete the 'address of tree root -> reloc tree' 1286 * mapping 1287 */ 1288 static void __del_reloc_root(struct btrfs_root *root) 1289 { 1290 struct btrfs_fs_info *fs_info = root->fs_info; 1291 struct rb_node *rb_node; 1292 struct mapping_node *node = NULL; 1293 struct reloc_control *rc = fs_info->reloc_ctl; 1294 1295 if (rc && root->node) { 1296 spin_lock(&rc->reloc_root_tree.lock); 1297 rb_node = tree_search(&rc->reloc_root_tree.rb_root, 1298 root->node->start); 1299 if (rb_node) { 1300 node = rb_entry(rb_node, struct mapping_node, rb_node); 1301 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 1302 } 1303 spin_unlock(&rc->reloc_root_tree.lock); 1304 if (!node) 1305 return; 1306 BUG_ON((struct btrfs_root *)node->data != root); 1307 } 1308 1309 spin_lock(&fs_info->trans_lock); 1310 list_del_init(&root->root_list); 1311 spin_unlock(&fs_info->trans_lock); 1312 kfree(node); 1313 } 1314 1315 /* 1316 * helper to update the 'address of tree root -> reloc tree' 1317 * mapping 1318 */ 1319 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) 1320 { 1321 struct btrfs_fs_info *fs_info = root->fs_info; 1322 struct rb_node *rb_node; 1323 struct mapping_node *node = NULL; 1324 struct reloc_control *rc = fs_info->reloc_ctl; 1325 1326 spin_lock(&rc->reloc_root_tree.lock); 1327 rb_node = tree_search(&rc->reloc_root_tree.rb_root, 1328 root->node->start); 1329 if (rb_node) { 1330 node = rb_entry(rb_node, struct mapping_node, rb_node); 1331 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 1332 } 1333 spin_unlock(&rc->reloc_root_tree.lock); 1334 1335 if (!node) 1336 return 0; 1337 BUG_ON((struct btrfs_root *)node->data != root); 1338 1339 spin_lock(&rc->reloc_root_tree.lock); 1340 node->bytenr = new_bytenr; 1341 rb_node = tree_insert(&rc->reloc_root_tree.rb_root, 1342 node->bytenr, &node->rb_node); 1343 spin_unlock(&rc->reloc_root_tree.lock); 1344 if (rb_node) 1345 backref_tree_panic(rb_node, -EEXIST, node->bytenr); 1346 return 0; 1347 } 1348 1349 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, 1350 struct btrfs_root *root, u64 objectid) 1351 { 1352 struct btrfs_fs_info *fs_info = root->fs_info; 1353 struct btrfs_root *reloc_root; 1354 struct extent_buffer *eb; 1355 struct btrfs_root_item *root_item; 1356 struct btrfs_key root_key; 1357 int ret; 1358 1359 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1360 BUG_ON(!root_item); 1361 1362 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1363 root_key.type = BTRFS_ROOT_ITEM_KEY; 1364 root_key.offset = objectid; 1365 1366 if (root->root_key.objectid == objectid) { 1367 u64 commit_root_gen; 1368 1369 /* called by btrfs_init_reloc_root */ 1370 ret = btrfs_copy_root(trans, root, 
root->commit_root, &eb, 1371 BTRFS_TREE_RELOC_OBJECTID); 1372 BUG_ON(ret); 1373 /* 1374 * Set the last_snapshot field to the generation of the commit 1375 * root - like this ctree.c:btrfs_block_can_be_shared() behaves 1376 * correctly (returns true) when the relocation root is created 1377 * either inside the critical section of a transaction commit 1378 * (through transaction.c:qgroup_account_snapshot()) and when 1379 * it's created before the transaction commit is started. 1380 */ 1381 commit_root_gen = btrfs_header_generation(root->commit_root); 1382 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); 1383 } else { 1384 /* 1385 * called by btrfs_reloc_post_snapshot_hook. 1386 * the source tree is a reloc tree, all tree blocks 1387 * modified after it was created have RELOC flag 1388 * set in their headers. so it's OK to not update 1389 * the 'last_snapshot'. 1390 */ 1391 ret = btrfs_copy_root(trans, root, root->node, &eb, 1392 BTRFS_TREE_RELOC_OBJECTID); 1393 BUG_ON(ret); 1394 } 1395 1396 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1397 btrfs_set_root_bytenr(root_item, eb->start); 1398 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1399 btrfs_set_root_generation(root_item, trans->transid); 1400 1401 if (root->root_key.objectid == objectid) { 1402 btrfs_set_root_refs(root_item, 0); 1403 memset(&root_item->drop_progress, 0, 1404 sizeof(struct btrfs_disk_key)); 1405 root_item->drop_level = 0; 1406 } 1407 1408 btrfs_tree_unlock(eb); 1409 free_extent_buffer(eb); 1410 1411 ret = btrfs_insert_root(trans, fs_info->tree_root, 1412 &root_key, root_item); 1413 BUG_ON(ret); 1414 kfree(root_item); 1415 1416 reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key); 1417 BUG_ON(IS_ERR(reloc_root)); 1418 reloc_root->last_trans = trans->transid; 1419 return reloc_root; 1420 } 1421 1422 /* 1423 * create reloc tree for a given fs tree. reloc tree is just a 1424 * snapshot of the fs tree with special root objectid. 
1425 */ 1426 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, 1427 struct btrfs_root *root) 1428 { 1429 struct btrfs_fs_info *fs_info = root->fs_info; 1430 struct btrfs_root *reloc_root; 1431 struct reloc_control *rc = fs_info->reloc_ctl; 1432 struct btrfs_block_rsv *rsv; 1433 int clear_rsv = 0; 1434 int ret; 1435 1436 if (root->reloc_root) { 1437 reloc_root = root->reloc_root; 1438 reloc_root->last_trans = trans->transid; 1439 return 0; 1440 } 1441 1442 if (!rc || !rc->create_reloc_tree || 1443 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 1444 return 0; 1445 1446 if (!trans->reloc_reserved) { 1447 rsv = trans->block_rsv; 1448 trans->block_rsv = rc->block_rsv; 1449 clear_rsv = 1; 1450 } 1451 reloc_root = create_reloc_root(trans, root, root->root_key.objectid); 1452 if (clear_rsv) 1453 trans->block_rsv = rsv; 1454 1455 ret = __add_reloc_root(reloc_root); 1456 BUG_ON(ret < 0); 1457 root->reloc_root = reloc_root; 1458 return 0; 1459 } 1460 1461 /* 1462 * update root item of reloc tree 1463 */ 1464 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, 1465 struct btrfs_root *root) 1466 { 1467 struct btrfs_fs_info *fs_info = root->fs_info; 1468 struct btrfs_root *reloc_root; 1469 struct btrfs_root_item *root_item; 1470 int ret; 1471 1472 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || 1473 !root->reloc_root) 1474 goto out; 1475 1476 reloc_root = root->reloc_root; 1477 root_item = &reloc_root->root_item; 1478 1479 /* root->reloc_root will stay until current relocation finished */ 1480 if (fs_info->reloc_ctl->merge_reloc_tree && 1481 btrfs_root_refs(root_item) == 0) { 1482 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 1483 __del_reloc_root(reloc_root); 1484 } 1485 1486 if (reloc_root->commit_root != reloc_root->node) { 1487 btrfs_set_root_node(root_item, reloc_root->node); 1488 free_extent_buffer(reloc_root->commit_root); 1489 reloc_root->commit_root = btrfs_root_node(reloc_root); 1490 } 1491 1492 ret = btrfs_update_root(trans, fs_info->tree_root, 1493 &reloc_root->root_key, root_item); 1494 BUG_ON(ret); 1495 1496 out: 1497 return 0; 1498 } 1499 1500 /* 1501 * helper to find first cached inode with inode number >= objectid 1502 * in a subvolume 1503 */ 1504 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) 1505 { 1506 struct rb_node *node; 1507 struct rb_node *prev; 1508 struct btrfs_inode *entry; 1509 struct inode *inode; 1510 1511 spin_lock(&root->inode_lock); 1512 again: 1513 node = root->inode_tree.rb_node; 1514 prev = NULL; 1515 while (node) { 1516 prev = node; 1517 entry = rb_entry(node, struct btrfs_inode, rb_node); 1518 1519 if (objectid < btrfs_ino(entry)) 1520 node = node->rb_left; 1521 else if (objectid > btrfs_ino(entry)) 1522 node = node->rb_right; 1523 else 1524 break; 1525 } 1526 if (!node) { 1527 while (prev) { 1528 entry = rb_entry(prev, struct btrfs_inode, rb_node); 1529 if (objectid <= btrfs_ino(entry)) { 1530 node = prev; 1531 break; 1532 } 1533 prev = rb_next(prev); 1534 } 1535 } 1536 while (node) { 1537 entry = rb_entry(node, struct btrfs_inode, rb_node); 1538 inode = igrab(&entry->vfs_inode); 1539 if (inode) { 1540 spin_unlock(&root->inode_lock); 1541 return inode; 1542 } 1543 1544 objectid = btrfs_ino(entry) + 1; 1545 if (cond_resched_lock(&root->inode_lock)) 1546 goto again; 1547 1548 node = rb_next(node); 1549 } 1550 spin_unlock(&root->inode_lock); 1551 return NULL; 1552 } 1553 1554 static int in_block_group(u64 bytenr, 1555 struct btrfs_block_group_cache *block_group) 1556 { 1557 if (bytenr >= 
block_group->key.objectid && 1558 bytenr < block_group->key.objectid + block_group->key.offset) 1559 return 1; 1560 return 0; 1561 } 1562 1563 /* 1564 * get new location of data 1565 */ 1566 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, 1567 u64 bytenr, u64 num_bytes) 1568 { 1569 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 1570 struct btrfs_path *path; 1571 struct btrfs_file_extent_item *fi; 1572 struct extent_buffer *leaf; 1573 int ret; 1574 1575 path = btrfs_alloc_path(); 1576 if (!path) 1577 return -ENOMEM; 1578 1579 bytenr -= BTRFS_I(reloc_inode)->index_cnt; 1580 ret = btrfs_lookup_file_extent(NULL, root, path, 1581 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); 1582 if (ret < 0) 1583 goto out; 1584 if (ret > 0) { 1585 ret = -ENOENT; 1586 goto out; 1587 } 1588 1589 leaf = path->nodes[0]; 1590 fi = btrfs_item_ptr(leaf, path->slots[0], 1591 struct btrfs_file_extent_item); 1592 1593 BUG_ON(btrfs_file_extent_offset(leaf, fi) || 1594 btrfs_file_extent_compression(leaf, fi) || 1595 btrfs_file_extent_encryption(leaf, fi) || 1596 btrfs_file_extent_other_encoding(leaf, fi)); 1597 1598 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { 1599 ret = -EINVAL; 1600 goto out; 1601 } 1602 1603 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1604 ret = 0; 1605 out: 1606 btrfs_free_path(path); 1607 return ret; 1608 } 1609 1610 /* 1611 * update file extent items in the tree leaf to point to 1612 * the new locations. 1613 */ 1614 static noinline_for_stack 1615 int replace_file_extents(struct btrfs_trans_handle *trans, 1616 struct reloc_control *rc, 1617 struct btrfs_root *root, 1618 struct extent_buffer *leaf) 1619 { 1620 struct btrfs_fs_info *fs_info = root->fs_info; 1621 struct btrfs_key key; 1622 struct btrfs_file_extent_item *fi; 1623 struct inode *inode = NULL; 1624 u64 parent; 1625 u64 bytenr; 1626 u64 new_bytenr = 0; 1627 u64 num_bytes; 1628 u64 end; 1629 u32 nritems; 1630 u32 i; 1631 int ret = 0; 1632 int first = 1; 1633 int dirty = 0; 1634 1635 if (rc->stage != UPDATE_DATA_PTRS) 1636 return 0; 1637 1638 /* reloc trees always use full backref */ 1639 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 1640 parent = leaf->start; 1641 else 1642 parent = 0; 1643 1644 nritems = btrfs_header_nritems(leaf); 1645 for (i = 0; i < nritems; i++) { 1646 cond_resched(); 1647 btrfs_item_key_to_cpu(leaf, &key, i); 1648 if (key.type != BTRFS_EXTENT_DATA_KEY) 1649 continue; 1650 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 1651 if (btrfs_file_extent_type(leaf, fi) == 1652 BTRFS_FILE_EXTENT_INLINE) 1653 continue; 1654 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1655 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1656 if (bytenr == 0) 1657 continue; 1658 if (!in_block_group(bytenr, rc->block_group)) 1659 continue; 1660 1661 /* 1662 * if we are modifying block in fs tree, wait for readpage 1663 * to complete and drop the extent cache 1664 */ 1665 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1666 if (first) { 1667 inode = find_next_inode(root, key.objectid); 1668 first = 0; 1669 } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { 1670 btrfs_add_delayed_iput(inode); 1671 inode = find_next_inode(root, key.objectid); 1672 } 1673 if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { 1674 end = key.offset + 1675 btrfs_file_extent_num_bytes(leaf, fi); 1676 WARN_ON(!IS_ALIGNED(key.offset, 1677 fs_info->sectorsize)); 1678 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1679 end--; 1680 ret = 
try_lock_extent(&BTRFS_I(inode)->io_tree, 1681 key.offset, end); 1682 if (!ret) 1683 continue; 1684 1685 btrfs_drop_extent_cache(BTRFS_I(inode), 1686 key.offset, end, 1); 1687 unlock_extent(&BTRFS_I(inode)->io_tree, 1688 key.offset, end); 1689 } 1690 } 1691 1692 ret = get_new_location(rc->data_inode, &new_bytenr, 1693 bytenr, num_bytes); 1694 if (ret) { 1695 /* 1696 * Don't have to abort since we've not changed anything 1697 * in the file extent yet. 1698 */ 1699 break; 1700 } 1701 1702 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1703 dirty = 1; 1704 1705 key.offset -= btrfs_file_extent_offset(leaf, fi); 1706 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1707 num_bytes, parent, 1708 btrfs_header_owner(leaf), 1709 key.objectid, key.offset); 1710 if (ret) { 1711 btrfs_abort_transaction(trans, ret); 1712 break; 1713 } 1714 1715 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1716 parent, btrfs_header_owner(leaf), 1717 key.objectid, key.offset); 1718 if (ret) { 1719 btrfs_abort_transaction(trans, ret); 1720 break; 1721 } 1722 } 1723 if (dirty) 1724 btrfs_mark_buffer_dirty(leaf); 1725 if (inode) 1726 btrfs_add_delayed_iput(inode); 1727 return ret; 1728 } 1729 1730 static noinline_for_stack 1731 int memcmp_node_keys(struct extent_buffer *eb, int slot, 1732 struct btrfs_path *path, int level) 1733 { 1734 struct btrfs_disk_key key1; 1735 struct btrfs_disk_key key2; 1736 btrfs_node_key(eb, &key1, slot); 1737 btrfs_node_key(path->nodes[level], &key2, path->slots[level]); 1738 return memcmp(&key1, &key2, sizeof(key1)); 1739 } 1740 1741 /* 1742 * try to replace tree blocks in fs tree with the new blocks 1743 * in reloc tree. tree blocks haven't been modified since the 1744 * reloc tree was create can be replaced. 1745 * 1746 * if a block was replaced, level of the block + 1 is returned. 1747 * if no block got replaced, 0 is returned. if there are other 1748 * errors, a negative error number is returned. 
1749 */ 1750 static noinline_for_stack 1751 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, 1752 struct btrfs_root *dest, struct btrfs_root *src, 1753 struct btrfs_path *path, struct btrfs_key *next_key, 1754 int lowest_level, int max_level) 1755 { 1756 struct btrfs_fs_info *fs_info = dest->fs_info; 1757 struct extent_buffer *eb; 1758 struct extent_buffer *parent; 1759 struct btrfs_key key; 1760 u64 old_bytenr; 1761 u64 new_bytenr; 1762 u64 old_ptr_gen; 1763 u64 new_ptr_gen; 1764 u64 last_snapshot; 1765 u32 blocksize; 1766 int cow = 0; 1767 int level; 1768 int ret; 1769 int slot; 1770 1771 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1772 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1773 1774 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1775 again: 1776 slot = path->slots[lowest_level]; 1777 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1778 1779 eb = btrfs_lock_root_node(dest); 1780 btrfs_set_lock_blocking_write(eb); 1781 level = btrfs_header_level(eb); 1782 1783 if (level < lowest_level) { 1784 btrfs_tree_unlock(eb); 1785 free_extent_buffer(eb); 1786 return 0; 1787 } 1788 1789 if (cow) { 1790 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1791 BUG_ON(ret); 1792 } 1793 btrfs_set_lock_blocking_write(eb); 1794 1795 if (next_key) { 1796 next_key->objectid = (u64)-1; 1797 next_key->type = (u8)-1; 1798 next_key->offset = (u64)-1; 1799 } 1800 1801 parent = eb; 1802 while (1) { 1803 struct btrfs_key first_key; 1804 1805 level = btrfs_header_level(parent); 1806 BUG_ON(level < lowest_level); 1807 1808 ret = btrfs_bin_search(parent, &key, level, &slot); 1809 if (ret < 0) 1810 break; 1811 if (ret && slot > 0) 1812 slot--; 1813 1814 if (next_key && slot + 1 < btrfs_header_nritems(parent)) 1815 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1816 1817 old_bytenr = btrfs_node_blockptr(parent, slot); 1818 blocksize = fs_info->nodesize; 1819 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1820 btrfs_node_key_to_cpu(parent, &first_key, slot); 1821 1822 if (level <= max_level) { 1823 eb = path->nodes[level]; 1824 new_bytenr = btrfs_node_blockptr(eb, 1825 path->slots[level]); 1826 new_ptr_gen = btrfs_node_ptr_generation(eb, 1827 path->slots[level]); 1828 } else { 1829 new_bytenr = 0; 1830 new_ptr_gen = 0; 1831 } 1832 1833 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { 1834 ret = level; 1835 break; 1836 } 1837 1838 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1839 memcmp_node_keys(parent, slot, path, level)) { 1840 if (level <= lowest_level) { 1841 ret = 0; 1842 break; 1843 } 1844 1845 eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen, 1846 level - 1, &first_key); 1847 if (IS_ERR(eb)) { 1848 ret = PTR_ERR(eb); 1849 break; 1850 } else if (!extent_buffer_uptodate(eb)) { 1851 ret = -EIO; 1852 free_extent_buffer(eb); 1853 break; 1854 } 1855 btrfs_tree_lock(eb); 1856 if (cow) { 1857 ret = btrfs_cow_block(trans, dest, eb, parent, 1858 slot, &eb); 1859 BUG_ON(ret); 1860 } 1861 btrfs_set_lock_blocking_write(eb); 1862 1863 btrfs_tree_unlock(parent); 1864 free_extent_buffer(parent); 1865 1866 parent = eb; 1867 continue; 1868 } 1869 1870 if (!cow) { 1871 btrfs_tree_unlock(parent); 1872 free_extent_buffer(parent); 1873 cow = 1; 1874 goto again; 1875 } 1876 1877 btrfs_node_key_to_cpu(path->nodes[level], &key, 1878 path->slots[level]); 1879 btrfs_release_path(path); 1880 1881 path->lowest_level = level; 1882 ret = btrfs_search_slot(trans, src, &key, path, 0, 1); 1883 path->lowest_level = 0; 
1884 BUG_ON(ret); 1885 1886 /* 1887 * Info qgroup to trace both subtrees. 1888 * 1889 * We must trace both trees. 1890 * 1) Tree reloc subtree 1891 * If not traced, we will leak data numbers 1892 * 2) Fs subtree 1893 * If not traced, we will double count old data 1894 * 1895 * We don't scan the subtree right now, but only record 1896 * the swapped tree blocks. 1897 * The real subtree rescan is delayed until we have new 1898 * CoW on the subtree root node before transaction commit. 1899 */ 1900 ret = btrfs_qgroup_add_swapped_blocks(trans, dest, 1901 rc->block_group, parent, slot, 1902 path->nodes[level], path->slots[level], 1903 last_snapshot); 1904 if (ret < 0) 1905 break; 1906 /* 1907 * swap blocks in fs tree and reloc tree. 1908 */ 1909 btrfs_set_node_blockptr(parent, slot, new_bytenr); 1910 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); 1911 btrfs_mark_buffer_dirty(parent); 1912 1913 btrfs_set_node_blockptr(path->nodes[level], 1914 path->slots[level], old_bytenr); 1915 btrfs_set_node_ptr_generation(path->nodes[level], 1916 path->slots[level], old_ptr_gen); 1917 btrfs_mark_buffer_dirty(path->nodes[level]); 1918 1919 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, 1920 blocksize, path->nodes[level]->start, 1921 src->root_key.objectid, level - 1, 0); 1922 BUG_ON(ret); 1923 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, 1924 blocksize, 0, dest->root_key.objectid, 1925 level - 1, 0); 1926 BUG_ON(ret); 1927 1928 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1929 path->nodes[level]->start, 1930 src->root_key.objectid, level - 1, 0); 1931 BUG_ON(ret); 1932 1933 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1934 0, dest->root_key.objectid, level - 1, 1935 0); 1936 BUG_ON(ret); 1937 1938 btrfs_unlock_up_safe(path, 0); 1939 1940 ret = level; 1941 break; 1942 } 1943 btrfs_tree_unlock(parent); 1944 free_extent_buffer(parent); 1945 return ret; 1946 } 1947 1948 /* 1949 * helper to find next relocated block in reloc tree 1950 */ 1951 static noinline_for_stack 1952 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1953 int *level) 1954 { 1955 struct extent_buffer *eb; 1956 int i; 1957 u64 last_snapshot; 1958 u32 nritems; 1959 1960 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1961 1962 for (i = 0; i < *level; i++) { 1963 free_extent_buffer(path->nodes[i]); 1964 path->nodes[i] = NULL; 1965 } 1966 1967 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 1968 eb = path->nodes[i]; 1969 nritems = btrfs_header_nritems(eb); 1970 while (path->slots[i] + 1 < nritems) { 1971 path->slots[i]++; 1972 if (btrfs_node_ptr_generation(eb, path->slots[i]) <= 1973 last_snapshot) 1974 continue; 1975 1976 *level = i; 1977 return 0; 1978 } 1979 free_extent_buffer(path->nodes[i]); 1980 path->nodes[i] = NULL; 1981 } 1982 return 1; 1983 } 1984 1985 /* 1986 * walk down reloc tree to find relocated block of lowest level 1987 */ 1988 static noinline_for_stack 1989 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1990 int *level) 1991 { 1992 struct btrfs_fs_info *fs_info = root->fs_info; 1993 struct extent_buffer *eb = NULL; 1994 int i; 1995 u64 bytenr; 1996 u64 ptr_gen = 0; 1997 u64 last_snapshot; 1998 u32 nritems; 1999 2000 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2001 2002 for (i = *level; i > 0; i--) { 2003 struct btrfs_key first_key; 2004 2005 eb = path->nodes[i]; 2006 nritems = btrfs_header_nritems(eb); 2007 while (path->slots[i] < nritems) { 2008 ptr_gen = btrfs_node_ptr_generation(eb, 
path->slots[i]); 2009 if (ptr_gen > last_snapshot) 2010 break; 2011 path->slots[i]++; 2012 } 2013 if (path->slots[i] >= nritems) { 2014 if (i == *level) 2015 break; 2016 *level = i + 1; 2017 return 0; 2018 } 2019 if (i == 1) { 2020 *level = i; 2021 return 0; 2022 } 2023 2024 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2025 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); 2026 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, 2027 &first_key); 2028 if (IS_ERR(eb)) { 2029 return PTR_ERR(eb); 2030 } else if (!extent_buffer_uptodate(eb)) { 2031 free_extent_buffer(eb); 2032 return -EIO; 2033 } 2034 BUG_ON(btrfs_header_level(eb) != i - 1); 2035 path->nodes[i - 1] = eb; 2036 path->slots[i - 1] = 0; 2037 } 2038 return 1; 2039 } 2040 2041 /* 2042 * invalidate extent cache for file extents whose key in range of 2043 * [min_key, max_key) 2044 */ 2045 static int invalidate_extent_cache(struct btrfs_root *root, 2046 struct btrfs_key *min_key, 2047 struct btrfs_key *max_key) 2048 { 2049 struct btrfs_fs_info *fs_info = root->fs_info; 2050 struct inode *inode = NULL; 2051 u64 objectid; 2052 u64 start, end; 2053 u64 ino; 2054 2055 objectid = min_key->objectid; 2056 while (1) { 2057 cond_resched(); 2058 iput(inode); 2059 2060 if (objectid > max_key->objectid) 2061 break; 2062 2063 inode = find_next_inode(root, objectid); 2064 if (!inode) 2065 break; 2066 ino = btrfs_ino(BTRFS_I(inode)); 2067 2068 if (ino > max_key->objectid) { 2069 iput(inode); 2070 break; 2071 } 2072 2073 objectid = ino + 1; 2074 if (!S_ISREG(inode->i_mode)) 2075 continue; 2076 2077 if (unlikely(min_key->objectid == ino)) { 2078 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 2079 continue; 2080 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 2081 start = 0; 2082 else { 2083 start = min_key->offset; 2084 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); 2085 } 2086 } else { 2087 start = 0; 2088 } 2089 2090 if (unlikely(max_key->objectid == ino)) { 2091 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 2092 continue; 2093 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 2094 end = (u64)-1; 2095 } else { 2096 if (max_key->offset == 0) 2097 continue; 2098 end = max_key->offset; 2099 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 2100 end--; 2101 } 2102 } else { 2103 end = (u64)-1; 2104 } 2105 2106 /* the lock_extent waits for readpage to complete */ 2107 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2108 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); 2109 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2110 } 2111 return 0; 2112 } 2113 2114 static int find_next_key(struct btrfs_path *path, int level, 2115 struct btrfs_key *key) 2116 2117 { 2118 while (level < BTRFS_MAX_LEVEL) { 2119 if (!path->nodes[level]) 2120 break; 2121 if (path->slots[level] + 1 < 2122 btrfs_header_nritems(path->nodes[level])) { 2123 btrfs_node_key_to_cpu(path->nodes[level], key, 2124 path->slots[level] + 1); 2125 return 0; 2126 } 2127 level++; 2128 } 2129 return 1; 2130 } 2131 2132 /* 2133 * Insert current subvolume into reloc_control::dirty_subvol_roots 2134 */ 2135 static void insert_dirty_subvol(struct btrfs_trans_handle *trans, 2136 struct reloc_control *rc, 2137 struct btrfs_root *root) 2138 { 2139 struct btrfs_root *reloc_root = root->reloc_root; 2140 struct btrfs_root_item *reloc_root_item; 2141 2142 /* @root must be a subvolume tree root with a valid reloc tree */ 2143 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 2144 ASSERT(reloc_root); 2145 2146 reloc_root_item = &reloc_root->root_item; 2147 
memset(&reloc_root_item->drop_progress, 0, 2148 sizeof(reloc_root_item->drop_progress)); 2149 reloc_root_item->drop_level = 0; 2150 btrfs_set_root_refs(reloc_root_item, 0); 2151 btrfs_update_reloc_root(trans, root); 2152 2153 if (list_empty(&root->reloc_dirty_list)) { 2154 btrfs_grab_fs_root(root); 2155 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 2156 } 2157 } 2158 2159 static int clean_dirty_subvols(struct reloc_control *rc) 2160 { 2161 struct btrfs_root *root; 2162 struct btrfs_root *next; 2163 int ret = 0; 2164 2165 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 2166 reloc_dirty_list) { 2167 struct btrfs_root *reloc_root = root->reloc_root; 2168 2169 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 2170 list_del_init(&root->reloc_dirty_list); 2171 root->reloc_root = NULL; 2172 if (reloc_root) { 2173 int ret2; 2174 2175 ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); 2176 if (ret2 < 0 && !ret) 2177 ret = ret2; 2178 } 2179 btrfs_put_fs_root(root); 2180 } 2181 return ret; 2182 } 2183 2184 /* 2185 * merge the relocated tree blocks in reloc tree with corresponding 2186 * fs tree. 2187 */ 2188 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 2189 struct btrfs_root *root) 2190 { 2191 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2192 struct btrfs_key key; 2193 struct btrfs_key next_key; 2194 struct btrfs_trans_handle *trans = NULL; 2195 struct btrfs_root *reloc_root; 2196 struct btrfs_root_item *root_item; 2197 struct btrfs_path *path; 2198 struct extent_buffer *leaf; 2199 int level; 2200 int max_level; 2201 int replaced = 0; 2202 int ret; 2203 int err = 0; 2204 u32 min_reserved; 2205 2206 path = btrfs_alloc_path(); 2207 if (!path) 2208 return -ENOMEM; 2209 path->reada = READA_FORWARD; 2210 2211 reloc_root = root->reloc_root; 2212 root_item = &reloc_root->root_item; 2213 2214 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 2215 level = btrfs_root_level(root_item); 2216 extent_buffer_get(reloc_root->node); 2217 path->nodes[level] = reloc_root->node; 2218 path->slots[level] = 0; 2219 } else { 2220 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 2221 2222 level = root_item->drop_level; 2223 BUG_ON(level == 0); 2224 path->lowest_level = level; 2225 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 2226 path->lowest_level = 0; 2227 if (ret < 0) { 2228 btrfs_free_path(path); 2229 return ret; 2230 } 2231 2232 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 2233 path->slots[level]); 2234 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 2235 2236 btrfs_unlock_up_safe(path, 0); 2237 } 2238 2239 min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2240 memset(&next_key, 0, sizeof(next_key)); 2241 2242 while (1) { 2243 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 2244 BTRFS_RESERVE_FLUSH_ALL); 2245 if (ret) { 2246 err = ret; 2247 goto out; 2248 } 2249 trans = btrfs_start_transaction(root, 0); 2250 if (IS_ERR(trans)) { 2251 err = PTR_ERR(trans); 2252 trans = NULL; 2253 goto out; 2254 } 2255 trans->block_rsv = rc->block_rsv; 2256 2257 replaced = 0; 2258 max_level = level; 2259 2260 ret = walk_down_reloc_tree(reloc_root, path, &level); 2261 if (ret < 0) { 2262 err = ret; 2263 goto out; 2264 } 2265 if (ret > 0) 2266 break; 2267 2268 if (!find_next_key(path, level, &key) && 2269 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2270 ret = 0; 2271 } else { 2272 ret = replace_path(trans, rc, root, reloc_root, path, 2273 &next_key, level, max_level); 2274 } 2275 if (ret < 0) 
{
2276 err = ret;
2277 goto out;
2278 }
2279
2280 if (ret > 0) {
2281 level = ret;
2282 btrfs_node_key_to_cpu(path->nodes[level], &key,
2283 path->slots[level]);
2284 replaced = 1;
2285 }
2286
2287 ret = walk_up_reloc_tree(reloc_root, path, &level);
2288 if (ret > 0)
2289 break;
2290
2291 BUG_ON(level == 0);
2292 /*
2293 * save the merging progress in the drop_progress.
2294 * this is OK since root refs == 1 in this case.
2295 */
2296 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2297 path->slots[level]);
2298 root_item->drop_level = level;
2299
2300 btrfs_end_transaction_throttle(trans);
2301 trans = NULL;
2302
2303 btrfs_btree_balance_dirty(fs_info);
2304
2305 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2306 invalidate_extent_cache(root, &key, &next_key);
2307 }
2308
2309 /*
2310 * handle the case where only one block in the fs tree needs to be
2311 * relocated and the block is the tree root.
2312 */
2313 leaf = btrfs_lock_root_node(root);
2314 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2315 btrfs_tree_unlock(leaf);
2316 free_extent_buffer(leaf);
2317 if (ret < 0)
2318 err = ret;
2319 out:
2320 btrfs_free_path(path);
2321
2322 if (err == 0)
2323 insert_dirty_subvol(trans, rc, root);
2324
2325 if (trans)
2326 btrfs_end_transaction_throttle(trans);
2327
2328 btrfs_btree_balance_dirty(fs_info);
2329
2330 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2331 invalidate_extent_cache(root, &key, &next_key);
2332
2333 return err;
2334 }
2335
2336 static noinline_for_stack
2337 int prepare_to_merge(struct reloc_control *rc, int err)
2338 {
2339 struct btrfs_root *root = rc->extent_root;
2340 struct btrfs_fs_info *fs_info = root->fs_info;
2341 struct btrfs_root *reloc_root;
2342 struct btrfs_trans_handle *trans;
2343 LIST_HEAD(reloc_roots);
2344 u64 num_bytes = 0;
2345 int ret;
2346
2347 mutex_lock(&fs_info->reloc_mutex);
2348 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2349 rc->merging_rsv_size += rc->nodes_relocated * 2;
2350 mutex_unlock(&fs_info->reloc_mutex);
2351
2352 again:
2353 if (!err) {
2354 num_bytes = rc->merging_rsv_size;
2355 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2356 BTRFS_RESERVE_FLUSH_ALL);
2357 if (ret)
2358 err = ret;
2359 }
2360
2361 trans = btrfs_join_transaction(rc->extent_root);
2362 if (IS_ERR(trans)) {
2363 if (!err)
2364 btrfs_block_rsv_release(fs_info, rc->block_rsv,
2365 num_bytes);
2366 return PTR_ERR(trans);
2367 }
2368
2369 if (!err) {
2370 if (num_bytes != rc->merging_rsv_size) {
2371 btrfs_end_transaction(trans);
2372 btrfs_block_rsv_release(fs_info, rc->block_rsv,
2373 num_bytes);
2374 goto again;
2375 }
2376 }
2377
2378 rc->merge_reloc_tree = 1;
2379
2380 while (!list_empty(&rc->reloc_roots)) {
2381 reloc_root = list_entry(rc->reloc_roots.next,
2382 struct btrfs_root, root_list);
2383 list_del_init(&reloc_root->root_list);
2384
2385 root = read_fs_root(fs_info, reloc_root->root_key.offset);
2386 BUG_ON(IS_ERR(root));
2387 BUG_ON(root->reloc_root != reloc_root);
2388
2389 /*
2390 * set reference count to 1, so btrfs_recover_relocation
2391 * knows it should resume merging
2392 */
2393 if (!err)
2394 btrfs_set_root_refs(&reloc_root->root_item, 1);
2395 btrfs_update_reloc_root(trans, root);
2396
2397 list_add(&reloc_root->root_list, &reloc_roots);
2398 }
2399
2400 list_splice(&reloc_roots, &rc->reloc_roots);
2401
2402 if (!err)
2403 btrfs_commit_transaction(trans);
2404 else
2405 btrfs_end_transaction(trans);
2406 return err;
2407 }
2408
2409 static noinline_for_stack
2410 void
free_reloc_roots(struct list_head *list) 2411 { 2412 struct btrfs_root *reloc_root; 2413 2414 while (!list_empty(list)) { 2415 reloc_root = list_entry(list->next, struct btrfs_root, 2416 root_list); 2417 __del_reloc_root(reloc_root); 2418 free_extent_buffer(reloc_root->node); 2419 free_extent_buffer(reloc_root->commit_root); 2420 reloc_root->node = NULL; 2421 reloc_root->commit_root = NULL; 2422 } 2423 } 2424 2425 static noinline_for_stack 2426 void merge_reloc_roots(struct reloc_control *rc) 2427 { 2428 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2429 struct btrfs_root *root; 2430 struct btrfs_root *reloc_root; 2431 LIST_HEAD(reloc_roots); 2432 int found = 0; 2433 int ret = 0; 2434 again: 2435 root = rc->extent_root; 2436 2437 /* 2438 * this serializes us with btrfs_record_root_in_transaction, 2439 * we have to make sure nobody is in the middle of 2440 * adding their roots to the list while we are 2441 * doing this splice 2442 */ 2443 mutex_lock(&fs_info->reloc_mutex); 2444 list_splice_init(&rc->reloc_roots, &reloc_roots); 2445 mutex_unlock(&fs_info->reloc_mutex); 2446 2447 while (!list_empty(&reloc_roots)) { 2448 found = 1; 2449 reloc_root = list_entry(reloc_roots.next, 2450 struct btrfs_root, root_list); 2451 2452 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 2453 root = read_fs_root(fs_info, 2454 reloc_root->root_key.offset); 2455 BUG_ON(IS_ERR(root)); 2456 BUG_ON(root->reloc_root != reloc_root); 2457 2458 ret = merge_reloc_root(rc, root); 2459 if (ret) { 2460 if (list_empty(&reloc_root->root_list)) 2461 list_add_tail(&reloc_root->root_list, 2462 &reloc_roots); 2463 goto out; 2464 } 2465 } else { 2466 list_del_init(&reloc_root->root_list); 2467 } 2468 } 2469 2470 if (found) { 2471 found = 0; 2472 goto again; 2473 } 2474 out: 2475 if (ret) { 2476 btrfs_handle_fs_error(fs_info, ret, NULL); 2477 if (!list_empty(&reloc_roots)) 2478 free_reloc_roots(&reloc_roots); 2479 2480 /* new reloc root may be added */ 2481 mutex_lock(&fs_info->reloc_mutex); 2482 list_splice_init(&rc->reloc_roots, &reloc_roots); 2483 mutex_unlock(&fs_info->reloc_mutex); 2484 if (!list_empty(&reloc_roots)) 2485 free_reloc_roots(&reloc_roots); 2486 } 2487 2488 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2489 } 2490 2491 static void free_block_list(struct rb_root *blocks) 2492 { 2493 struct tree_block *block; 2494 struct rb_node *rb_node; 2495 while ((rb_node = rb_first(blocks))) { 2496 block = rb_entry(rb_node, struct tree_block, rb_node); 2497 rb_erase(rb_node, blocks); 2498 kfree(block); 2499 } 2500 } 2501 2502 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2503 struct btrfs_root *reloc_root) 2504 { 2505 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2506 struct btrfs_root *root; 2507 2508 if (reloc_root->last_trans == trans->transid) 2509 return 0; 2510 2511 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2512 BUG_ON(IS_ERR(root)); 2513 BUG_ON(root->reloc_root != reloc_root); 2514 2515 return btrfs_record_root_in_trans(trans, root); 2516 } 2517 2518 static noinline_for_stack 2519 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2520 struct reloc_control *rc, 2521 struct backref_node *node, 2522 struct backref_edge *edges[]) 2523 { 2524 struct backref_node *next; 2525 struct btrfs_root *root; 2526 int index = 0; 2527 2528 next = node; 2529 while (1) { 2530 cond_resched(); 2531 next = walk_up_backref(next, edges, &index); 2532 root = next->root; 2533 BUG_ON(!root); 2534 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); 2535 
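/*
 * The walk reached a tree root.  If it is already a reloc root we
 * only need to record it in the transaction; otherwise switch to
 * the subvolume's reloc tree and record where this node lives in it.
 */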
2536 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2537 record_reloc_root_in_trans(trans, root); 2538 break; 2539 } 2540 2541 btrfs_record_root_in_trans(trans, root); 2542 root = root->reloc_root; 2543 2544 if (next->new_bytenr != root->node->start) { 2545 BUG_ON(next->new_bytenr); 2546 BUG_ON(!list_empty(&next->list)); 2547 next->new_bytenr = root->node->start; 2548 next->root = root; 2549 list_add_tail(&next->list, 2550 &rc->backref_cache.changed); 2551 __mark_block_processed(rc, next); 2552 break; 2553 } 2554 2555 WARN_ON(1); 2556 root = NULL; 2557 next = walk_down_backref(edges, &index); 2558 if (!next || next->level <= node->level) 2559 break; 2560 } 2561 if (!root) 2562 return NULL; 2563 2564 next = node; 2565 /* setup backref node path for btrfs_reloc_cow_block */ 2566 while (1) { 2567 rc->backref_cache.path[next->level] = next; 2568 if (--index < 0) 2569 break; 2570 next = edges[index]->node[UPPER]; 2571 } 2572 return root; 2573 } 2574 2575 /* 2576 * select a tree root for relocation. return NULL if the block 2577 * is reference counted. we should use do_relocation() in this 2578 * case. return a tree root pointer if the block isn't reference 2579 * counted. return -ENOENT if the block is root of reloc tree. 2580 */ 2581 static noinline_for_stack 2582 struct btrfs_root *select_one_root(struct backref_node *node) 2583 { 2584 struct backref_node *next; 2585 struct btrfs_root *root; 2586 struct btrfs_root *fs_root = NULL; 2587 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2588 int index = 0; 2589 2590 next = node; 2591 while (1) { 2592 cond_resched(); 2593 next = walk_up_backref(next, edges, &index); 2594 root = next->root; 2595 BUG_ON(!root); 2596 2597 /* no other choice for non-references counted tree */ 2598 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 2599 return root; 2600 2601 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2602 fs_root = root; 2603 2604 if (next != node) 2605 return NULL; 2606 2607 next = walk_down_backref(edges, &index); 2608 if (!next || next->level <= node->level) 2609 break; 2610 } 2611 2612 if (!fs_root) 2613 return ERR_PTR(-ENOENT); 2614 return fs_root; 2615 } 2616 2617 static noinline_for_stack 2618 u64 calcu_metadata_size(struct reloc_control *rc, 2619 struct backref_node *node, int reserve) 2620 { 2621 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2622 struct backref_node *next = node; 2623 struct backref_edge *edge; 2624 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2625 u64 num_bytes = 0; 2626 int index = 0; 2627 2628 BUG_ON(reserve && node->processed); 2629 2630 while (next) { 2631 cond_resched(); 2632 while (1) { 2633 if (next->processed && (reserve || next != node)) 2634 break; 2635 2636 num_bytes += fs_info->nodesize; 2637 2638 if (list_empty(&next->upper)) 2639 break; 2640 2641 edge = list_entry(next->upper.next, 2642 struct backref_edge, list[LOWER]); 2643 edges[index++] = edge; 2644 next = edge->node[UPPER]; 2645 } 2646 next = walk_down_backref(edges, &index); 2647 } 2648 return num_bytes; 2649 } 2650 2651 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2652 struct reloc_control *rc, 2653 struct backref_node *node) 2654 { 2655 struct btrfs_root *root = rc->extent_root; 2656 struct btrfs_fs_info *fs_info = root->fs_info; 2657 u64 num_bytes; 2658 int ret; 2659 u64 tmp; 2660 2661 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2662 2663 trans->block_rsv = rc->block_rsv; 2664 rc->reserved_bytes += num_bytes; 2665 2666 /* 2667 * We are under a transaction here so we can only do 
limited flushing. 2668 * If we get an enospc just kick back -EAGAIN so we know to drop the 2669 * transaction and try to refill when we can flush all the things. 2670 */ 2671 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, 2672 BTRFS_RESERVE_FLUSH_LIMIT); 2673 if (ret) { 2674 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2675 while (tmp <= rc->reserved_bytes) 2676 tmp <<= 1; 2677 /* 2678 * only one thread can access block_rsv at this point, 2679 * so we don't need hold lock to protect block_rsv. 2680 * we expand more reservation size here to allow enough 2681 * space for relocation and we will return earlier in 2682 * enospc case. 2683 */ 2684 rc->block_rsv->size = tmp + fs_info->nodesize * 2685 RELOCATION_RESERVED_NODES; 2686 return -EAGAIN; 2687 } 2688 2689 return 0; 2690 } 2691 2692 /* 2693 * relocate a block tree, and then update pointers in upper level 2694 * blocks that reference the block to point to the new location. 2695 * 2696 * if called by link_to_upper, the block has already been relocated. 2697 * in that case this function just updates pointers. 2698 */ 2699 static int do_relocation(struct btrfs_trans_handle *trans, 2700 struct reloc_control *rc, 2701 struct backref_node *node, 2702 struct btrfs_key *key, 2703 struct btrfs_path *path, int lowest) 2704 { 2705 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2706 struct backref_node *upper; 2707 struct backref_edge *edge; 2708 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2709 struct btrfs_root *root; 2710 struct extent_buffer *eb; 2711 u32 blocksize; 2712 u64 bytenr; 2713 u64 generation; 2714 int slot; 2715 int ret; 2716 int err = 0; 2717 2718 BUG_ON(lowest && node->eb); 2719 2720 path->lowest_level = node->level + 1; 2721 rc->backref_cache.path[node->level] = node; 2722 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2723 struct btrfs_key first_key; 2724 2725 cond_resched(); 2726 2727 upper = edge->node[UPPER]; 2728 root = select_reloc_root(trans, rc, upper, edges); 2729 BUG_ON(!root); 2730 2731 if (upper->eb && !upper->locked) { 2732 if (!lowest) { 2733 ret = btrfs_bin_search(upper->eb, key, 2734 upper->level, &slot); 2735 if (ret < 0) { 2736 err = ret; 2737 goto next; 2738 } 2739 BUG_ON(ret); 2740 bytenr = btrfs_node_blockptr(upper->eb, slot); 2741 if (node->eb->start == bytenr) 2742 goto next; 2743 } 2744 drop_node_buffer(upper); 2745 } 2746 2747 if (!upper->eb) { 2748 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2749 if (ret) { 2750 if (ret < 0) 2751 err = ret; 2752 else 2753 err = -ENOENT; 2754 2755 btrfs_release_path(path); 2756 break; 2757 } 2758 2759 if (!upper->eb) { 2760 upper->eb = path->nodes[upper->level]; 2761 path->nodes[upper->level] = NULL; 2762 } else { 2763 BUG_ON(upper->eb != path->nodes[upper->level]); 2764 } 2765 2766 upper->locked = 1; 2767 path->locks[upper->level] = 0; 2768 2769 slot = path->slots[upper->level]; 2770 btrfs_release_path(path); 2771 } else { 2772 ret = btrfs_bin_search(upper->eb, key, upper->level, 2773 &slot); 2774 if (ret < 0) { 2775 err = ret; 2776 goto next; 2777 } 2778 BUG_ON(ret); 2779 } 2780 2781 bytenr = btrfs_node_blockptr(upper->eb, slot); 2782 if (lowest) { 2783 if (bytenr != node->bytenr) { 2784 btrfs_err(root->fs_info, 2785 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2786 bytenr, node->bytenr, slot, 2787 upper->eb->start); 2788 err = -EIO; 2789 goto next; 2790 } 2791 } else { 2792 if (node->eb->start == bytenr) 2793 goto next; 2794 } 2795 2796 blocksize = root->fs_info->nodesize; 2797 
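/*
 * Read and lock the child block.  If it has not been COWed yet,
 * btrfs_cow_block() below relocates it in place; otherwise the
 * parent pointer is switched to the already relocated copy.
 */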
generation = btrfs_node_ptr_generation(upper->eb, slot); 2798 btrfs_node_key_to_cpu(upper->eb, &first_key, slot); 2799 eb = read_tree_block(fs_info, bytenr, generation, 2800 upper->level - 1, &first_key); 2801 if (IS_ERR(eb)) { 2802 err = PTR_ERR(eb); 2803 goto next; 2804 } else if (!extent_buffer_uptodate(eb)) { 2805 free_extent_buffer(eb); 2806 err = -EIO; 2807 goto next; 2808 } 2809 btrfs_tree_lock(eb); 2810 btrfs_set_lock_blocking_write(eb); 2811 2812 if (!node->eb) { 2813 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2814 slot, &eb); 2815 btrfs_tree_unlock(eb); 2816 free_extent_buffer(eb); 2817 if (ret < 0) { 2818 err = ret; 2819 goto next; 2820 } 2821 BUG_ON(node->eb != eb); 2822 } else { 2823 btrfs_set_node_blockptr(upper->eb, slot, 2824 node->eb->start); 2825 btrfs_set_node_ptr_generation(upper->eb, slot, 2826 trans->transid); 2827 btrfs_mark_buffer_dirty(upper->eb); 2828 2829 ret = btrfs_inc_extent_ref(trans, root, 2830 node->eb->start, blocksize, 2831 upper->eb->start, 2832 btrfs_header_owner(upper->eb), 2833 node->level, 0); 2834 BUG_ON(ret); 2835 2836 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2837 BUG_ON(ret); 2838 } 2839 next: 2840 if (!upper->pending) 2841 drop_node_buffer(upper); 2842 else 2843 unlock_node_buffer(upper); 2844 if (err) 2845 break; 2846 } 2847 2848 if (!err && node->pending) { 2849 drop_node_buffer(node); 2850 list_move_tail(&node->list, &rc->backref_cache.changed); 2851 node->pending = 0; 2852 } 2853 2854 path->lowest_level = 0; 2855 BUG_ON(err == -ENOSPC); 2856 return err; 2857 } 2858 2859 static int link_to_upper(struct btrfs_trans_handle *trans, 2860 struct reloc_control *rc, 2861 struct backref_node *node, 2862 struct btrfs_path *path) 2863 { 2864 struct btrfs_key key; 2865 2866 btrfs_node_key_to_cpu(node->eb, &key, 0); 2867 return do_relocation(trans, rc, node, &key, path, 0); 2868 } 2869 2870 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2871 struct reloc_control *rc, 2872 struct btrfs_path *path, int err) 2873 { 2874 LIST_HEAD(list); 2875 struct backref_cache *cache = &rc->backref_cache; 2876 struct backref_node *node; 2877 int level; 2878 int ret; 2879 2880 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2881 while (!list_empty(&cache->pending[level])) { 2882 node = list_entry(cache->pending[level].next, 2883 struct backref_node, list); 2884 list_move_tail(&node->list, &list); 2885 BUG_ON(!node->pending); 2886 2887 if (!err) { 2888 ret = link_to_upper(trans, rc, node, path); 2889 if (ret < 0) 2890 err = ret; 2891 } 2892 } 2893 list_splice_init(&list, &cache->pending[level]); 2894 } 2895 return err; 2896 } 2897 2898 static void mark_block_processed(struct reloc_control *rc, 2899 u64 bytenr, u32 blocksize) 2900 { 2901 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, 2902 EXTENT_DIRTY); 2903 } 2904 2905 static void __mark_block_processed(struct reloc_control *rc, 2906 struct backref_node *node) 2907 { 2908 u32 blocksize; 2909 if (node->level == 0 || 2910 in_block_group(node->bytenr, rc->block_group)) { 2911 blocksize = rc->extent_root->fs_info->nodesize; 2912 mark_block_processed(rc, node->bytenr, blocksize); 2913 } 2914 node->processed = 1; 2915 } 2916 2917 /* 2918 * mark a block and all blocks directly/indirectly reference the block 2919 * as processed. 
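 *
 * e.g. once a leaf is relocated, every node on the path from the
 * leaf up to the tree root is marked as well, so the subtree is not
 * picked up again by a later pass.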
2920 */ 2921 static void update_processed_blocks(struct reloc_control *rc, 2922 struct backref_node *node) 2923 { 2924 struct backref_node *next = node; 2925 struct backref_edge *edge; 2926 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2927 int index = 0; 2928 2929 while (next) { 2930 cond_resched(); 2931 while (1) { 2932 if (next->processed) 2933 break; 2934 2935 __mark_block_processed(rc, next); 2936 2937 if (list_empty(&next->upper)) 2938 break; 2939 2940 edge = list_entry(next->upper.next, 2941 struct backref_edge, list[LOWER]); 2942 edges[index++] = edge; 2943 next = edge->node[UPPER]; 2944 } 2945 next = walk_down_backref(edges, &index); 2946 } 2947 } 2948 2949 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2950 { 2951 u32 blocksize = rc->extent_root->fs_info->nodesize; 2952 2953 if (test_range_bit(&rc->processed_blocks, bytenr, 2954 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2955 return 1; 2956 return 0; 2957 } 2958 2959 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2960 struct tree_block *block) 2961 { 2962 struct extent_buffer *eb; 2963 2964 BUG_ON(block->key_ready); 2965 eb = read_tree_block(fs_info, block->bytenr, block->key.offset, 2966 block->level, NULL); 2967 if (IS_ERR(eb)) { 2968 return PTR_ERR(eb); 2969 } else if (!extent_buffer_uptodate(eb)) { 2970 free_extent_buffer(eb); 2971 return -EIO; 2972 } 2973 if (block->level == 0) 2974 btrfs_item_key_to_cpu(eb, &block->key, 0); 2975 else 2976 btrfs_node_key_to_cpu(eb, &block->key, 0); 2977 free_extent_buffer(eb); 2978 block->key_ready = 1; 2979 return 0; 2980 } 2981 2982 /* 2983 * helper function to relocate a tree block 2984 */ 2985 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2986 struct reloc_control *rc, 2987 struct backref_node *node, 2988 struct btrfs_key *key, 2989 struct btrfs_path *path) 2990 { 2991 struct btrfs_root *root; 2992 int ret = 0; 2993 2994 if (!node) 2995 return 0; 2996 2997 BUG_ON(node->processed); 2998 root = select_one_root(node); 2999 if (root == ERR_PTR(-ENOENT)) { 3000 update_processed_blocks(rc, node); 3001 goto out; 3002 } 3003 3004 if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3005 ret = reserve_metadata_space(trans, rc, node); 3006 if (ret) 3007 goto out; 3008 } 3009 3010 if (root) { 3011 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3012 BUG_ON(node->new_bytenr); 3013 BUG_ON(!list_empty(&node->list)); 3014 btrfs_record_root_in_trans(trans, root); 3015 root = root->reloc_root; 3016 node->new_bytenr = root->node->start; 3017 node->root = root; 3018 list_add_tail(&node->list, &rc->backref_cache.changed); 3019 } else { 3020 path->lowest_level = node->level; 3021 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 3022 btrfs_release_path(path); 3023 if (ret > 0) 3024 ret = 0; 3025 } 3026 if (!ret) 3027 update_processed_blocks(rc, node); 3028 } else { 3029 ret = do_relocation(trans, rc, node, key, path, 1); 3030 } 3031 out: 3032 if (ret || node->level == 0 || node->cowonly) 3033 remove_backref_node(&rc->backref_cache, node); 3034 return ret; 3035 } 3036 3037 /* 3038 * relocate a list of blocks 3039 */ 3040 static noinline_for_stack 3041 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 3042 struct reloc_control *rc, struct rb_root *blocks) 3043 { 3044 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3045 struct backref_node *node; 3046 struct btrfs_path *path; 3047 struct tree_block *block; 3048 struct tree_block *next; 3049 int ret; 3050 int err = 0; 3051 3052 path = btrfs_alloc_path(); 3053 if 
(!path) { 3054 err = -ENOMEM; 3055 goto out_free_blocks; 3056 } 3057 3058 /* Kick in readahead for tree blocks with missing keys */ 3059 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3060 if (!block->key_ready) 3061 readahead_tree_block(fs_info, block->bytenr); 3062 } 3063 3064 /* Get first keys */ 3065 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3066 if (!block->key_ready) { 3067 err = get_tree_block_key(fs_info, block); 3068 if (err) 3069 goto out_free_path; 3070 } 3071 } 3072 3073 /* Do tree relocation */ 3074 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3075 node = build_backref_tree(rc, &block->key, 3076 block->level, block->bytenr); 3077 if (IS_ERR(node)) { 3078 err = PTR_ERR(node); 3079 goto out; 3080 } 3081 3082 ret = relocate_tree_block(trans, rc, node, &block->key, 3083 path); 3084 if (ret < 0) { 3085 if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) 3086 err = ret; 3087 goto out; 3088 } 3089 } 3090 out: 3091 err = finish_pending_nodes(trans, rc, path, err); 3092 3093 out_free_path: 3094 btrfs_free_path(path); 3095 out_free_blocks: 3096 free_block_list(blocks); 3097 return err; 3098 } 3099 3100 static noinline_for_stack 3101 int prealloc_file_extent_cluster(struct inode *inode, 3102 struct file_extent_cluster *cluster) 3103 { 3104 u64 alloc_hint = 0; 3105 u64 start; 3106 u64 end; 3107 u64 offset = BTRFS_I(inode)->index_cnt; 3108 u64 num_bytes; 3109 int nr = 0; 3110 int ret = 0; 3111 u64 prealloc_start = cluster->start - offset; 3112 u64 prealloc_end = cluster->end - offset; 3113 u64 cur_offset; 3114 struct extent_changeset *data_reserved = NULL; 3115 3116 BUG_ON(cluster->start != cluster->boundary[0]); 3117 inode_lock(inode); 3118 3119 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, 3120 prealloc_end + 1 - prealloc_start); 3121 if (ret) 3122 goto out; 3123 3124 cur_offset = prealloc_start; 3125 while (nr < cluster->nr) { 3126 start = cluster->boundary[nr] - offset; 3127 if (nr + 1 < cluster->nr) 3128 end = cluster->boundary[nr + 1] - 1 - offset; 3129 else 3130 end = cluster->end - offset; 3131 3132 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3133 num_bytes = end + 1 - start; 3134 if (cur_offset < start) 3135 btrfs_free_reserved_data_space(inode, data_reserved, 3136 cur_offset, start - cur_offset); 3137 ret = btrfs_prealloc_file_range(inode, 0, start, 3138 num_bytes, num_bytes, 3139 end + 1, &alloc_hint); 3140 cur_offset = end + 1; 3141 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3142 if (ret) 3143 break; 3144 nr++; 3145 } 3146 if (cur_offset < prealloc_end) 3147 btrfs_free_reserved_data_space(inode, data_reserved, 3148 cur_offset, prealloc_end + 1 - cur_offset); 3149 out: 3150 inode_unlock(inode); 3151 extent_changeset_free(data_reserved); 3152 return ret; 3153 } 3154 3155 static noinline_for_stack 3156 int setup_extent_mapping(struct inode *inode, u64 start, u64 end, 3157 u64 block_start) 3158 { 3159 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3160 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3161 struct extent_map *em; 3162 int ret = 0; 3163 3164 em = alloc_extent_map(); 3165 if (!em) 3166 return -ENOMEM; 3167 3168 em->start = start; 3169 em->len = end + 1 - start; 3170 em->block_len = em->len; 3171 em->block_start = block_start; 3172 em->bdev = fs_info->fs_devices->latest_bdev; 3173 set_bit(EXTENT_FLAG_PINNED, &em->flags); 3174 3175 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3176 while (1) { 3177 
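/*
 * Insert the pinned mapping; on -EEXIST drop whatever cached
 * mappings overlap the range and retry until the insert succeeds.
 */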
write_lock(&em_tree->lock); 3178 ret = add_extent_mapping(em_tree, em, 0); 3179 write_unlock(&em_tree->lock); 3180 if (ret != -EEXIST) { 3181 free_extent_map(em); 3182 break; 3183 } 3184 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 3185 } 3186 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3187 return ret; 3188 } 3189 3190 static int relocate_file_extent_cluster(struct inode *inode, 3191 struct file_extent_cluster *cluster) 3192 { 3193 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3194 u64 page_start; 3195 u64 page_end; 3196 u64 offset = BTRFS_I(inode)->index_cnt; 3197 unsigned long index; 3198 unsigned long last_index; 3199 struct page *page; 3200 struct file_ra_state *ra; 3201 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 3202 int nr = 0; 3203 int ret = 0; 3204 3205 if (!cluster->nr) 3206 return 0; 3207 3208 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3209 if (!ra) 3210 return -ENOMEM; 3211 3212 ret = prealloc_file_extent_cluster(inode, cluster); 3213 if (ret) 3214 goto out; 3215 3216 file_ra_state_init(ra, inode->i_mapping); 3217 3218 ret = setup_extent_mapping(inode, cluster->start - offset, 3219 cluster->end - offset, cluster->start); 3220 if (ret) 3221 goto out; 3222 3223 index = (cluster->start - offset) >> PAGE_SHIFT; 3224 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3225 while (index <= last_index) { 3226 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3227 PAGE_SIZE); 3228 if (ret) 3229 goto out; 3230 3231 page = find_lock_page(inode->i_mapping, index); 3232 if (!page) { 3233 page_cache_sync_readahead(inode->i_mapping, 3234 ra, NULL, index, 3235 last_index + 1 - index); 3236 page = find_or_create_page(inode->i_mapping, index, 3237 mask); 3238 if (!page) { 3239 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3240 PAGE_SIZE, true); 3241 ret = -ENOMEM; 3242 goto out; 3243 } 3244 } 3245 3246 if (PageReadahead(page)) { 3247 page_cache_async_readahead(inode->i_mapping, 3248 ra, NULL, page, index, 3249 last_index + 1 - index); 3250 } 3251 3252 if (!PageUptodate(page)) { 3253 btrfs_readpage(NULL, page); 3254 lock_page(page); 3255 if (!PageUptodate(page)) { 3256 unlock_page(page); 3257 put_page(page); 3258 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3259 PAGE_SIZE, true); 3260 btrfs_delalloc_release_extents(BTRFS_I(inode), 3261 PAGE_SIZE, true); 3262 ret = -EIO; 3263 goto out; 3264 } 3265 } 3266 3267 page_start = page_offset(page); 3268 page_end = page_start + PAGE_SIZE - 1; 3269 3270 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3271 3272 set_page_extent_mapped(page); 3273 3274 if (nr < cluster->nr && 3275 page_start + offset == cluster->boundary[nr]) { 3276 set_extent_bits(&BTRFS_I(inode)->io_tree, 3277 page_start, page_end, 3278 EXTENT_BOUNDARY); 3279 nr++; 3280 } 3281 3282 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 3283 NULL, 0); 3284 if (ret) { 3285 unlock_page(page); 3286 put_page(page); 3287 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3288 PAGE_SIZE, true); 3289 btrfs_delalloc_release_extents(BTRFS_I(inode), 3290 PAGE_SIZE, true); 3291 3292 clear_extent_bits(&BTRFS_I(inode)->io_tree, 3293 page_start, page_end, 3294 EXTENT_LOCKED | EXTENT_BOUNDARY); 3295 goto out; 3296 3297 } 3298 set_page_dirty(page); 3299 3300 unlock_extent(&BTRFS_I(inode)->io_tree, 3301 page_start, page_end); 3302 unlock_page(page); 3303 put_page(page); 3304 3305 index++; 3306 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, 3307 false); 3308 balance_dirty_pages_ratelimited(inode->i_mapping); 3309 
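/* Let any pending transaction commit catch up before the next page. */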
btrfs_throttle(fs_info); 3310 } 3311 WARN_ON(nr != cluster->nr); 3312 out: 3313 kfree(ra); 3314 return ret; 3315 } 3316 3317 static noinline_for_stack 3318 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3319 struct file_extent_cluster *cluster) 3320 { 3321 int ret; 3322 3323 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3324 ret = relocate_file_extent_cluster(inode, cluster); 3325 if (ret) 3326 return ret; 3327 cluster->nr = 0; 3328 } 3329 3330 if (!cluster->nr) 3331 cluster->start = extent_key->objectid; 3332 else 3333 BUG_ON(cluster->nr >= MAX_EXTENTS); 3334 cluster->end = extent_key->objectid + extent_key->offset - 1; 3335 cluster->boundary[cluster->nr] = extent_key->objectid; 3336 cluster->nr++; 3337 3338 if (cluster->nr >= MAX_EXTENTS) { 3339 ret = relocate_file_extent_cluster(inode, cluster); 3340 if (ret) 3341 return ret; 3342 cluster->nr = 0; 3343 } 3344 return 0; 3345 } 3346 3347 /* 3348 * helper to add a tree block to the list. 3349 * the major work is getting the generation and level of the block 3350 */ 3351 static int add_tree_block(struct reloc_control *rc, 3352 struct btrfs_key *extent_key, 3353 struct btrfs_path *path, 3354 struct rb_root *blocks) 3355 { 3356 struct extent_buffer *eb; 3357 struct btrfs_extent_item *ei; 3358 struct btrfs_tree_block_info *bi; 3359 struct tree_block *block; 3360 struct rb_node *rb_node; 3361 u32 item_size; 3362 int level = -1; 3363 u64 generation; 3364 3365 eb = path->nodes[0]; 3366 item_size = btrfs_item_size_nr(eb, path->slots[0]); 3367 3368 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3369 item_size >= sizeof(*ei) + sizeof(*bi)) { 3370 ei = btrfs_item_ptr(eb, path->slots[0], 3371 struct btrfs_extent_item); 3372 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3373 bi = (struct btrfs_tree_block_info *)(ei + 1); 3374 level = btrfs_tree_block_level(eb, bi); 3375 } else { 3376 level = (int)extent_key->offset; 3377 } 3378 generation = btrfs_extent_generation(eb, ei); 3379 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3380 btrfs_print_v0_err(eb->fs_info); 3381 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3382 return -EINVAL; 3383 } else { 3384 BUG(); 3385 } 3386 3387 btrfs_release_path(path); 3388 3389 BUG_ON(level == -1); 3390 3391 block = kmalloc(sizeof(*block), GFP_NOFS); 3392 if (!block) 3393 return -ENOMEM; 3394 3395 block->bytenr = extent_key->objectid; 3396 block->key.objectid = rc->extent_root->fs_info->nodesize; 3397 block->key.offset = generation; 3398 block->level = level; 3399 block->key_ready = 0; 3400 3401 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); 3402 if (rb_node) 3403 backref_tree_panic(rb_node, -EEXIST, block->bytenr); 3404 3405 return 0; 3406 } 3407 3408 /* 3409 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3410 */ 3411 static int __add_tree_block(struct reloc_control *rc, 3412 u64 bytenr, u32 blocksize, 3413 struct rb_root *blocks) 3414 { 3415 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3416 struct btrfs_path *path; 3417 struct btrfs_key key; 3418 int ret; 3419 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3420 3421 if (tree_block_processed(bytenr, rc)) 3422 return 0; 3423 3424 if (tree_search(blocks, bytenr)) 3425 return 0; 3426 3427 path = btrfs_alloc_path(); 3428 if (!path) 3429 return -ENOMEM; 3430 again: 3431 key.objectid = bytenr; 3432 if (skinny) { 3433 key.type = BTRFS_METADATA_ITEM_KEY; 3434 key.offset = (u64)-1; 3435 } else { 3436 key.type = 
BTRFS_EXTENT_ITEM_KEY;
3437 key.offset = blocksize;
3438 }
3439
3440 path->search_commit_root = 1;
3441 path->skip_locking = 1;
3442 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3443 if (ret < 0)
3444 goto out;
3445
3446 if (ret > 0 && skinny) {
3447 if (path->slots[0]) {
3448 path->slots[0]--;
3449 btrfs_item_key_to_cpu(path->nodes[0], &key,
3450 path->slots[0]);
3451 if (key.objectid == bytenr &&
3452 (key.type == BTRFS_METADATA_ITEM_KEY ||
3453 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3454 key.offset == blocksize)))
3455 ret = 0;
3456 }
3457
3458 if (ret) {
3459 skinny = false;
3460 btrfs_release_path(path);
3461 goto again;
3462 }
3463 }
3464 if (ret) {
3465 ASSERT(ret == 1);
3466 btrfs_print_leaf(path->nodes[0]);
3467 btrfs_err(fs_info,
3468 "tree block extent item (%llu) is not found in extent tree",
3469 bytenr);
3470 WARN_ON(1);
3471 ret = -EINVAL;
3472 goto out;
3473 }
3474
3475 ret = add_tree_block(rc, &key, path, blocks);
3476 out:
3477 btrfs_free_path(path);
3478 return ret;
3479 }
3480
3481 /*
3482 * helper to check if the block uses full backrefs for the pointers in it
3483 */
3484 static int block_use_full_backref(struct reloc_control *rc,
3485 struct extent_buffer *eb)
3486 {
3487 u64 flags;
3488 int ret;
3489
3490 if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3491 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3492 return 1;
3493
3494 ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3495 eb->start, btrfs_header_level(eb), 1,
3496 NULL, &flags);
3497 BUG_ON(ret);
3498
3499 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3500 ret = 1;
3501 else
3502 ret = 0;
3503 return ret;
3504 }
3505
3506 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3507 struct btrfs_block_group_cache *block_group,
3508 struct inode *inode,
3509 u64 ino)
3510 {
3511 struct btrfs_key key;
3512 struct btrfs_root *root = fs_info->tree_root;
3513 struct btrfs_trans_handle *trans;
3514 int ret = 0;
3515
3516 if (inode)
3517 goto truncate;
3518
3519 key.objectid = ino;
3520 key.type = BTRFS_INODE_ITEM_KEY;
3521 key.offset = 0;
3522
3523 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3524 if (IS_ERR(inode))
3525 return -ENOENT;
3526
3527 truncate:
3528 ret = btrfs_check_trunc_cache_free_space(fs_info,
3529 &fs_info->global_block_rsv);
3530 if (ret)
3531 goto out;
3532
3533 trans = btrfs_join_transaction(root);
3534 if (IS_ERR(trans)) {
3535 ret = PTR_ERR(trans);
3536 goto out;
3537 }
3538
3539 ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3540
3541 btrfs_end_transaction(trans);
3542 btrfs_btree_balance_dirty(fs_info);
3543 out:
3544 iput(inode);
3545 return ret;
3546 }
3547
3548 /*
3549 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3550 * this function scans the fs tree to find blocks that reference the data extent
3551 */
3552 static int find_data_references(struct reloc_control *rc,
3553 struct btrfs_key *extent_key,
3554 struct extent_buffer *leaf,
3555 struct btrfs_extent_data_ref *ref,
3556 struct rb_root *blocks)
3557 {
3558 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3559 struct btrfs_path *path;
3560 struct tree_block *block;
3561 struct btrfs_root *root;
3562 struct btrfs_file_extent_item *fi;
3563 struct rb_node *rb_node;
3564 struct btrfs_key key;
3565 u64 ref_root;
3566 u64 ref_objectid;
3567 u64 ref_offset;
3568 u32 ref_count;
3569 u32 nritems;
3570 int err = 0;
3571 int added = 0;
3572 int counted;
3573 int ret;
3574
3575 ref_root = btrfs_extent_data_ref_root(leaf, ref);
3576
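/*
 * Decode the rest of the data ref, then walk the owning fs tree
 * from (ref_objectid, EXTENT_DATA, ref_offset), queueing each leaf
 * that still points at the extent until ref_count references are
 * accounted for.
 */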
ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3577 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3578 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3579 3580 /* 3581 * This is an extent belonging to the free space cache, lets just delete 3582 * it and redo the search. 3583 */ 3584 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3585 ret = delete_block_group_cache(fs_info, rc->block_group, 3586 NULL, ref_objectid); 3587 if (ret != -ENOENT) 3588 return ret; 3589 ret = 0; 3590 } 3591 3592 path = btrfs_alloc_path(); 3593 if (!path) 3594 return -ENOMEM; 3595 path->reada = READA_FORWARD; 3596 3597 root = read_fs_root(fs_info, ref_root); 3598 if (IS_ERR(root)) { 3599 err = PTR_ERR(root); 3600 goto out; 3601 } 3602 3603 key.objectid = ref_objectid; 3604 key.type = BTRFS_EXTENT_DATA_KEY; 3605 if (ref_offset > ((u64)-1 << 32)) 3606 key.offset = 0; 3607 else 3608 key.offset = ref_offset; 3609 3610 path->search_commit_root = 1; 3611 path->skip_locking = 1; 3612 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3613 if (ret < 0) { 3614 err = ret; 3615 goto out; 3616 } 3617 3618 leaf = path->nodes[0]; 3619 nritems = btrfs_header_nritems(leaf); 3620 /* 3621 * the references in tree blocks that use full backrefs 3622 * are not counted in 3623 */ 3624 if (block_use_full_backref(rc, leaf)) 3625 counted = 0; 3626 else 3627 counted = 1; 3628 rb_node = tree_search(blocks, leaf->start); 3629 if (rb_node) { 3630 if (counted) 3631 added = 1; 3632 else 3633 path->slots[0] = nritems; 3634 } 3635 3636 while (ref_count > 0) { 3637 while (path->slots[0] >= nritems) { 3638 ret = btrfs_next_leaf(root, path); 3639 if (ret < 0) { 3640 err = ret; 3641 goto out; 3642 } 3643 if (WARN_ON(ret > 0)) 3644 goto out; 3645 3646 leaf = path->nodes[0]; 3647 nritems = btrfs_header_nritems(leaf); 3648 added = 0; 3649 3650 if (block_use_full_backref(rc, leaf)) 3651 counted = 0; 3652 else 3653 counted = 1; 3654 rb_node = tree_search(blocks, leaf->start); 3655 if (rb_node) { 3656 if (counted) 3657 added = 1; 3658 else 3659 path->slots[0] = nritems; 3660 } 3661 } 3662 3663 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3664 if (WARN_ON(key.objectid != ref_objectid || 3665 key.type != BTRFS_EXTENT_DATA_KEY)) 3666 break; 3667 3668 fi = btrfs_item_ptr(leaf, path->slots[0], 3669 struct btrfs_file_extent_item); 3670 3671 if (btrfs_file_extent_type(leaf, fi) == 3672 BTRFS_FILE_EXTENT_INLINE) 3673 goto next; 3674 3675 if (btrfs_file_extent_disk_bytenr(leaf, fi) != 3676 extent_key->objectid) 3677 goto next; 3678 3679 key.offset -= btrfs_file_extent_offset(leaf, fi); 3680 if (key.offset != ref_offset) 3681 goto next; 3682 3683 if (counted) 3684 ref_count--; 3685 if (added) 3686 goto next; 3687 3688 if (!tree_block_processed(leaf->start, rc)) { 3689 block = kmalloc(sizeof(*block), GFP_NOFS); 3690 if (!block) { 3691 err = -ENOMEM; 3692 break; 3693 } 3694 block->bytenr = leaf->start; 3695 btrfs_item_key_to_cpu(leaf, &block->key, 0); 3696 block->level = 0; 3697 block->key_ready = 1; 3698 rb_node = tree_insert(blocks, block->bytenr, 3699 &block->rb_node); 3700 if (rb_node) 3701 backref_tree_panic(rb_node, -EEXIST, 3702 block->bytenr); 3703 } 3704 if (counted) 3705 added = 1; 3706 else 3707 path->slots[0] = nritems; 3708 next: 3709 path->slots[0]++; 3710 3711 } 3712 out: 3713 btrfs_free_path(path); 3714 return err; 3715 } 3716 3717 /* 3718 * helper to find all tree blocks that reference a given data extent 3719 */ 3720 static noinline_for_stack 3721 int add_data_references(struct reloc_control *rc, 3722 struct btrfs_key 
*extent_key, 3723 struct btrfs_path *path, 3724 struct rb_root *blocks) 3725 { 3726 struct btrfs_key key; 3727 struct extent_buffer *eb; 3728 struct btrfs_extent_data_ref *dref; 3729 struct btrfs_extent_inline_ref *iref; 3730 unsigned long ptr; 3731 unsigned long end; 3732 u32 blocksize = rc->extent_root->fs_info->nodesize; 3733 int ret = 0; 3734 int err = 0; 3735 3736 eb = path->nodes[0]; 3737 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3738 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3739 ptr += sizeof(struct btrfs_extent_item); 3740 3741 while (ptr < end) { 3742 iref = (struct btrfs_extent_inline_ref *)ptr; 3743 key.type = btrfs_get_extent_inline_ref_type(eb, iref, 3744 BTRFS_REF_TYPE_DATA); 3745 if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 3746 key.offset = btrfs_extent_inline_ref_offset(eb, iref); 3747 ret = __add_tree_block(rc, key.offset, blocksize, 3748 blocks); 3749 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 3750 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 3751 ret = find_data_references(rc, extent_key, 3752 eb, dref, blocks); 3753 } else { 3754 ret = -EUCLEAN; 3755 btrfs_err(rc->extent_root->fs_info, 3756 "extent %llu slot %d has an invalid inline ref type", 3757 eb->start, path->slots[0]); 3758 } 3759 if (ret) { 3760 err = ret; 3761 goto out; 3762 } 3763 ptr += btrfs_extent_inline_ref_size(key.type); 3764 } 3765 WARN_ON(ptr > end); 3766 3767 while (1) { 3768 cond_resched(); 3769 eb = path->nodes[0]; 3770 if (path->slots[0] >= btrfs_header_nritems(eb)) { 3771 ret = btrfs_next_leaf(rc->extent_root, path); 3772 if (ret < 0) { 3773 err = ret; 3774 break; 3775 } 3776 if (ret > 0) 3777 break; 3778 eb = path->nodes[0]; 3779 } 3780 3781 btrfs_item_key_to_cpu(eb, &key, path->slots[0]); 3782 if (key.objectid != extent_key->objectid) 3783 break; 3784 3785 if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 3786 ret = __add_tree_block(rc, key.offset, blocksize, 3787 blocks); 3788 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 3789 dref = btrfs_item_ptr(eb, path->slots[0], 3790 struct btrfs_extent_data_ref); 3791 ret = find_data_references(rc, extent_key, 3792 eb, dref, blocks); 3793 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 3794 btrfs_print_v0_err(eb->fs_info); 3795 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3796 ret = -EINVAL; 3797 } else { 3798 ret = 0; 3799 } 3800 if (ret) { 3801 err = ret; 3802 break; 3803 } 3804 path->slots[0]++; 3805 } 3806 out: 3807 btrfs_release_path(path); 3808 if (err) 3809 free_block_list(blocks); 3810 return err; 3811 } 3812 3813 /* 3814 * helper to find next unprocessed extent 3815 */ 3816 static noinline_for_stack 3817 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3818 struct btrfs_key *extent_key) 3819 { 3820 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3821 struct btrfs_key key; 3822 struct extent_buffer *leaf; 3823 u64 start, end, last; 3824 int ret; 3825 3826 last = rc->block_group->key.objectid + rc->block_group->key.offset; 3827 while (1) { 3828 cond_resched(); 3829 if (rc->search_start >= last) { 3830 ret = 1; 3831 break; 3832 } 3833 3834 key.objectid = rc->search_start; 3835 key.type = BTRFS_EXTENT_ITEM_KEY; 3836 key.offset = 0; 3837 3838 path->search_commit_root = 1; 3839 path->skip_locking = 1; 3840 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3841 0, 0); 3842 if (ret < 0) 3843 break; 3844 next: 3845 leaf = path->nodes[0]; 3846 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3847 ret = btrfs_next_leaf(rc->extent_root, path); 3848 if (ret 
!= 0) 3849 break; 3850 leaf = path->nodes[0]; 3851 } 3852 3853 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3854 if (key.objectid >= last) { 3855 ret = 1; 3856 break; 3857 } 3858 3859 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3860 key.type != BTRFS_METADATA_ITEM_KEY) { 3861 path->slots[0]++; 3862 goto next; 3863 } 3864 3865 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3866 key.objectid + key.offset <= rc->search_start) { 3867 path->slots[0]++; 3868 goto next; 3869 } 3870 3871 if (key.type == BTRFS_METADATA_ITEM_KEY && 3872 key.objectid + fs_info->nodesize <= 3873 rc->search_start) { 3874 path->slots[0]++; 3875 goto next; 3876 } 3877 3878 ret = find_first_extent_bit(&rc->processed_blocks, 3879 key.objectid, &start, &end, 3880 EXTENT_DIRTY, NULL); 3881 3882 if (ret == 0 && start <= key.objectid) { 3883 btrfs_release_path(path); 3884 rc->search_start = end + 1; 3885 } else { 3886 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3887 rc->search_start = key.objectid + key.offset; 3888 else 3889 rc->search_start = key.objectid + 3890 fs_info->nodesize; 3891 memcpy(extent_key, &key, sizeof(key)); 3892 return 0; 3893 } 3894 } 3895 btrfs_release_path(path); 3896 return ret; 3897 } 3898 3899 static void set_reloc_control(struct reloc_control *rc) 3900 { 3901 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3902 3903 mutex_lock(&fs_info->reloc_mutex); 3904 fs_info->reloc_ctl = rc; 3905 mutex_unlock(&fs_info->reloc_mutex); 3906 } 3907 3908 static void unset_reloc_control(struct reloc_control *rc) 3909 { 3910 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3911 3912 mutex_lock(&fs_info->reloc_mutex); 3913 fs_info->reloc_ctl = NULL; 3914 mutex_unlock(&fs_info->reloc_mutex); 3915 } 3916 3917 static int check_extent_flags(u64 flags) 3918 { 3919 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3920 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3921 return 1; 3922 if (!(flags & BTRFS_EXTENT_FLAG_DATA) && 3923 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3924 return 1; 3925 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3926 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 3927 return 1; 3928 return 0; 3929 } 3930 3931 static noinline_for_stack 3932 int prepare_to_relocate(struct reloc_control *rc) 3933 { 3934 struct btrfs_trans_handle *trans; 3935 int ret; 3936 3937 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3938 BTRFS_BLOCK_RSV_TEMP); 3939 if (!rc->block_rsv) 3940 return -ENOMEM; 3941 3942 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3943 rc->search_start = rc->block_group->key.objectid; 3944 rc->extents_found = 0; 3945 rc->nodes_relocated = 0; 3946 rc->merging_rsv_size = 0; 3947 rc->reserved_bytes = 0; 3948 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3949 RELOCATION_RESERVED_NODES; 3950 ret = btrfs_block_rsv_refill(rc->extent_root, 3951 rc->block_rsv, rc->block_rsv->size, 3952 BTRFS_RESERVE_FLUSH_ALL); 3953 if (ret) 3954 return ret; 3955 3956 rc->create_reloc_tree = 1; 3957 set_reloc_control(rc); 3958 3959 trans = btrfs_join_transaction(rc->extent_root); 3960 if (IS_ERR(trans)) { 3961 unset_reloc_control(rc); 3962 /* 3963 * extent tree is not a ref_cow tree and has no reloc_root to 3964 * cleanup. And callers are responsible to free the above 3965 * block rsv. 
3966 */ 3967 return PTR_ERR(trans); 3968 } 3969 btrfs_commit_transaction(trans); 3970 return 0; 3971 } 3972 3973 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3974 { 3975 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3976 struct rb_root blocks = RB_ROOT; 3977 struct btrfs_key key; 3978 struct btrfs_trans_handle *trans = NULL; 3979 struct btrfs_path *path; 3980 struct btrfs_extent_item *ei; 3981 u64 flags; 3982 u32 item_size; 3983 int ret; 3984 int err = 0; 3985 int progress = 0; 3986 3987 path = btrfs_alloc_path(); 3988 if (!path) 3989 return -ENOMEM; 3990 path->reada = READA_FORWARD; 3991 3992 ret = prepare_to_relocate(rc); 3993 if (ret) { 3994 err = ret; 3995 goto out_free; 3996 } 3997 3998 while (1) { 3999 rc->reserved_bytes = 0; 4000 ret = btrfs_block_rsv_refill(rc->extent_root, 4001 rc->block_rsv, rc->block_rsv->size, 4002 BTRFS_RESERVE_FLUSH_ALL); 4003 if (ret) { 4004 err = ret; 4005 break; 4006 } 4007 progress++; 4008 trans = btrfs_start_transaction(rc->extent_root, 0); 4009 if (IS_ERR(trans)) { 4010 err = PTR_ERR(trans); 4011 trans = NULL; 4012 break; 4013 } 4014 restart: 4015 if (update_backref_cache(trans, &rc->backref_cache)) { 4016 btrfs_end_transaction(trans); 4017 trans = NULL; 4018 continue; 4019 } 4020 4021 ret = find_next_extent(rc, path, &key); 4022 if (ret < 0) 4023 err = ret; 4024 if (ret != 0) 4025 break; 4026 4027 rc->extents_found++; 4028 4029 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 4030 struct btrfs_extent_item); 4031 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 4032 if (item_size >= sizeof(*ei)) { 4033 flags = btrfs_extent_flags(path->nodes[0], ei); 4034 ret = check_extent_flags(flags); 4035 BUG_ON(ret); 4036 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 4037 err = -EINVAL; 4038 btrfs_print_v0_err(trans->fs_info); 4039 btrfs_abort_transaction(trans, err); 4040 break; 4041 } else { 4042 BUG(); 4043 } 4044 4045 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 4046 ret = add_tree_block(rc, &key, path, &blocks); 4047 } else if (rc->stage == UPDATE_DATA_PTRS && 4048 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4049 ret = add_data_references(rc, &key, path, &blocks); 4050 } else { 4051 btrfs_release_path(path); 4052 ret = 0; 4053 } 4054 if (ret < 0) { 4055 err = ret; 4056 break; 4057 } 4058 4059 if (!RB_EMPTY_ROOT(&blocks)) { 4060 ret = relocate_tree_blocks(trans, rc, &blocks); 4061 if (ret < 0) { 4062 /* 4063 * if we fail to relocate tree blocks, force to update 4064 * backref cache when committing transaction. 
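 *
 * Setting last_trans to transid - 1 makes the next
 * update_backref_cache() call see the cache as stale and clean it
 * up before relocation continues.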
4065 */ 4066 rc->backref_cache.last_trans = trans->transid - 1; 4067 4068 if (ret != -EAGAIN) { 4069 err = ret; 4070 break; 4071 } 4072 rc->extents_found--; 4073 rc->search_start = key.objectid; 4074 } 4075 } 4076 4077 btrfs_end_transaction_throttle(trans); 4078 btrfs_btree_balance_dirty(fs_info); 4079 trans = NULL; 4080 4081 if (rc->stage == MOVE_DATA_EXTENTS && 4082 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4083 rc->found_file_extent = 1; 4084 ret = relocate_data_extent(rc->data_inode, 4085 &key, &rc->cluster); 4086 if (ret < 0) { 4087 err = ret; 4088 break; 4089 } 4090 } 4091 } 4092 if (trans && progress && err == -ENOSPC) { 4093 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 4094 if (ret == 1) { 4095 err = 0; 4096 progress = 0; 4097 goto restart; 4098 } 4099 } 4100 4101 btrfs_release_path(path); 4102 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 4103 4104 if (trans) { 4105 btrfs_end_transaction_throttle(trans); 4106 btrfs_btree_balance_dirty(fs_info); 4107 } 4108 4109 if (!err) { 4110 ret = relocate_file_extent_cluster(rc->data_inode, 4111 &rc->cluster); 4112 if (ret < 0) 4113 err = ret; 4114 } 4115 4116 rc->create_reloc_tree = 0; 4117 set_reloc_control(rc); 4118 4119 backref_cache_cleanup(&rc->backref_cache); 4120 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); 4121 4122 err = prepare_to_merge(rc, err); 4123 4124 merge_reloc_roots(rc); 4125 4126 rc->merge_reloc_tree = 0; 4127 unset_reloc_control(rc); 4128 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); 4129 4130 /* get rid of pinned extents */ 4131 trans = btrfs_join_transaction(rc->extent_root); 4132 if (IS_ERR(trans)) { 4133 err = PTR_ERR(trans); 4134 goto out_free; 4135 } 4136 btrfs_commit_transaction(trans); 4137 ret = clean_dirty_subvols(rc); 4138 if (ret < 0 && !err) 4139 err = ret; 4140 out_free: 4141 btrfs_free_block_rsv(fs_info, rc->block_rsv); 4142 btrfs_free_path(path); 4143 return err; 4144 } 4145 4146 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 4147 struct btrfs_root *root, u64 objectid) 4148 { 4149 struct btrfs_path *path; 4150 struct btrfs_inode_item *item; 4151 struct extent_buffer *leaf; 4152 int ret; 4153 4154 path = btrfs_alloc_path(); 4155 if (!path) 4156 return -ENOMEM; 4157 4158 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 4159 if (ret) 4160 goto out; 4161 4162 leaf = path->nodes[0]; 4163 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 4164 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 4165 btrfs_set_inode_generation(leaf, item, 1); 4166 btrfs_set_inode_size(leaf, item, 0); 4167 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 4168 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4169 BTRFS_INODE_PREALLOC); 4170 btrfs_mark_buffer_dirty(leaf); 4171 out: 4172 btrfs_free_path(path); 4173 return ret; 4174 } 4175 4176 /* 4177 * helper to create inode for data relocation. 
4178 * the inode is in data relocation tree and its link count is 0 4179 */ 4180 static noinline_for_stack 4181 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 4182 struct btrfs_block_group_cache *group) 4183 { 4184 struct inode *inode = NULL; 4185 struct btrfs_trans_handle *trans; 4186 struct btrfs_root *root; 4187 struct btrfs_key key; 4188 u64 objectid; 4189 int err = 0; 4190 4191 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); 4192 if (IS_ERR(root)) 4193 return ERR_CAST(root); 4194 4195 trans = btrfs_start_transaction(root, 6); 4196 if (IS_ERR(trans)) 4197 return ERR_CAST(trans); 4198 4199 err = btrfs_find_free_objectid(root, &objectid); 4200 if (err) 4201 goto out; 4202 4203 err = __insert_orphan_inode(trans, root, objectid); 4204 BUG_ON(err); 4205 4206 key.objectid = objectid; 4207 key.type = BTRFS_INODE_ITEM_KEY; 4208 key.offset = 0; 4209 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 4210 BUG_ON(IS_ERR(inode)); 4211 BTRFS_I(inode)->index_cnt = group->key.objectid; 4212 4213 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4214 out: 4215 btrfs_end_transaction(trans); 4216 btrfs_btree_balance_dirty(fs_info); 4217 if (err) { 4218 if (inode) 4219 iput(inode); 4220 inode = ERR_PTR(err); 4221 } 4222 return inode; 4223 } 4224 4225 static struct reloc_control *alloc_reloc_control(void) 4226 { 4227 struct reloc_control *rc; 4228 4229 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4230 if (!rc) 4231 return NULL; 4232 4233 INIT_LIST_HEAD(&rc->reloc_roots); 4234 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 4235 backref_cache_init(&rc->backref_cache); 4236 mapping_tree_init(&rc->reloc_root_tree); 4237 extent_io_tree_init(&rc->processed_blocks, NULL); 4238 return rc; 4239 } 4240 4241 /* 4242 * Print the block group being relocated 4243 */ 4244 static void describe_relocation(struct btrfs_fs_info *fs_info, 4245 struct btrfs_block_group_cache *block_group) 4246 { 4247 char buf[128] = {'\0'}; 4248 4249 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 4250 4251 btrfs_info(fs_info, 4252 "relocating block group %llu flags %s", 4253 block_group->key.objectid, buf); 4254 } 4255 4256 /* 4257 * function to relocate all extents in a block group. 
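 *
 * This runs in stages: MOVE_DATA_EXTENTS copies the data (and
 * relocates tree blocks) out of the block group, then
 * UPDATE_DATA_PTRS rewrites the file extent items to point at the
 * new copies.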
4258 */ 4259 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 4260 { 4261 struct btrfs_block_group_cache *bg; 4262 struct btrfs_root *extent_root = fs_info->extent_root; 4263 struct reloc_control *rc; 4264 struct inode *inode; 4265 struct btrfs_path *path; 4266 int ret; 4267 int rw = 0; 4268 int err = 0; 4269 4270 bg = btrfs_lookup_block_group(fs_info, group_start); 4271 if (!bg) 4272 return -ENOENT; 4273 4274 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 4275 btrfs_put_block_group(bg); 4276 return -ETXTBSY; 4277 } 4278 4279 rc = alloc_reloc_control(); 4280 if (!rc) { 4281 btrfs_put_block_group(bg); 4282 return -ENOMEM; 4283 } 4284 4285 rc->extent_root = extent_root; 4286 rc->block_group = bg; 4287 4288 ret = btrfs_inc_block_group_ro(rc->block_group); 4289 if (ret) { 4290 err = ret; 4291 goto out; 4292 } 4293 rw = 1; 4294 4295 path = btrfs_alloc_path(); 4296 if (!path) { 4297 err = -ENOMEM; 4298 goto out; 4299 } 4300 4301 inode = lookup_free_space_inode(fs_info, rc->block_group, path); 4302 btrfs_free_path(path); 4303 4304 if (!IS_ERR(inode)) 4305 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4306 else 4307 ret = PTR_ERR(inode); 4308 4309 if (ret && ret != -ENOENT) { 4310 err = ret; 4311 goto out; 4312 } 4313 4314 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4315 if (IS_ERR(rc->data_inode)) { 4316 err = PTR_ERR(rc->data_inode); 4317 rc->data_inode = NULL; 4318 goto out; 4319 } 4320 4321 describe_relocation(fs_info, rc->block_group); 4322 4323 btrfs_wait_block_group_reservations(rc->block_group); 4324 btrfs_wait_nocow_writers(rc->block_group); 4325 btrfs_wait_ordered_roots(fs_info, U64_MAX, 4326 rc->block_group->key.objectid, 4327 rc->block_group->key.offset); 4328 4329 while (1) { 4330 mutex_lock(&fs_info->cleaner_mutex); 4331 ret = relocate_block_group(rc); 4332 mutex_unlock(&fs_info->cleaner_mutex); 4333 if (ret < 0) { 4334 err = ret; 4335 goto out; 4336 } 4337 4338 if (rc->extents_found == 0) 4339 break; 4340 4341 btrfs_info(fs_info, "found %llu extents", rc->extents_found); 4342 4343 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 4344 ret = btrfs_wait_ordered_range(rc->data_inode, 0, 4345 (u64)-1); 4346 if (ret) { 4347 err = ret; 4348 goto out; 4349 } 4350 invalidate_mapping_pages(rc->data_inode->i_mapping, 4351 0, -1); 4352 rc->stage = UPDATE_DATA_PTRS; 4353 } 4354 } 4355 4356 WARN_ON(rc->block_group->pinned > 0); 4357 WARN_ON(rc->block_group->reserved > 0); 4358 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 4359 out: 4360 if (err && rw) 4361 btrfs_dec_block_group_ro(rc->block_group); 4362 iput(rc->data_inode); 4363 btrfs_put_block_group(rc->block_group); 4364 kfree(rc); 4365 return err; 4366 } 4367 4368 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) 4369 { 4370 struct btrfs_fs_info *fs_info = root->fs_info; 4371 struct btrfs_trans_handle *trans; 4372 int ret, err; 4373 4374 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4375 if (IS_ERR(trans)) 4376 return PTR_ERR(trans); 4377 4378 memset(&root->root_item.drop_progress, 0, 4379 sizeof(root->root_item.drop_progress)); 4380 root->root_item.drop_level = 0; 4381 btrfs_set_root_refs(&root->root_item, 0); 4382 ret = btrfs_update_root(trans, fs_info->tree_root, 4383 &root->root_key, &root->root_item); 4384 4385 err = btrfs_end_transaction(trans); 4386 if (err) 4387 return err; 4388 return ret; 4389 } 4390 4391 /* 4392 * recover relocation interrupted by system crash. 

/*
 * recover relocation interrupted by system crash.
 *
 * this function resumes merging reloc trees with corresponding fs trees,
 * which is important for keeping the sharing of tree blocks
 */
int btrfs_recover_relocation(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_fs_root(root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = read_fs_root(fs_info,
					       reloc_root->root_key.offset);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control();
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	rc->extent_root = fs_info->extent_root;

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		unset_reloc_control(rc);
		err = PTR_ERR(trans);
		goto out_free;
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			/*
			 * put the reloc root back on the local list so
			 * free_reloc_roots() can release it on the way out,
			 * otherwise it would be leaked here
			 */
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			goto out_free;
		}

		err = __add_reloc_root(reloc_root);
		BUG_ON(err < 0); /* -ENOMEM or logic error */
		fs_root->reloc_root = reloc_root;
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_free;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}
	err = btrfs_commit_transaction(trans);

	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_free:
	kfree(rc);
out:
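	/*
	 * anything still on the local list was never handed over to the
	 * reloc control, so it has to be released here; this only happens
	 * on the error paths above, on success the list is empty
	 */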
	if (!list_empty(&reloc_roots))
		free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* cleanup orphan inode in data relocation tree */
		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
		if (IS_ERR(fs_root))
			err = PTR_ERR(fs_root);
		else
			err = btrfs_orphan_cleanup(fs_root);
	}
	return err;
}

/*
 * helper to add ordered checksums for data relocation.
 *
 * cloning the existing checksums properly handles nodatasum extents
 * and also saves the CPU time that would be spent recalculating them.
 */
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_sum *sums;
	struct btrfs_ordered_extent *ordered;
	int ret;
	u64 disk_bytenr;
	u64 new_bytenr;
	LIST_HEAD(list);

	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);

	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
				       disk_bytenr + len - 1, &list, 0);
	if (ret)
		goto out;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
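		/*
		 * Illustrative numbers only: if the old extent starts at
		 * disk_bytenr 1M and this sum describes the bytes at
		 * 1M + 64K, the csum keeps the same 64K offset inside the
		 * relocated extent, i.e. new_bytenr = ordered->start + 64K,
		 * which is exactly what the calculation below does.
		 */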
		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
		sums->bytenr = new_bytenr;

		btrfs_add_ordered_sum(inode, ordered, sums);
	}
out:
	btrfs_put_ordered_extent(ordered);
	return ret;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (buf == root->node)
			__update_reloc_root(root, cow->start);
	}

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		drop_node_buffer(node);
		extent_buffer_get(cow);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			__mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * called before creating snapshot. it calculates metadata reservation
 * required for relocating tree blocks in the snapshot
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root;
	struct reloc_control *rc;

	root = pending->root;
	if (!root->reloc_root)
		return;

	rc = root->fs_info->reloc_ctl;
	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * relocation is in the stage of merging trees. the space
	 * used by merging a reloc tree is twice the size of
	 * relocated tree nodes in the worst case. half for cowing
	 * the reloc tree, half for cowing the fs tree. the space
	 * used by cowing the reloc tree will be freed after the
	 * tree is dropped. if we create snapshot, cowing the fs
	 * tree may use more space than it frees. so we need to
	 * reserve extra space.
	 */
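	/*
	 * e.g. (illustrative numbers): with nodes_relocated == 4M the
	 * merge may dirty up to 8M, 4M of reloc tree COWs plus 4M of fs
	 * tree COWs. the reloc tree half comes back when that tree is
	 * dropped, but once the fs tree is snapshotted its blocks are
	 * shared and cowing them frees nothing, hence the extra
	 * nodes_relocated bytes reserved below.
	 */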
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * called after snapshot is created. migrate block reservation
 * and create reloc root for the newly created snapshot
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc;
	int ret;

	if (!root->reloc_root)
		return 0;

	rc = root->fs_info->reloc_ctl;
	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root,
				       new_root->root_key.objectid);
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	new_root->reloc_root = reloc_root;

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}