1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2009 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include "ctree.h" 13 #include "disk-io.h" 14 #include "transaction.h" 15 #include "volumes.h" 16 #include "locking.h" 17 #include "btrfs_inode.h" 18 #include "async-thread.h" 19 #include "free-space-cache.h" 20 #include "inode-map.h" 21 #include "qgroup.h" 22 #include "print-tree.h" 23 24 /* 25 * backref_node, mapping_node and tree_block start with this 26 */ 27 struct tree_entry { 28 struct rb_node rb_node; 29 u64 bytenr; 30 }; 31 32 /* 33 * present a tree block in the backref cache 34 */ 35 struct backref_node { 36 struct rb_node rb_node; 37 u64 bytenr; 38 39 u64 new_bytenr; 40 /* objectid of tree block owner, can be not uptodate */ 41 u64 owner; 42 /* link to pending, changed or detached list */ 43 struct list_head list; 44 /* list of upper level blocks reference this block */ 45 struct list_head upper; 46 /* list of child blocks in the cache */ 47 struct list_head lower; 48 /* NULL if this node is not tree root */ 49 struct btrfs_root *root; 50 /* extent buffer got by COW the block */ 51 struct extent_buffer *eb; 52 /* level of tree block */ 53 unsigned int level:8; 54 /* is the block in non-reference counted tree */ 55 unsigned int cowonly:1; 56 /* 1 if no child node in the cache */ 57 unsigned int lowest:1; 58 /* is the extent buffer locked */ 59 unsigned int locked:1; 60 /* has the block been processed */ 61 unsigned int processed:1; 62 /* have backrefs of this block been checked */ 63 unsigned int checked:1; 64 /* 65 * 1 if corresponding block has been cowed but some upper 66 * level block pointers may not point to the new location 67 */ 68 unsigned int pending:1; 69 /* 70 * 1 if the backref node isn't connected to any other 71 * backref node. 72 */ 73 unsigned int detached:1; 74 }; 75 76 /* 77 * present a block pointer in the backref cache 78 */ 79 struct backref_edge { 80 struct list_head list[2]; 81 struct backref_node *node[2]; 82 }; 83 84 #define LOWER 0 85 #define UPPER 1 86 #define RELOCATION_RESERVED_NODES 256 87 88 struct backref_cache { 89 /* red black tree of all backref nodes in the cache */ 90 struct rb_root rb_root; 91 /* for passing backref nodes to btrfs_reloc_cow_block */ 92 struct backref_node *path[BTRFS_MAX_LEVEL]; 93 /* 94 * list of blocks that have been cowed but some block 95 * pointers in upper level blocks may not reflect the 96 * new location 97 */ 98 struct list_head pending[BTRFS_MAX_LEVEL]; 99 /* list of backref nodes with no child node */ 100 struct list_head leaves; 101 /* list of blocks that have been cowed in current transaction */ 102 struct list_head changed; 103 /* list of detached backref node. */ 104 struct list_head detached; 105 106 u64 last_trans; 107 108 int nr_nodes; 109 int nr_edges; 110 }; 111 112 /* 113 * map address of tree root to tree 114 */ 115 struct mapping_node { 116 struct rb_node rb_node; 117 u64 bytenr; 118 void *data; 119 }; 120 121 struct mapping_tree { 122 struct rb_root rb_root; 123 spinlock_t lock; 124 }; 125 126 /* 127 * present a tree block to process 128 */ 129 struct tree_block { 130 struct rb_node rb_node; 131 u64 bytenr; 132 struct btrfs_key key; 133 unsigned int level:8; 134 unsigned int key_ready:1; 135 }; 136 137 #define MAX_EXTENTS 128 138 139 struct file_extent_cluster { 140 u64 start; 141 u64 end; 142 u64 boundary[MAX_EXTENTS]; 143 unsigned int nr; 144 }; 145 146 struct reloc_control { 147 /* block group to relocate */ 148 struct btrfs_block_group_cache *block_group; 149 /* extent tree */ 150 struct btrfs_root *extent_root; 151 /* inode for moving data */ 152 struct inode *data_inode; 153 154 struct btrfs_block_rsv *block_rsv; 155 156 struct backref_cache backref_cache; 157 158 struct file_extent_cluster cluster; 159 /* tree blocks have been processed */ 160 struct extent_io_tree processed_blocks; 161 /* map start of tree root to corresponding reloc tree */ 162 struct mapping_tree reloc_root_tree; 163 /* list of reloc trees */ 164 struct list_head reloc_roots; 165 /* list of subvolume trees that get relocated */ 166 struct list_head dirty_subvol_roots; 167 /* size of metadata reservation for merging reloc trees */ 168 u64 merging_rsv_size; 169 /* size of relocated tree nodes */ 170 u64 nodes_relocated; 171 /* reserved size for block group relocation*/ 172 u64 reserved_bytes; 173 174 u64 search_start; 175 u64 extents_found; 176 177 unsigned int stage:8; 178 unsigned int create_reloc_tree:1; 179 unsigned int merge_reloc_tree:1; 180 unsigned int found_file_extent:1; 181 }; 182 183 /* stages of data relocation */ 184 #define MOVE_DATA_EXTENTS 0 185 #define UPDATE_DATA_PTRS 1 186 187 static void remove_backref_node(struct backref_cache *cache, 188 struct backref_node *node); 189 static void __mark_block_processed(struct reloc_control *rc, 190 struct backref_node *node); 191 192 static void mapping_tree_init(struct mapping_tree *tree) 193 { 194 tree->rb_root = RB_ROOT; 195 spin_lock_init(&tree->lock); 196 } 197 198 static void backref_cache_init(struct backref_cache *cache) 199 { 200 int i; 201 cache->rb_root = RB_ROOT; 202 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 203 INIT_LIST_HEAD(&cache->pending[i]); 204 INIT_LIST_HEAD(&cache->changed); 205 INIT_LIST_HEAD(&cache->detached); 206 INIT_LIST_HEAD(&cache->leaves); 207 } 208 209 static void backref_cache_cleanup(struct backref_cache *cache) 210 { 211 struct backref_node *node; 212 int i; 213 214 while (!list_empty(&cache->detached)) { 215 node = list_entry(cache->detached.next, 216 struct backref_node, list); 217 remove_backref_node(cache, node); 218 } 219 220 while (!list_empty(&cache->leaves)) { 221 node = list_entry(cache->leaves.next, 222 struct backref_node, lower); 223 remove_backref_node(cache, node); 224 } 225 226 cache->last_trans = 0; 227 228 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 229 ASSERT(list_empty(&cache->pending[i])); 230 ASSERT(list_empty(&cache->changed)); 231 ASSERT(list_empty(&cache->detached)); 232 ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); 233 ASSERT(!cache->nr_nodes); 234 ASSERT(!cache->nr_edges); 235 } 236 237 static struct backref_node *alloc_backref_node(struct backref_cache *cache) 238 { 239 struct backref_node *node; 240 241 node = kzalloc(sizeof(*node), GFP_NOFS); 242 if (node) { 243 INIT_LIST_HEAD(&node->list); 244 INIT_LIST_HEAD(&node->upper); 245 INIT_LIST_HEAD(&node->lower); 246 RB_CLEAR_NODE(&node->rb_node); 247 cache->nr_nodes++; 248 } 249 return node; 250 } 251 252 static void free_backref_node(struct backref_cache *cache, 253 struct backref_node *node) 254 { 255 if (node) { 256 cache->nr_nodes--; 257 kfree(node); 258 } 259 } 260 261 static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) 262 { 263 struct backref_edge *edge; 264 265 edge = kzalloc(sizeof(*edge), GFP_NOFS); 266 if (edge) 267 cache->nr_edges++; 268 return edge; 269 } 270 271 static void free_backref_edge(struct backref_cache *cache, 272 struct backref_edge *edge) 273 { 274 if (edge) { 275 cache->nr_edges--; 276 kfree(edge); 277 } 278 } 279 280 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 281 struct rb_node *node) 282 { 283 struct rb_node **p = &root->rb_node; 284 struct rb_node *parent = NULL; 285 struct tree_entry *entry; 286 287 while (*p) { 288 parent = *p; 289 entry = rb_entry(parent, struct tree_entry, rb_node); 290 291 if (bytenr < entry->bytenr) 292 p = &(*p)->rb_left; 293 else if (bytenr > entry->bytenr) 294 p = &(*p)->rb_right; 295 else 296 return parent; 297 } 298 299 rb_link_node(node, parent, p); 300 rb_insert_color(node, root); 301 return NULL; 302 } 303 304 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) 305 { 306 struct rb_node *n = root->rb_node; 307 struct tree_entry *entry; 308 309 while (n) { 310 entry = rb_entry(n, struct tree_entry, rb_node); 311 312 if (bytenr < entry->bytenr) 313 n = n->rb_left; 314 else if (bytenr > entry->bytenr) 315 n = n->rb_right; 316 else 317 return n; 318 } 319 return NULL; 320 } 321 322 static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) 323 { 324 325 struct btrfs_fs_info *fs_info = NULL; 326 struct backref_node *bnode = rb_entry(rb_node, struct backref_node, 327 rb_node); 328 if (bnode->root) 329 fs_info = bnode->root->fs_info; 330 btrfs_panic(fs_info, errno, 331 "Inconsistency in backref cache found at offset %llu", 332 bytenr); 333 } 334 335 /* 336 * walk up backref nodes until reach node presents tree root 337 */ 338 static struct backref_node *walk_up_backref(struct backref_node *node, 339 struct backref_edge *edges[], 340 int *index) 341 { 342 struct backref_edge *edge; 343 int idx = *index; 344 345 while (!list_empty(&node->upper)) { 346 edge = list_entry(node->upper.next, 347 struct backref_edge, list[LOWER]); 348 edges[idx++] = edge; 349 node = edge->node[UPPER]; 350 } 351 BUG_ON(node->detached); 352 *index = idx; 353 return node; 354 } 355 356 /* 357 * walk down backref nodes to find start of next reference path 358 */ 359 static struct backref_node *walk_down_backref(struct backref_edge *edges[], 360 int *index) 361 { 362 struct backref_edge *edge; 363 struct backref_node *lower; 364 int idx = *index; 365 366 while (idx > 0) { 367 edge = edges[idx - 1]; 368 lower = edge->node[LOWER]; 369 if (list_is_last(&edge->list[LOWER], &lower->upper)) { 370 idx--; 371 continue; 372 } 373 edge = list_entry(edge->list[LOWER].next, 374 struct backref_edge, list[LOWER]); 375 edges[idx - 1] = edge; 376 *index = idx; 377 return edge->node[UPPER]; 378 } 379 *index = 0; 380 return NULL; 381 } 382 383 static void unlock_node_buffer(struct backref_node *node) 384 { 385 if (node->locked) { 386 btrfs_tree_unlock(node->eb); 387 node->locked = 0; 388 } 389 } 390 391 static void drop_node_buffer(struct backref_node *node) 392 { 393 if (node->eb) { 394 unlock_node_buffer(node); 395 free_extent_buffer(node->eb); 396 node->eb = NULL; 397 } 398 } 399 400 static void drop_backref_node(struct backref_cache *tree, 401 struct backref_node *node) 402 { 403 BUG_ON(!list_empty(&node->upper)); 404 405 drop_node_buffer(node); 406 list_del(&node->list); 407 list_del(&node->lower); 408 if (!RB_EMPTY_NODE(&node->rb_node)) 409 rb_erase(&node->rb_node, &tree->rb_root); 410 free_backref_node(tree, node); 411 } 412 413 /* 414 * remove a backref node from the backref cache 415 */ 416 static void remove_backref_node(struct backref_cache *cache, 417 struct backref_node *node) 418 { 419 struct backref_node *upper; 420 struct backref_edge *edge; 421 422 if (!node) 423 return; 424 425 BUG_ON(!node->lowest && !node->detached); 426 while (!list_empty(&node->upper)) { 427 edge = list_entry(node->upper.next, struct backref_edge, 428 list[LOWER]); 429 upper = edge->node[UPPER]; 430 list_del(&edge->list[LOWER]); 431 list_del(&edge->list[UPPER]); 432 free_backref_edge(cache, edge); 433 434 if (RB_EMPTY_NODE(&upper->rb_node)) { 435 BUG_ON(!list_empty(&node->upper)); 436 drop_backref_node(cache, node); 437 node = upper; 438 node->lowest = 1; 439 continue; 440 } 441 /* 442 * add the node to leaf node list if no other 443 * child block cached. 444 */ 445 if (list_empty(&upper->lower)) { 446 list_add_tail(&upper->lower, &cache->leaves); 447 upper->lowest = 1; 448 } 449 } 450 451 drop_backref_node(cache, node); 452 } 453 454 static void update_backref_node(struct backref_cache *cache, 455 struct backref_node *node, u64 bytenr) 456 { 457 struct rb_node *rb_node; 458 rb_erase(&node->rb_node, &cache->rb_root); 459 node->bytenr = bytenr; 460 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 461 if (rb_node) 462 backref_tree_panic(rb_node, -EEXIST, bytenr); 463 } 464 465 /* 466 * update backref cache after a transaction commit 467 */ 468 static int update_backref_cache(struct btrfs_trans_handle *trans, 469 struct backref_cache *cache) 470 { 471 struct backref_node *node; 472 int level = 0; 473 474 if (cache->last_trans == 0) { 475 cache->last_trans = trans->transid; 476 return 0; 477 } 478 479 if (cache->last_trans == trans->transid) 480 return 0; 481 482 /* 483 * detached nodes are used to avoid unnecessary backref 484 * lookup. transaction commit changes the extent tree. 485 * so the detached nodes are no longer useful. 486 */ 487 while (!list_empty(&cache->detached)) { 488 node = list_entry(cache->detached.next, 489 struct backref_node, list); 490 remove_backref_node(cache, node); 491 } 492 493 while (!list_empty(&cache->changed)) { 494 node = list_entry(cache->changed.next, 495 struct backref_node, list); 496 list_del_init(&node->list); 497 BUG_ON(node->pending); 498 update_backref_node(cache, node, node->new_bytenr); 499 } 500 501 /* 502 * some nodes can be left in the pending list if there were 503 * errors during processing the pending nodes. 504 */ 505 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 506 list_for_each_entry(node, &cache->pending[level], list) { 507 BUG_ON(!node->pending); 508 if (node->bytenr == node->new_bytenr) 509 continue; 510 update_backref_node(cache, node, node->new_bytenr); 511 } 512 } 513 514 cache->last_trans = 0; 515 return 1; 516 } 517 518 519 static int should_ignore_root(struct btrfs_root *root) 520 { 521 struct btrfs_root *reloc_root; 522 523 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 524 return 0; 525 526 reloc_root = root->reloc_root; 527 if (!reloc_root) 528 return 0; 529 530 if (btrfs_root_last_snapshot(&reloc_root->root_item) == 531 root->fs_info->running_transaction->transid - 1) 532 return 0; 533 /* 534 * if there is reloc tree and it was created in previous 535 * transaction backref lookup can find the reloc tree, 536 * so backref node for the fs tree root is useless for 537 * relocation. 538 */ 539 return 1; 540 } 541 /* 542 * find reloc tree by address of tree root 543 */ 544 static struct btrfs_root *find_reloc_root(struct reloc_control *rc, 545 u64 bytenr) 546 { 547 struct rb_node *rb_node; 548 struct mapping_node *node; 549 struct btrfs_root *root = NULL; 550 551 spin_lock(&rc->reloc_root_tree.lock); 552 rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); 553 if (rb_node) { 554 node = rb_entry(rb_node, struct mapping_node, rb_node); 555 root = (struct btrfs_root *)node->data; 556 } 557 spin_unlock(&rc->reloc_root_tree.lock); 558 return root; 559 } 560 561 static int is_cowonly_root(u64 root_objectid) 562 { 563 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || 564 root_objectid == BTRFS_EXTENT_TREE_OBJECTID || 565 root_objectid == BTRFS_CHUNK_TREE_OBJECTID || 566 root_objectid == BTRFS_DEV_TREE_OBJECTID || 567 root_objectid == BTRFS_TREE_LOG_OBJECTID || 568 root_objectid == BTRFS_CSUM_TREE_OBJECTID || 569 root_objectid == BTRFS_UUID_TREE_OBJECTID || 570 root_objectid == BTRFS_QUOTA_TREE_OBJECTID || 571 root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) 572 return 1; 573 return 0; 574 } 575 576 static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, 577 u64 root_objectid) 578 { 579 struct btrfs_key key; 580 581 key.objectid = root_objectid; 582 key.type = BTRFS_ROOT_ITEM_KEY; 583 if (is_cowonly_root(root_objectid)) 584 key.offset = 0; 585 else 586 key.offset = (u64)-1; 587 588 return btrfs_get_fs_root(fs_info, &key, false); 589 } 590 591 static noinline_for_stack 592 int find_inline_backref(struct extent_buffer *leaf, int slot, 593 unsigned long *ptr, unsigned long *end) 594 { 595 struct btrfs_key key; 596 struct btrfs_extent_item *ei; 597 struct btrfs_tree_block_info *bi; 598 u32 item_size; 599 600 btrfs_item_key_to_cpu(leaf, &key, slot); 601 602 item_size = btrfs_item_size_nr(leaf, slot); 603 if (item_size < sizeof(*ei)) { 604 btrfs_print_v0_err(leaf->fs_info); 605 btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL); 606 return 1; 607 } 608 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 609 WARN_ON(!(btrfs_extent_flags(leaf, ei) & 610 BTRFS_EXTENT_FLAG_TREE_BLOCK)); 611 612 if (key.type == BTRFS_EXTENT_ITEM_KEY && 613 item_size <= sizeof(*ei) + sizeof(*bi)) { 614 WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); 615 return 1; 616 } 617 if (key.type == BTRFS_METADATA_ITEM_KEY && 618 item_size <= sizeof(*ei)) { 619 WARN_ON(item_size < sizeof(*ei)); 620 return 1; 621 } 622 623 if (key.type == BTRFS_EXTENT_ITEM_KEY) { 624 bi = (struct btrfs_tree_block_info *)(ei + 1); 625 *ptr = (unsigned long)(bi + 1); 626 } else { 627 *ptr = (unsigned long)(ei + 1); 628 } 629 *end = (unsigned long)ei + item_size; 630 return 0; 631 } 632 633 /* 634 * build backref tree for a given tree block. root of the backref tree 635 * corresponds the tree block, leaves of the backref tree correspond 636 * roots of b-trees that reference the tree block. 637 * 638 * the basic idea of this function is check backrefs of a given block 639 * to find upper level blocks that reference the block, and then check 640 * backrefs of these upper level blocks recursively. the recursion stop 641 * when tree root is reached or backrefs for the block is cached. 642 * 643 * NOTE: if we find backrefs for a block are cached, we know backrefs 644 * for all upper level blocks that directly/indirectly reference the 645 * block are also cached. 646 */ 647 static noinline_for_stack 648 struct backref_node *build_backref_tree(struct reloc_control *rc, 649 struct btrfs_key *node_key, 650 int level, u64 bytenr) 651 { 652 struct backref_cache *cache = &rc->backref_cache; 653 struct btrfs_path *path1; /* For searching extent root */ 654 struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */ 655 struct extent_buffer *eb; 656 struct btrfs_root *root; 657 struct backref_node *cur; 658 struct backref_node *upper; 659 struct backref_node *lower; 660 struct backref_node *node = NULL; 661 struct backref_node *exist = NULL; 662 struct backref_edge *edge; 663 struct rb_node *rb_node; 664 struct btrfs_key key; 665 unsigned long end; 666 unsigned long ptr; 667 LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */ 668 LIST_HEAD(useless); 669 int cowonly; 670 int ret; 671 int err = 0; 672 bool need_check = true; 673 674 path1 = btrfs_alloc_path(); 675 path2 = btrfs_alloc_path(); 676 if (!path1 || !path2) { 677 err = -ENOMEM; 678 goto out; 679 } 680 path1->reada = READA_FORWARD; 681 path2->reada = READA_FORWARD; 682 683 node = alloc_backref_node(cache); 684 if (!node) { 685 err = -ENOMEM; 686 goto out; 687 } 688 689 node->bytenr = bytenr; 690 node->level = level; 691 node->lowest = 1; 692 cur = node; 693 again: 694 end = 0; 695 ptr = 0; 696 key.objectid = cur->bytenr; 697 key.type = BTRFS_METADATA_ITEM_KEY; 698 key.offset = (u64)-1; 699 700 path1->search_commit_root = 1; 701 path1->skip_locking = 1; 702 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, 703 0, 0); 704 if (ret < 0) { 705 err = ret; 706 goto out; 707 } 708 ASSERT(ret); 709 ASSERT(path1->slots[0]); 710 711 path1->slots[0]--; 712 713 WARN_ON(cur->checked); 714 if (!list_empty(&cur->upper)) { 715 /* 716 * the backref was added previously when processing 717 * backref of type BTRFS_TREE_BLOCK_REF_KEY 718 */ 719 ASSERT(list_is_singular(&cur->upper)); 720 edge = list_entry(cur->upper.next, struct backref_edge, 721 list[LOWER]); 722 ASSERT(list_empty(&edge->list[UPPER])); 723 exist = edge->node[UPPER]; 724 /* 725 * add the upper level block to pending list if we need 726 * check its backrefs 727 */ 728 if (!exist->checked) 729 list_add_tail(&edge->list[UPPER], &list); 730 } else { 731 exist = NULL; 732 } 733 734 while (1) { 735 cond_resched(); 736 eb = path1->nodes[0]; 737 738 if (ptr >= end) { 739 if (path1->slots[0] >= btrfs_header_nritems(eb)) { 740 ret = btrfs_next_leaf(rc->extent_root, path1); 741 if (ret < 0) { 742 err = ret; 743 goto out; 744 } 745 if (ret > 0) 746 break; 747 eb = path1->nodes[0]; 748 } 749 750 btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); 751 if (key.objectid != cur->bytenr) { 752 WARN_ON(exist); 753 break; 754 } 755 756 if (key.type == BTRFS_EXTENT_ITEM_KEY || 757 key.type == BTRFS_METADATA_ITEM_KEY) { 758 ret = find_inline_backref(eb, path1->slots[0], 759 &ptr, &end); 760 if (ret) 761 goto next; 762 } 763 } 764 765 if (ptr < end) { 766 /* update key for inline back ref */ 767 struct btrfs_extent_inline_ref *iref; 768 int type; 769 iref = (struct btrfs_extent_inline_ref *)ptr; 770 type = btrfs_get_extent_inline_ref_type(eb, iref, 771 BTRFS_REF_TYPE_BLOCK); 772 if (type == BTRFS_REF_TYPE_INVALID) { 773 err = -EUCLEAN; 774 goto out; 775 } 776 key.type = type; 777 key.offset = btrfs_extent_inline_ref_offset(eb, iref); 778 779 WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && 780 key.type != BTRFS_SHARED_BLOCK_REF_KEY); 781 } 782 783 /* 784 * Parent node found and matches current inline ref, no need to 785 * rebuild this node for this inline ref. 786 */ 787 if (exist && 788 ((key.type == BTRFS_TREE_BLOCK_REF_KEY && 789 exist->owner == key.offset) || 790 (key.type == BTRFS_SHARED_BLOCK_REF_KEY && 791 exist->bytenr == key.offset))) { 792 exist = NULL; 793 goto next; 794 } 795 796 /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ 797 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { 798 if (key.objectid == key.offset) { 799 /* 800 * Only root blocks of reloc trees use backref 801 * pointing to itself. 802 */ 803 root = find_reloc_root(rc, cur->bytenr); 804 ASSERT(root); 805 cur->root = root; 806 break; 807 } 808 809 edge = alloc_backref_edge(cache); 810 if (!edge) { 811 err = -ENOMEM; 812 goto out; 813 } 814 rb_node = tree_search(&cache->rb_root, key.offset); 815 if (!rb_node) { 816 upper = alloc_backref_node(cache); 817 if (!upper) { 818 free_backref_edge(cache, edge); 819 err = -ENOMEM; 820 goto out; 821 } 822 upper->bytenr = key.offset; 823 upper->level = cur->level + 1; 824 /* 825 * backrefs for the upper level block isn't 826 * cached, add the block to pending list 827 */ 828 list_add_tail(&edge->list[UPPER], &list); 829 } else { 830 upper = rb_entry(rb_node, struct backref_node, 831 rb_node); 832 ASSERT(upper->checked); 833 INIT_LIST_HEAD(&edge->list[UPPER]); 834 } 835 list_add_tail(&edge->list[LOWER], &cur->upper); 836 edge->node[LOWER] = cur; 837 edge->node[UPPER] = upper; 838 839 goto next; 840 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 841 err = -EINVAL; 842 btrfs_print_v0_err(rc->extent_root->fs_info); 843 btrfs_handle_fs_error(rc->extent_root->fs_info, err, 844 NULL); 845 goto out; 846 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 847 goto next; 848 } 849 850 /* 851 * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset 852 * means the root objectid. We need to search the tree to get 853 * its parent bytenr. 854 */ 855 root = read_fs_root(rc->extent_root->fs_info, key.offset); 856 if (IS_ERR(root)) { 857 err = PTR_ERR(root); 858 goto out; 859 } 860 861 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 862 cur->cowonly = 1; 863 864 if (btrfs_root_level(&root->root_item) == cur->level) { 865 /* tree root */ 866 ASSERT(btrfs_root_bytenr(&root->root_item) == 867 cur->bytenr); 868 if (should_ignore_root(root)) 869 list_add(&cur->list, &useless); 870 else 871 cur->root = root; 872 break; 873 } 874 875 level = cur->level + 1; 876 877 /* Search the tree to find parent blocks referring the block. */ 878 path2->search_commit_root = 1; 879 path2->skip_locking = 1; 880 path2->lowest_level = level; 881 ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); 882 path2->lowest_level = 0; 883 if (ret < 0) { 884 err = ret; 885 goto out; 886 } 887 if (ret > 0 && path2->slots[level] > 0) 888 path2->slots[level]--; 889 890 eb = path2->nodes[level]; 891 if (btrfs_node_blockptr(eb, path2->slots[level]) != 892 cur->bytenr) { 893 btrfs_err(root->fs_info, 894 "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", 895 cur->bytenr, level - 1, 896 root->root_key.objectid, 897 node_key->objectid, node_key->type, 898 node_key->offset); 899 err = -ENOENT; 900 goto out; 901 } 902 lower = cur; 903 need_check = true; 904 905 /* Add all nodes and edges in the path */ 906 for (; level < BTRFS_MAX_LEVEL; level++) { 907 if (!path2->nodes[level]) { 908 ASSERT(btrfs_root_bytenr(&root->root_item) == 909 lower->bytenr); 910 if (should_ignore_root(root)) 911 list_add(&lower->list, &useless); 912 else 913 lower->root = root; 914 break; 915 } 916 917 edge = alloc_backref_edge(cache); 918 if (!edge) { 919 err = -ENOMEM; 920 goto out; 921 } 922 923 eb = path2->nodes[level]; 924 rb_node = tree_search(&cache->rb_root, eb->start); 925 if (!rb_node) { 926 upper = alloc_backref_node(cache); 927 if (!upper) { 928 free_backref_edge(cache, edge); 929 err = -ENOMEM; 930 goto out; 931 } 932 upper->bytenr = eb->start; 933 upper->owner = btrfs_header_owner(eb); 934 upper->level = lower->level + 1; 935 if (!test_bit(BTRFS_ROOT_REF_COWS, 936 &root->state)) 937 upper->cowonly = 1; 938 939 /* 940 * if we know the block isn't shared 941 * we can void checking its backrefs. 942 */ 943 if (btrfs_block_can_be_shared(root, eb)) 944 upper->checked = 0; 945 else 946 upper->checked = 1; 947 948 /* 949 * add the block to pending list if we 950 * need check its backrefs, we only do this once 951 * while walking up a tree as we will catch 952 * anything else later on. 953 */ 954 if (!upper->checked && need_check) { 955 need_check = false; 956 list_add_tail(&edge->list[UPPER], 957 &list); 958 } else { 959 if (upper->checked) 960 need_check = true; 961 INIT_LIST_HEAD(&edge->list[UPPER]); 962 } 963 } else { 964 upper = rb_entry(rb_node, struct backref_node, 965 rb_node); 966 ASSERT(upper->checked); 967 INIT_LIST_HEAD(&edge->list[UPPER]); 968 if (!upper->owner) 969 upper->owner = btrfs_header_owner(eb); 970 } 971 list_add_tail(&edge->list[LOWER], &lower->upper); 972 edge->node[LOWER] = lower; 973 edge->node[UPPER] = upper; 974 975 if (rb_node) 976 break; 977 lower = upper; 978 upper = NULL; 979 } 980 btrfs_release_path(path2); 981 next: 982 if (ptr < end) { 983 ptr += btrfs_extent_inline_ref_size(key.type); 984 if (ptr >= end) { 985 WARN_ON(ptr > end); 986 ptr = 0; 987 end = 0; 988 } 989 } 990 if (ptr >= end) 991 path1->slots[0]++; 992 } 993 btrfs_release_path(path1); 994 995 cur->checked = 1; 996 WARN_ON(exist); 997 998 /* the pending list isn't empty, take the first block to process */ 999 if (!list_empty(&list)) { 1000 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1001 list_del_init(&edge->list[UPPER]); 1002 cur = edge->node[UPPER]; 1003 goto again; 1004 } 1005 1006 /* 1007 * everything goes well, connect backref nodes and insert backref nodes 1008 * into the cache. 1009 */ 1010 ASSERT(node->checked); 1011 cowonly = node->cowonly; 1012 if (!cowonly) { 1013 rb_node = tree_insert(&cache->rb_root, node->bytenr, 1014 &node->rb_node); 1015 if (rb_node) 1016 backref_tree_panic(rb_node, -EEXIST, node->bytenr); 1017 list_add_tail(&node->lower, &cache->leaves); 1018 } 1019 1020 list_for_each_entry(edge, &node->upper, list[LOWER]) 1021 list_add_tail(&edge->list[UPPER], &list); 1022 1023 while (!list_empty(&list)) { 1024 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1025 list_del_init(&edge->list[UPPER]); 1026 upper = edge->node[UPPER]; 1027 if (upper->detached) { 1028 list_del(&edge->list[LOWER]); 1029 lower = edge->node[LOWER]; 1030 free_backref_edge(cache, edge); 1031 if (list_empty(&lower->upper)) 1032 list_add(&lower->list, &useless); 1033 continue; 1034 } 1035 1036 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1037 if (upper->lowest) { 1038 list_del_init(&upper->lower); 1039 upper->lowest = 0; 1040 } 1041 1042 list_add_tail(&edge->list[UPPER], &upper->lower); 1043 continue; 1044 } 1045 1046 if (!upper->checked) { 1047 /* 1048 * Still want to blow up for developers since this is a 1049 * logic bug. 1050 */ 1051 ASSERT(0); 1052 err = -EINVAL; 1053 goto out; 1054 } 1055 if (cowonly != upper->cowonly) { 1056 ASSERT(0); 1057 err = -EINVAL; 1058 goto out; 1059 } 1060 1061 if (!cowonly) { 1062 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1063 &upper->rb_node); 1064 if (rb_node) 1065 backref_tree_panic(rb_node, -EEXIST, 1066 upper->bytenr); 1067 } 1068 1069 list_add_tail(&edge->list[UPPER], &upper->lower); 1070 1071 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1072 list_add_tail(&edge->list[UPPER], &list); 1073 } 1074 /* 1075 * process useless backref nodes. backref nodes for tree leaves 1076 * are deleted from the cache. backref nodes for upper level 1077 * tree blocks are left in the cache to avoid unnecessary backref 1078 * lookup. 1079 */ 1080 while (!list_empty(&useless)) { 1081 upper = list_entry(useless.next, struct backref_node, list); 1082 list_del_init(&upper->list); 1083 ASSERT(list_empty(&upper->upper)); 1084 if (upper == node) 1085 node = NULL; 1086 if (upper->lowest) { 1087 list_del_init(&upper->lower); 1088 upper->lowest = 0; 1089 } 1090 while (!list_empty(&upper->lower)) { 1091 edge = list_entry(upper->lower.next, 1092 struct backref_edge, list[UPPER]); 1093 list_del(&edge->list[UPPER]); 1094 list_del(&edge->list[LOWER]); 1095 lower = edge->node[LOWER]; 1096 free_backref_edge(cache, edge); 1097 1098 if (list_empty(&lower->upper)) 1099 list_add(&lower->list, &useless); 1100 } 1101 __mark_block_processed(rc, upper); 1102 if (upper->level > 0) { 1103 list_add(&upper->list, &cache->detached); 1104 upper->detached = 1; 1105 } else { 1106 rb_erase(&upper->rb_node, &cache->rb_root); 1107 free_backref_node(cache, upper); 1108 } 1109 } 1110 out: 1111 btrfs_free_path(path1); 1112 btrfs_free_path(path2); 1113 if (err) { 1114 while (!list_empty(&useless)) { 1115 lower = list_entry(useless.next, 1116 struct backref_node, list); 1117 list_del_init(&lower->list); 1118 } 1119 while (!list_empty(&list)) { 1120 edge = list_first_entry(&list, struct backref_edge, 1121 list[UPPER]); 1122 list_del(&edge->list[UPPER]); 1123 list_del(&edge->list[LOWER]); 1124 lower = edge->node[LOWER]; 1125 upper = edge->node[UPPER]; 1126 free_backref_edge(cache, edge); 1127 1128 /* 1129 * Lower is no longer linked to any upper backref nodes 1130 * and isn't in the cache, we can free it ourselves. 1131 */ 1132 if (list_empty(&lower->upper) && 1133 RB_EMPTY_NODE(&lower->rb_node)) 1134 list_add(&lower->list, &useless); 1135 1136 if (!RB_EMPTY_NODE(&upper->rb_node)) 1137 continue; 1138 1139 /* Add this guy's upper edges to the list to process */ 1140 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1141 list_add_tail(&edge->list[UPPER], &list); 1142 if (list_empty(&upper->upper)) 1143 list_add(&upper->list, &useless); 1144 } 1145 1146 while (!list_empty(&useless)) { 1147 lower = list_entry(useless.next, 1148 struct backref_node, list); 1149 list_del_init(&lower->list); 1150 if (lower == node) 1151 node = NULL; 1152 free_backref_node(cache, lower); 1153 } 1154 1155 free_backref_node(cache, node); 1156 return ERR_PTR(err); 1157 } 1158 ASSERT(!node || !node->detached); 1159 return node; 1160 } 1161 1162 /* 1163 * helper to add backref node for the newly created snapshot. 1164 * the backref node is created by cloning backref node that 1165 * corresponds to root of source tree 1166 */ 1167 static int clone_backref_node(struct btrfs_trans_handle *trans, 1168 struct reloc_control *rc, 1169 struct btrfs_root *src, 1170 struct btrfs_root *dest) 1171 { 1172 struct btrfs_root *reloc_root = src->reloc_root; 1173 struct backref_cache *cache = &rc->backref_cache; 1174 struct backref_node *node = NULL; 1175 struct backref_node *new_node; 1176 struct backref_edge *edge; 1177 struct backref_edge *new_edge; 1178 struct rb_node *rb_node; 1179 1180 if (cache->last_trans > 0) 1181 update_backref_cache(trans, cache); 1182 1183 rb_node = tree_search(&cache->rb_root, src->commit_root->start); 1184 if (rb_node) { 1185 node = rb_entry(rb_node, struct backref_node, rb_node); 1186 if (node->detached) 1187 node = NULL; 1188 else 1189 BUG_ON(node->new_bytenr != reloc_root->node->start); 1190 } 1191 1192 if (!node) { 1193 rb_node = tree_search(&cache->rb_root, 1194 reloc_root->commit_root->start); 1195 if (rb_node) { 1196 node = rb_entry(rb_node, struct backref_node, 1197 rb_node); 1198 BUG_ON(node->detached); 1199 } 1200 } 1201 1202 if (!node) 1203 return 0; 1204 1205 new_node = alloc_backref_node(cache); 1206 if (!new_node) 1207 return -ENOMEM; 1208 1209 new_node->bytenr = dest->node->start; 1210 new_node->level = node->level; 1211 new_node->lowest = node->lowest; 1212 new_node->checked = 1; 1213 new_node->root = dest; 1214 1215 if (!node->lowest) { 1216 list_for_each_entry(edge, &node->lower, list[UPPER]) { 1217 new_edge = alloc_backref_edge(cache); 1218 if (!new_edge) 1219 goto fail; 1220 1221 new_edge->node[UPPER] = new_node; 1222 new_edge->node[LOWER] = edge->node[LOWER]; 1223 list_add_tail(&new_edge->list[UPPER], 1224 &new_node->lower); 1225 } 1226 } else { 1227 list_add_tail(&new_node->lower, &cache->leaves); 1228 } 1229 1230 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1231 &new_node->rb_node); 1232 if (rb_node) 1233 backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); 1234 1235 if (!new_node->lowest) { 1236 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { 1237 list_add_tail(&new_edge->list[LOWER], 1238 &new_edge->node[LOWER]->upper); 1239 } 1240 } 1241 return 0; 1242 fail: 1243 while (!list_empty(&new_node->lower)) { 1244 new_edge = list_entry(new_node->lower.next, 1245 struct backref_edge, list[UPPER]); 1246 list_del(&new_edge->list[UPPER]); 1247 free_backref_edge(cache, new_edge); 1248 } 1249 free_backref_node(cache, new_node); 1250 return -ENOMEM; 1251 } 1252 1253 /* 1254 * helper to add 'address of tree root -> reloc tree' mapping 1255 */ 1256 static int __must_check __add_reloc_root(struct btrfs_root *root) 1257 { 1258 struct btrfs_fs_info *fs_info = root->fs_info; 1259 struct rb_node *rb_node; 1260 struct mapping_node *node; 1261 struct reloc_control *rc = fs_info->reloc_ctl; 1262 1263 node = kmalloc(sizeof(*node), GFP_NOFS); 1264 if (!node) 1265 return -ENOMEM; 1266 1267 node->bytenr = root->node->start; 1268 node->data = root; 1269 1270 spin_lock(&rc->reloc_root_tree.lock); 1271 rb_node = tree_insert(&rc->reloc_root_tree.rb_root, 1272 node->bytenr, &node->rb_node); 1273 spin_unlock(&rc->reloc_root_tree.lock); 1274 if (rb_node) { 1275 btrfs_panic(fs_info, -EEXIST, 1276 "Duplicate root found for start=%llu while inserting into relocation tree", 1277 node->bytenr); 1278 } 1279 1280 list_add_tail(&root->root_list, &rc->reloc_roots); 1281 return 0; 1282 } 1283 1284 /* 1285 * helper to delete the 'address of tree root -> reloc tree' 1286 * mapping 1287 */ 1288 static void __del_reloc_root(struct btrfs_root *root) 1289 { 1290 struct btrfs_fs_info *fs_info = root->fs_info; 1291 struct rb_node *rb_node; 1292 struct mapping_node *node = NULL; 1293 struct reloc_control *rc = fs_info->reloc_ctl; 1294 1295 if (rc && root->node) { 1296 spin_lock(&rc->reloc_root_tree.lock); 1297 rb_node = tree_search(&rc->reloc_root_tree.rb_root, 1298 root->node->start); 1299 if (rb_node) { 1300 node = rb_entry(rb_node, struct mapping_node, rb_node); 1301 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 1302 } 1303 spin_unlock(&rc->reloc_root_tree.lock); 1304 if (!node) 1305 return; 1306 BUG_ON((struct btrfs_root *)node->data != root); 1307 } 1308 1309 spin_lock(&fs_info->trans_lock); 1310 list_del_init(&root->root_list); 1311 spin_unlock(&fs_info->trans_lock); 1312 kfree(node); 1313 } 1314 1315 /* 1316 * helper to update the 'address of tree root -> reloc tree' 1317 * mapping 1318 */ 1319 static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) 1320 { 1321 struct btrfs_fs_info *fs_info = root->fs_info; 1322 struct rb_node *rb_node; 1323 struct mapping_node *node = NULL; 1324 struct reloc_control *rc = fs_info->reloc_ctl; 1325 1326 spin_lock(&rc->reloc_root_tree.lock); 1327 rb_node = tree_search(&rc->reloc_root_tree.rb_root, 1328 root->node->start); 1329 if (rb_node) { 1330 node = rb_entry(rb_node, struct mapping_node, rb_node); 1331 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 1332 } 1333 spin_unlock(&rc->reloc_root_tree.lock); 1334 1335 if (!node) 1336 return 0; 1337 BUG_ON((struct btrfs_root *)node->data != root); 1338 1339 spin_lock(&rc->reloc_root_tree.lock); 1340 node->bytenr = new_bytenr; 1341 rb_node = tree_insert(&rc->reloc_root_tree.rb_root, 1342 node->bytenr, &node->rb_node); 1343 spin_unlock(&rc->reloc_root_tree.lock); 1344 if (rb_node) 1345 backref_tree_panic(rb_node, -EEXIST, node->bytenr); 1346 return 0; 1347 } 1348 1349 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, 1350 struct btrfs_root *root, u64 objectid) 1351 { 1352 struct btrfs_fs_info *fs_info = root->fs_info; 1353 struct btrfs_root *reloc_root; 1354 struct extent_buffer *eb; 1355 struct btrfs_root_item *root_item; 1356 struct btrfs_key root_key; 1357 int ret; 1358 1359 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1360 BUG_ON(!root_item); 1361 1362 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1363 root_key.type = BTRFS_ROOT_ITEM_KEY; 1364 root_key.offset = objectid; 1365 1366 if (root->root_key.objectid == objectid) { 1367 u64 commit_root_gen; 1368 1369 /* called by btrfs_init_reloc_root */ 1370 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1371 BTRFS_TREE_RELOC_OBJECTID); 1372 BUG_ON(ret); 1373 /* 1374 * Set the last_snapshot field to the generation of the commit 1375 * root - like this ctree.c:btrfs_block_can_be_shared() behaves 1376 * correctly (returns true) when the relocation root is created 1377 * either inside the critical section of a transaction commit 1378 * (through transaction.c:qgroup_account_snapshot()) and when 1379 * it's created before the transaction commit is started. 1380 */ 1381 commit_root_gen = btrfs_header_generation(root->commit_root); 1382 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); 1383 } else { 1384 /* 1385 * called by btrfs_reloc_post_snapshot_hook. 1386 * the source tree is a reloc tree, all tree blocks 1387 * modified after it was created have RELOC flag 1388 * set in their headers. so it's OK to not update 1389 * the 'last_snapshot'. 1390 */ 1391 ret = btrfs_copy_root(trans, root, root->node, &eb, 1392 BTRFS_TREE_RELOC_OBJECTID); 1393 BUG_ON(ret); 1394 } 1395 1396 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1397 btrfs_set_root_bytenr(root_item, eb->start); 1398 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1399 btrfs_set_root_generation(root_item, trans->transid); 1400 1401 if (root->root_key.objectid == objectid) { 1402 btrfs_set_root_refs(root_item, 0); 1403 memset(&root_item->drop_progress, 0, 1404 sizeof(struct btrfs_disk_key)); 1405 root_item->drop_level = 0; 1406 } 1407 1408 btrfs_tree_unlock(eb); 1409 free_extent_buffer(eb); 1410 1411 ret = btrfs_insert_root(trans, fs_info->tree_root, 1412 &root_key, root_item); 1413 BUG_ON(ret); 1414 kfree(root_item); 1415 1416 reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key); 1417 BUG_ON(IS_ERR(reloc_root)); 1418 reloc_root->last_trans = trans->transid; 1419 return reloc_root; 1420 } 1421 1422 /* 1423 * create reloc tree for a given fs tree. reloc tree is just a 1424 * snapshot of the fs tree with special root objectid. 1425 */ 1426 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, 1427 struct btrfs_root *root) 1428 { 1429 struct btrfs_fs_info *fs_info = root->fs_info; 1430 struct btrfs_root *reloc_root; 1431 struct reloc_control *rc = fs_info->reloc_ctl; 1432 struct btrfs_block_rsv *rsv; 1433 int clear_rsv = 0; 1434 int ret; 1435 1436 if (root->reloc_root) { 1437 reloc_root = root->reloc_root; 1438 reloc_root->last_trans = trans->transid; 1439 return 0; 1440 } 1441 1442 if (!rc || !rc->create_reloc_tree || 1443 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 1444 return 0; 1445 1446 if (!trans->reloc_reserved) { 1447 rsv = trans->block_rsv; 1448 trans->block_rsv = rc->block_rsv; 1449 clear_rsv = 1; 1450 } 1451 reloc_root = create_reloc_root(trans, root, root->root_key.objectid); 1452 if (clear_rsv) 1453 trans->block_rsv = rsv; 1454 1455 ret = __add_reloc_root(reloc_root); 1456 BUG_ON(ret < 0); 1457 root->reloc_root = reloc_root; 1458 return 0; 1459 } 1460 1461 /* 1462 * update root item of reloc tree 1463 */ 1464 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, 1465 struct btrfs_root *root) 1466 { 1467 struct btrfs_fs_info *fs_info = root->fs_info; 1468 struct btrfs_root *reloc_root; 1469 struct btrfs_root_item *root_item; 1470 int ret; 1471 1472 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || 1473 !root->reloc_root) 1474 goto out; 1475 1476 reloc_root = root->reloc_root; 1477 root_item = &reloc_root->root_item; 1478 1479 /* root->reloc_root will stay until current relocation finished */ 1480 if (fs_info->reloc_ctl->merge_reloc_tree && 1481 btrfs_root_refs(root_item) == 0) { 1482 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 1483 __del_reloc_root(reloc_root); 1484 } 1485 1486 if (reloc_root->commit_root != reloc_root->node) { 1487 btrfs_set_root_node(root_item, reloc_root->node); 1488 free_extent_buffer(reloc_root->commit_root); 1489 reloc_root->commit_root = btrfs_root_node(reloc_root); 1490 } 1491 1492 ret = btrfs_update_root(trans, fs_info->tree_root, 1493 &reloc_root->root_key, root_item); 1494 BUG_ON(ret); 1495 1496 out: 1497 return 0; 1498 } 1499 1500 /* 1501 * helper to find first cached inode with inode number >= objectid 1502 * in a subvolume 1503 */ 1504 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) 1505 { 1506 struct rb_node *node; 1507 struct rb_node *prev; 1508 struct btrfs_inode *entry; 1509 struct inode *inode; 1510 1511 spin_lock(&root->inode_lock); 1512 again: 1513 node = root->inode_tree.rb_node; 1514 prev = NULL; 1515 while (node) { 1516 prev = node; 1517 entry = rb_entry(node, struct btrfs_inode, rb_node); 1518 1519 if (objectid < btrfs_ino(entry)) 1520 node = node->rb_left; 1521 else if (objectid > btrfs_ino(entry)) 1522 node = node->rb_right; 1523 else 1524 break; 1525 } 1526 if (!node) { 1527 while (prev) { 1528 entry = rb_entry(prev, struct btrfs_inode, rb_node); 1529 if (objectid <= btrfs_ino(entry)) { 1530 node = prev; 1531 break; 1532 } 1533 prev = rb_next(prev); 1534 } 1535 } 1536 while (node) { 1537 entry = rb_entry(node, struct btrfs_inode, rb_node); 1538 inode = igrab(&entry->vfs_inode); 1539 if (inode) { 1540 spin_unlock(&root->inode_lock); 1541 return inode; 1542 } 1543 1544 objectid = btrfs_ino(entry) + 1; 1545 if (cond_resched_lock(&root->inode_lock)) 1546 goto again; 1547 1548 node = rb_next(node); 1549 } 1550 spin_unlock(&root->inode_lock); 1551 return NULL; 1552 } 1553 1554 static int in_block_group(u64 bytenr, 1555 struct btrfs_block_group_cache *block_group) 1556 { 1557 if (bytenr >= block_group->key.objectid && 1558 bytenr < block_group->key.objectid + block_group->key.offset) 1559 return 1; 1560 return 0; 1561 } 1562 1563 /* 1564 * get new location of data 1565 */ 1566 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, 1567 u64 bytenr, u64 num_bytes) 1568 { 1569 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 1570 struct btrfs_path *path; 1571 struct btrfs_file_extent_item *fi; 1572 struct extent_buffer *leaf; 1573 int ret; 1574 1575 path = btrfs_alloc_path(); 1576 if (!path) 1577 return -ENOMEM; 1578 1579 bytenr -= BTRFS_I(reloc_inode)->index_cnt; 1580 ret = btrfs_lookup_file_extent(NULL, root, path, 1581 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); 1582 if (ret < 0) 1583 goto out; 1584 if (ret > 0) { 1585 ret = -ENOENT; 1586 goto out; 1587 } 1588 1589 leaf = path->nodes[0]; 1590 fi = btrfs_item_ptr(leaf, path->slots[0], 1591 struct btrfs_file_extent_item); 1592 1593 BUG_ON(btrfs_file_extent_offset(leaf, fi) || 1594 btrfs_file_extent_compression(leaf, fi) || 1595 btrfs_file_extent_encryption(leaf, fi) || 1596 btrfs_file_extent_other_encoding(leaf, fi)); 1597 1598 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { 1599 ret = -EINVAL; 1600 goto out; 1601 } 1602 1603 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1604 ret = 0; 1605 out: 1606 btrfs_free_path(path); 1607 return ret; 1608 } 1609 1610 /* 1611 * update file extent items in the tree leaf to point to 1612 * the new locations. 1613 */ 1614 static noinline_for_stack 1615 int replace_file_extents(struct btrfs_trans_handle *trans, 1616 struct reloc_control *rc, 1617 struct btrfs_root *root, 1618 struct extent_buffer *leaf) 1619 { 1620 struct btrfs_fs_info *fs_info = root->fs_info; 1621 struct btrfs_key key; 1622 struct btrfs_file_extent_item *fi; 1623 struct inode *inode = NULL; 1624 u64 parent; 1625 u64 bytenr; 1626 u64 new_bytenr = 0; 1627 u64 num_bytes; 1628 u64 end; 1629 u32 nritems; 1630 u32 i; 1631 int ret = 0; 1632 int first = 1; 1633 int dirty = 0; 1634 1635 if (rc->stage != UPDATE_DATA_PTRS) 1636 return 0; 1637 1638 /* reloc trees always use full backref */ 1639 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) 1640 parent = leaf->start; 1641 else 1642 parent = 0; 1643 1644 nritems = btrfs_header_nritems(leaf); 1645 for (i = 0; i < nritems; i++) { 1646 struct btrfs_ref ref = { 0 }; 1647 1648 cond_resched(); 1649 btrfs_item_key_to_cpu(leaf, &key, i); 1650 if (key.type != BTRFS_EXTENT_DATA_KEY) 1651 continue; 1652 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 1653 if (btrfs_file_extent_type(leaf, fi) == 1654 BTRFS_FILE_EXTENT_INLINE) 1655 continue; 1656 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 1657 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 1658 if (bytenr == 0) 1659 continue; 1660 if (!in_block_group(bytenr, rc->block_group)) 1661 continue; 1662 1663 /* 1664 * if we are modifying block in fs tree, wait for readpage 1665 * to complete and drop the extent cache 1666 */ 1667 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1668 if (first) { 1669 inode = find_next_inode(root, key.objectid); 1670 first = 0; 1671 } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { 1672 btrfs_add_delayed_iput(inode); 1673 inode = find_next_inode(root, key.objectid); 1674 } 1675 if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { 1676 end = key.offset + 1677 btrfs_file_extent_num_bytes(leaf, fi); 1678 WARN_ON(!IS_ALIGNED(key.offset, 1679 fs_info->sectorsize)); 1680 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1681 end--; 1682 ret = try_lock_extent(&BTRFS_I(inode)->io_tree, 1683 key.offset, end); 1684 if (!ret) 1685 continue; 1686 1687 btrfs_drop_extent_cache(BTRFS_I(inode), 1688 key.offset, end, 1); 1689 unlock_extent(&BTRFS_I(inode)->io_tree, 1690 key.offset, end); 1691 } 1692 } 1693 1694 ret = get_new_location(rc->data_inode, &new_bytenr, 1695 bytenr, num_bytes); 1696 if (ret) { 1697 /* 1698 * Don't have to abort since we've not changed anything 1699 * in the file extent yet. 1700 */ 1701 break; 1702 } 1703 1704 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1705 dirty = 1; 1706 1707 key.offset -= btrfs_file_extent_offset(leaf, fi); 1708 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, 1709 num_bytes, parent); 1710 ref.real_root = root->root_key.objectid; 1711 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 1712 key.objectid, key.offset); 1713 ret = btrfs_inc_extent_ref(trans, &ref); 1714 if (ret) { 1715 btrfs_abort_transaction(trans, ret); 1716 break; 1717 } 1718 1719 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 1720 num_bytes, parent); 1721 ref.real_root = root->root_key.objectid; 1722 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), 1723 key.objectid, key.offset); 1724 ret = btrfs_free_extent(trans, &ref); 1725 if (ret) { 1726 btrfs_abort_transaction(trans, ret); 1727 break; 1728 } 1729 } 1730 if (dirty) 1731 btrfs_mark_buffer_dirty(leaf); 1732 if (inode) 1733 btrfs_add_delayed_iput(inode); 1734 return ret; 1735 } 1736 1737 static noinline_for_stack 1738 int memcmp_node_keys(struct extent_buffer *eb, int slot, 1739 struct btrfs_path *path, int level) 1740 { 1741 struct btrfs_disk_key key1; 1742 struct btrfs_disk_key key2; 1743 btrfs_node_key(eb, &key1, slot); 1744 btrfs_node_key(path->nodes[level], &key2, path->slots[level]); 1745 return memcmp(&key1, &key2, sizeof(key1)); 1746 } 1747 1748 /* 1749 * try to replace tree blocks in fs tree with the new blocks 1750 * in reloc tree. tree blocks haven't been modified since the 1751 * reloc tree was create can be replaced. 1752 * 1753 * if a block was replaced, level of the block + 1 is returned. 1754 * if no block got replaced, 0 is returned. if there are other 1755 * errors, a negative error number is returned. 1756 */ 1757 static noinline_for_stack 1758 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, 1759 struct btrfs_root *dest, struct btrfs_root *src, 1760 struct btrfs_path *path, struct btrfs_key *next_key, 1761 int lowest_level, int max_level) 1762 { 1763 struct btrfs_fs_info *fs_info = dest->fs_info; 1764 struct extent_buffer *eb; 1765 struct extent_buffer *parent; 1766 struct btrfs_ref ref = { 0 }; 1767 struct btrfs_key key; 1768 u64 old_bytenr; 1769 u64 new_bytenr; 1770 u64 old_ptr_gen; 1771 u64 new_ptr_gen; 1772 u64 last_snapshot; 1773 u32 blocksize; 1774 int cow = 0; 1775 int level; 1776 int ret; 1777 int slot; 1778 1779 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1780 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1781 1782 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1783 again: 1784 slot = path->slots[lowest_level]; 1785 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1786 1787 eb = btrfs_lock_root_node(dest); 1788 btrfs_set_lock_blocking_write(eb); 1789 level = btrfs_header_level(eb); 1790 1791 if (level < lowest_level) { 1792 btrfs_tree_unlock(eb); 1793 free_extent_buffer(eb); 1794 return 0; 1795 } 1796 1797 if (cow) { 1798 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1799 BUG_ON(ret); 1800 } 1801 btrfs_set_lock_blocking_write(eb); 1802 1803 if (next_key) { 1804 next_key->objectid = (u64)-1; 1805 next_key->type = (u8)-1; 1806 next_key->offset = (u64)-1; 1807 } 1808 1809 parent = eb; 1810 while (1) { 1811 struct btrfs_key first_key; 1812 1813 level = btrfs_header_level(parent); 1814 BUG_ON(level < lowest_level); 1815 1816 ret = btrfs_bin_search(parent, &key, level, &slot); 1817 if (ret < 0) 1818 break; 1819 if (ret && slot > 0) 1820 slot--; 1821 1822 if (next_key && slot + 1 < btrfs_header_nritems(parent)) 1823 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1824 1825 old_bytenr = btrfs_node_blockptr(parent, slot); 1826 blocksize = fs_info->nodesize; 1827 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1828 btrfs_node_key_to_cpu(parent, &first_key, slot); 1829 1830 if (level <= max_level) { 1831 eb = path->nodes[level]; 1832 new_bytenr = btrfs_node_blockptr(eb, 1833 path->slots[level]); 1834 new_ptr_gen = btrfs_node_ptr_generation(eb, 1835 path->slots[level]); 1836 } else { 1837 new_bytenr = 0; 1838 new_ptr_gen = 0; 1839 } 1840 1841 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { 1842 ret = level; 1843 break; 1844 } 1845 1846 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1847 memcmp_node_keys(parent, slot, path, level)) { 1848 if (level <= lowest_level) { 1849 ret = 0; 1850 break; 1851 } 1852 1853 eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen, 1854 level - 1, &first_key); 1855 if (IS_ERR(eb)) { 1856 ret = PTR_ERR(eb); 1857 break; 1858 } else if (!extent_buffer_uptodate(eb)) { 1859 ret = -EIO; 1860 free_extent_buffer(eb); 1861 break; 1862 } 1863 btrfs_tree_lock(eb); 1864 if (cow) { 1865 ret = btrfs_cow_block(trans, dest, eb, parent, 1866 slot, &eb); 1867 BUG_ON(ret); 1868 } 1869 btrfs_set_lock_blocking_write(eb); 1870 1871 btrfs_tree_unlock(parent); 1872 free_extent_buffer(parent); 1873 1874 parent = eb; 1875 continue; 1876 } 1877 1878 if (!cow) { 1879 btrfs_tree_unlock(parent); 1880 free_extent_buffer(parent); 1881 cow = 1; 1882 goto again; 1883 } 1884 1885 btrfs_node_key_to_cpu(path->nodes[level], &key, 1886 path->slots[level]); 1887 btrfs_release_path(path); 1888 1889 path->lowest_level = level; 1890 ret = btrfs_search_slot(trans, src, &key, path, 0, 1); 1891 path->lowest_level = 0; 1892 BUG_ON(ret); 1893 1894 /* 1895 * Info qgroup to trace both subtrees. 1896 * 1897 * We must trace both trees. 1898 * 1) Tree reloc subtree 1899 * If not traced, we will leak data numbers 1900 * 2) Fs subtree 1901 * If not traced, we will double count old data 1902 * 1903 * We don't scan the subtree right now, but only record 1904 * the swapped tree blocks. 1905 * The real subtree rescan is delayed until we have new 1906 * CoW on the subtree root node before transaction commit. 1907 */ 1908 ret = btrfs_qgroup_add_swapped_blocks(trans, dest, 1909 rc->block_group, parent, slot, 1910 path->nodes[level], path->slots[level], 1911 last_snapshot); 1912 if (ret < 0) 1913 break; 1914 /* 1915 * swap blocks in fs tree and reloc tree. 1916 */ 1917 btrfs_set_node_blockptr(parent, slot, new_bytenr); 1918 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); 1919 btrfs_mark_buffer_dirty(parent); 1920 1921 btrfs_set_node_blockptr(path->nodes[level], 1922 path->slots[level], old_bytenr); 1923 btrfs_set_node_ptr_generation(path->nodes[level], 1924 path->slots[level], old_ptr_gen); 1925 btrfs_mark_buffer_dirty(path->nodes[level]); 1926 1927 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, 1928 blocksize, path->nodes[level]->start); 1929 ref.skip_qgroup = true; 1930 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); 1931 ret = btrfs_inc_extent_ref(trans, &ref); 1932 BUG_ON(ret); 1933 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, 1934 blocksize, 0); 1935 ref.skip_qgroup = true; 1936 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); 1937 ret = btrfs_inc_extent_ref(trans, &ref); 1938 BUG_ON(ret); 1939 1940 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, 1941 blocksize, path->nodes[level]->start); 1942 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); 1943 ref.skip_qgroup = true; 1944 ret = btrfs_free_extent(trans, &ref); 1945 BUG_ON(ret); 1946 1947 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, 1948 blocksize, 0); 1949 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); 1950 ref.skip_qgroup = true; 1951 ret = btrfs_free_extent(trans, &ref); 1952 BUG_ON(ret); 1953 1954 btrfs_unlock_up_safe(path, 0); 1955 1956 ret = level; 1957 break; 1958 } 1959 btrfs_tree_unlock(parent); 1960 free_extent_buffer(parent); 1961 return ret; 1962 } 1963 1964 /* 1965 * helper to find next relocated block in reloc tree 1966 */ 1967 static noinline_for_stack 1968 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1969 int *level) 1970 { 1971 struct extent_buffer *eb; 1972 int i; 1973 u64 last_snapshot; 1974 u32 nritems; 1975 1976 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1977 1978 for (i = 0; i < *level; i++) { 1979 free_extent_buffer(path->nodes[i]); 1980 path->nodes[i] = NULL; 1981 } 1982 1983 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 1984 eb = path->nodes[i]; 1985 nritems = btrfs_header_nritems(eb); 1986 while (path->slots[i] + 1 < nritems) { 1987 path->slots[i]++; 1988 if (btrfs_node_ptr_generation(eb, path->slots[i]) <= 1989 last_snapshot) 1990 continue; 1991 1992 *level = i; 1993 return 0; 1994 } 1995 free_extent_buffer(path->nodes[i]); 1996 path->nodes[i] = NULL; 1997 } 1998 return 1; 1999 } 2000 2001 /* 2002 * walk down reloc tree to find relocated block of lowest level 2003 */ 2004 static noinline_for_stack 2005 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 2006 int *level) 2007 { 2008 struct btrfs_fs_info *fs_info = root->fs_info; 2009 struct extent_buffer *eb = NULL; 2010 int i; 2011 u64 bytenr; 2012 u64 ptr_gen = 0; 2013 u64 last_snapshot; 2014 u32 nritems; 2015 2016 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 2017 2018 for (i = *level; i > 0; i--) { 2019 struct btrfs_key first_key; 2020 2021 eb = path->nodes[i]; 2022 nritems = btrfs_header_nritems(eb); 2023 while (path->slots[i] < nritems) { 2024 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); 2025 if (ptr_gen > last_snapshot) 2026 break; 2027 path->slots[i]++; 2028 } 2029 if (path->slots[i] >= nritems) { 2030 if (i == *level) 2031 break; 2032 *level = i + 1; 2033 return 0; 2034 } 2035 if (i == 1) { 2036 *level = i; 2037 return 0; 2038 } 2039 2040 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2041 btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]); 2042 eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1, 2043 &first_key); 2044 if (IS_ERR(eb)) { 2045 return PTR_ERR(eb); 2046 } else if (!extent_buffer_uptodate(eb)) { 2047 free_extent_buffer(eb); 2048 return -EIO; 2049 } 2050 BUG_ON(btrfs_header_level(eb) != i - 1); 2051 path->nodes[i - 1] = eb; 2052 path->slots[i - 1] = 0; 2053 } 2054 return 1; 2055 } 2056 2057 /* 2058 * invalidate extent cache for file extents whose key in range of 2059 * [min_key, max_key) 2060 */ 2061 static int invalidate_extent_cache(struct btrfs_root *root, 2062 struct btrfs_key *min_key, 2063 struct btrfs_key *max_key) 2064 { 2065 struct btrfs_fs_info *fs_info = root->fs_info; 2066 struct inode *inode = NULL; 2067 u64 objectid; 2068 u64 start, end; 2069 u64 ino; 2070 2071 objectid = min_key->objectid; 2072 while (1) { 2073 cond_resched(); 2074 iput(inode); 2075 2076 if (objectid > max_key->objectid) 2077 break; 2078 2079 inode = find_next_inode(root, objectid); 2080 if (!inode) 2081 break; 2082 ino = btrfs_ino(BTRFS_I(inode)); 2083 2084 if (ino > max_key->objectid) { 2085 iput(inode); 2086 break; 2087 } 2088 2089 objectid = ino + 1; 2090 if (!S_ISREG(inode->i_mode)) 2091 continue; 2092 2093 if (unlikely(min_key->objectid == ino)) { 2094 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 2095 continue; 2096 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 2097 start = 0; 2098 else { 2099 start = min_key->offset; 2100 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); 2101 } 2102 } else { 2103 start = 0; 2104 } 2105 2106 if (unlikely(max_key->objectid == ino)) { 2107 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 2108 continue; 2109 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 2110 end = (u64)-1; 2111 } else { 2112 if (max_key->offset == 0) 2113 continue; 2114 end = max_key->offset; 2115 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 2116 end--; 2117 } 2118 } else { 2119 end = (u64)-1; 2120 } 2121 2122 /* the lock_extent waits for readpage to complete */ 2123 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 2124 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); 2125 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 2126 } 2127 return 0; 2128 } 2129 2130 static int find_next_key(struct btrfs_path *path, int level, 2131 struct btrfs_key *key) 2132 2133 { 2134 while (level < BTRFS_MAX_LEVEL) { 2135 if (!path->nodes[level]) 2136 break; 2137 if (path->slots[level] + 1 < 2138 btrfs_header_nritems(path->nodes[level])) { 2139 btrfs_node_key_to_cpu(path->nodes[level], key, 2140 path->slots[level] + 1); 2141 return 0; 2142 } 2143 level++; 2144 } 2145 return 1; 2146 } 2147 2148 /* 2149 * Insert current subvolume into reloc_control::dirty_subvol_roots 2150 */ 2151 static void insert_dirty_subvol(struct btrfs_trans_handle *trans, 2152 struct reloc_control *rc, 2153 struct btrfs_root *root) 2154 { 2155 struct btrfs_root *reloc_root = root->reloc_root; 2156 struct btrfs_root_item *reloc_root_item; 2157 2158 /* @root must be a subvolume tree root with a valid reloc tree */ 2159 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 2160 ASSERT(reloc_root); 2161 2162 reloc_root_item = &reloc_root->root_item; 2163 memset(&reloc_root_item->drop_progress, 0, 2164 sizeof(reloc_root_item->drop_progress)); 2165 reloc_root_item->drop_level = 0; 2166 btrfs_set_root_refs(reloc_root_item, 0); 2167 btrfs_update_reloc_root(trans, root); 2168 2169 if (list_empty(&root->reloc_dirty_list)) { 2170 btrfs_grab_fs_root(root); 2171 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 2172 } 2173 } 2174 2175 static int clean_dirty_subvols(struct reloc_control *rc) 2176 { 2177 struct btrfs_root *root; 2178 struct btrfs_root *next; 2179 int ret = 0; 2180 int ret2; 2181 2182 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 2183 reloc_dirty_list) { 2184 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 2185 /* Merged subvolume, cleanup its reloc root */ 2186 struct btrfs_root *reloc_root = root->reloc_root; 2187 2188 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 2189 list_del_init(&root->reloc_dirty_list); 2190 root->reloc_root = NULL; 2191 if (reloc_root) { 2192 2193 ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); 2194 if (ret2 < 0 && !ret) 2195 ret = ret2; 2196 } 2197 btrfs_put_fs_root(root); 2198 } else { 2199 /* Orphan reloc tree, just clean it up */ 2200 ret2 = btrfs_drop_snapshot(root, NULL, 0, 1); 2201 if (ret2 < 0 && !ret) 2202 ret = ret2; 2203 } 2204 } 2205 return ret; 2206 } 2207 2208 /* 2209 * merge the relocated tree blocks in reloc tree with corresponding 2210 * fs tree. 2211 */ 2212 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 2213 struct btrfs_root *root) 2214 { 2215 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2216 struct btrfs_key key; 2217 struct btrfs_key next_key; 2218 struct btrfs_trans_handle *trans = NULL; 2219 struct btrfs_root *reloc_root; 2220 struct btrfs_root_item *root_item; 2221 struct btrfs_path *path; 2222 struct extent_buffer *leaf; 2223 int level; 2224 int max_level; 2225 int replaced = 0; 2226 int ret; 2227 int err = 0; 2228 u32 min_reserved; 2229 2230 path = btrfs_alloc_path(); 2231 if (!path) 2232 return -ENOMEM; 2233 path->reada = READA_FORWARD; 2234 2235 reloc_root = root->reloc_root; 2236 root_item = &reloc_root->root_item; 2237 2238 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 2239 level = btrfs_root_level(root_item); 2240 extent_buffer_get(reloc_root->node); 2241 path->nodes[level] = reloc_root->node; 2242 path->slots[level] = 0; 2243 } else { 2244 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 2245 2246 level = root_item->drop_level; 2247 BUG_ON(level == 0); 2248 path->lowest_level = level; 2249 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 2250 path->lowest_level = 0; 2251 if (ret < 0) { 2252 btrfs_free_path(path); 2253 return ret; 2254 } 2255 2256 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 2257 path->slots[level]); 2258 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 2259 2260 btrfs_unlock_up_safe(path, 0); 2261 } 2262 2263 min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2264 memset(&next_key, 0, sizeof(next_key)); 2265 2266 while (1) { 2267 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 2268 BTRFS_RESERVE_FLUSH_ALL); 2269 if (ret) { 2270 err = ret; 2271 goto out; 2272 } 2273 trans = btrfs_start_transaction(root, 0); 2274 if (IS_ERR(trans)) { 2275 err = PTR_ERR(trans); 2276 trans = NULL; 2277 goto out; 2278 } 2279 trans->block_rsv = rc->block_rsv; 2280 2281 replaced = 0; 2282 max_level = level; 2283 2284 ret = walk_down_reloc_tree(reloc_root, path, &level); 2285 if (ret < 0) { 2286 err = ret; 2287 goto out; 2288 } 2289 if (ret > 0) 2290 break; 2291 2292 if (!find_next_key(path, level, &key) && 2293 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2294 ret = 0; 2295 } else { 2296 ret = replace_path(trans, rc, root, reloc_root, path, 2297 &next_key, level, max_level); 2298 } 2299 if (ret < 0) { 2300 err = ret; 2301 goto out; 2302 } 2303 2304 if (ret > 0) { 2305 level = ret; 2306 btrfs_node_key_to_cpu(path->nodes[level], &key, 2307 path->slots[level]); 2308 replaced = 1; 2309 } 2310 2311 ret = walk_up_reloc_tree(reloc_root, path, &level); 2312 if (ret > 0) 2313 break; 2314 2315 BUG_ON(level == 0); 2316 /* 2317 * save the merging progress in the drop_progress. 2318 * this is OK since root refs == 1 in this case. 2319 */ 2320 btrfs_node_key(path->nodes[level], &root_item->drop_progress, 2321 path->slots[level]); 2322 root_item->drop_level = level; 2323 2324 btrfs_end_transaction_throttle(trans); 2325 trans = NULL; 2326 2327 btrfs_btree_balance_dirty(fs_info); 2328 2329 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2330 invalidate_extent_cache(root, &key, &next_key); 2331 } 2332 2333 /* 2334 * handle the case only one block in the fs tree need to be 2335 * relocated and the block is tree root. 2336 */ 2337 leaf = btrfs_lock_root_node(root); 2338 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); 2339 btrfs_tree_unlock(leaf); 2340 free_extent_buffer(leaf); 2341 if (ret < 0) 2342 err = ret; 2343 out: 2344 btrfs_free_path(path); 2345 2346 if (err == 0) 2347 insert_dirty_subvol(trans, rc, root); 2348 2349 if (trans) 2350 btrfs_end_transaction_throttle(trans); 2351 2352 btrfs_btree_balance_dirty(fs_info); 2353 2354 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2355 invalidate_extent_cache(root, &key, &next_key); 2356 2357 return err; 2358 } 2359 2360 static noinline_for_stack 2361 int prepare_to_merge(struct reloc_control *rc, int err) 2362 { 2363 struct btrfs_root *root = rc->extent_root; 2364 struct btrfs_fs_info *fs_info = root->fs_info; 2365 struct btrfs_root *reloc_root; 2366 struct btrfs_trans_handle *trans; 2367 LIST_HEAD(reloc_roots); 2368 u64 num_bytes = 0; 2369 int ret; 2370 2371 mutex_lock(&fs_info->reloc_mutex); 2372 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2373 rc->merging_rsv_size += rc->nodes_relocated * 2; 2374 mutex_unlock(&fs_info->reloc_mutex); 2375 2376 again: 2377 if (!err) { 2378 num_bytes = rc->merging_rsv_size; 2379 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2380 BTRFS_RESERVE_FLUSH_ALL); 2381 if (ret) 2382 err = ret; 2383 } 2384 2385 trans = btrfs_join_transaction(rc->extent_root); 2386 if (IS_ERR(trans)) { 2387 if (!err) 2388 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2389 num_bytes); 2390 return PTR_ERR(trans); 2391 } 2392 2393 if (!err) { 2394 if (num_bytes != rc->merging_rsv_size) { 2395 btrfs_end_transaction(trans); 2396 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2397 num_bytes); 2398 goto again; 2399 } 2400 } 2401 2402 rc->merge_reloc_tree = 1; 2403 2404 while (!list_empty(&rc->reloc_roots)) { 2405 reloc_root = list_entry(rc->reloc_roots.next, 2406 struct btrfs_root, root_list); 2407 list_del_init(&reloc_root->root_list); 2408 2409 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2410 BUG_ON(IS_ERR(root)); 2411 BUG_ON(root->reloc_root != reloc_root); 2412 2413 /* 2414 * set reference count to 1, so btrfs_recover_relocation 2415 * knows it should resumes merging 2416 */ 2417 if (!err) 2418 btrfs_set_root_refs(&reloc_root->root_item, 1); 2419 btrfs_update_reloc_root(trans, root); 2420 2421 list_add(&reloc_root->root_list, &reloc_roots); 2422 } 2423 2424 list_splice(&reloc_roots, &rc->reloc_roots); 2425 2426 if (!err) 2427 btrfs_commit_transaction(trans); 2428 else 2429 btrfs_end_transaction(trans); 2430 return err; 2431 } 2432 2433 static noinline_for_stack 2434 void free_reloc_roots(struct list_head *list) 2435 { 2436 struct btrfs_root *reloc_root; 2437 2438 while (!list_empty(list)) { 2439 reloc_root = list_entry(list->next, struct btrfs_root, 2440 root_list); 2441 __del_reloc_root(reloc_root); 2442 free_extent_buffer(reloc_root->node); 2443 free_extent_buffer(reloc_root->commit_root); 2444 reloc_root->node = NULL; 2445 reloc_root->commit_root = NULL; 2446 } 2447 } 2448 2449 static noinline_for_stack 2450 void merge_reloc_roots(struct reloc_control *rc) 2451 { 2452 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2453 struct btrfs_root *root; 2454 struct btrfs_root *reloc_root; 2455 LIST_HEAD(reloc_roots); 2456 int found = 0; 2457 int ret = 0; 2458 again: 2459 root = rc->extent_root; 2460 2461 /* 2462 * this serializes us with btrfs_record_root_in_transaction, 2463 * we have to make sure nobody is in the middle of 2464 * adding their roots to the list while we are 2465 * doing this splice 2466 */ 2467 mutex_lock(&fs_info->reloc_mutex); 2468 list_splice_init(&rc->reloc_roots, &reloc_roots); 2469 mutex_unlock(&fs_info->reloc_mutex); 2470 2471 while (!list_empty(&reloc_roots)) { 2472 found = 1; 2473 reloc_root = list_entry(reloc_roots.next, 2474 struct btrfs_root, root_list); 2475 2476 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 2477 root = read_fs_root(fs_info, 2478 reloc_root->root_key.offset); 2479 BUG_ON(IS_ERR(root)); 2480 BUG_ON(root->reloc_root != reloc_root); 2481 2482 ret = merge_reloc_root(rc, root); 2483 if (ret) { 2484 if (list_empty(&reloc_root->root_list)) 2485 list_add_tail(&reloc_root->root_list, 2486 &reloc_roots); 2487 goto out; 2488 } 2489 } else { 2490 list_del_init(&reloc_root->root_list); 2491 /* Don't forget to queue this reloc root for cleanup */ 2492 list_add_tail(&reloc_root->reloc_dirty_list, 2493 &rc->dirty_subvol_roots); 2494 } 2495 } 2496 2497 if (found) { 2498 found = 0; 2499 goto again; 2500 } 2501 out: 2502 if (ret) { 2503 btrfs_handle_fs_error(fs_info, ret, NULL); 2504 if (!list_empty(&reloc_roots)) 2505 free_reloc_roots(&reloc_roots); 2506 2507 /* new reloc root may be added */ 2508 mutex_lock(&fs_info->reloc_mutex); 2509 list_splice_init(&rc->reloc_roots, &reloc_roots); 2510 mutex_unlock(&fs_info->reloc_mutex); 2511 if (!list_empty(&reloc_roots)) 2512 free_reloc_roots(&reloc_roots); 2513 } 2514 2515 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2516 } 2517 2518 static void free_block_list(struct rb_root *blocks) 2519 { 2520 struct tree_block *block; 2521 struct rb_node *rb_node; 2522 while ((rb_node = rb_first(blocks))) { 2523 block = rb_entry(rb_node, struct tree_block, rb_node); 2524 rb_erase(rb_node, blocks); 2525 kfree(block); 2526 } 2527 } 2528 2529 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2530 struct btrfs_root *reloc_root) 2531 { 2532 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2533 struct btrfs_root *root; 2534 2535 if (reloc_root->last_trans == trans->transid) 2536 return 0; 2537 2538 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2539 BUG_ON(IS_ERR(root)); 2540 BUG_ON(root->reloc_root != reloc_root); 2541 2542 return btrfs_record_root_in_trans(trans, root); 2543 } 2544 2545 static noinline_for_stack 2546 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2547 struct reloc_control *rc, 2548 struct backref_node *node, 2549 struct backref_edge *edges[]) 2550 { 2551 struct backref_node *next; 2552 struct btrfs_root *root; 2553 int index = 0; 2554 2555 next = node; 2556 while (1) { 2557 cond_resched(); 2558 next = walk_up_backref(next, edges, &index); 2559 root = next->root; 2560 BUG_ON(!root); 2561 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); 2562 2563 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2564 record_reloc_root_in_trans(trans, root); 2565 break; 2566 } 2567 2568 btrfs_record_root_in_trans(trans, root); 2569 root = root->reloc_root; 2570 2571 if (next->new_bytenr != root->node->start) { 2572 BUG_ON(next->new_bytenr); 2573 BUG_ON(!list_empty(&next->list)); 2574 next->new_bytenr = root->node->start; 2575 next->root = root; 2576 list_add_tail(&next->list, 2577 &rc->backref_cache.changed); 2578 __mark_block_processed(rc, next); 2579 break; 2580 } 2581 2582 WARN_ON(1); 2583 root = NULL; 2584 next = walk_down_backref(edges, &index); 2585 if (!next || next->level <= node->level) 2586 break; 2587 } 2588 if (!root) 2589 return NULL; 2590 2591 next = node; 2592 /* setup backref node path for btrfs_reloc_cow_block */ 2593 while (1) { 2594 rc->backref_cache.path[next->level] = next; 2595 if (--index < 0) 2596 break; 2597 next = edges[index]->node[UPPER]; 2598 } 2599 return root; 2600 } 2601 2602 /* 2603 * select a tree root for relocation. return NULL if the block 2604 * is reference counted. we should use do_relocation() in this 2605 * case. return a tree root pointer if the block isn't reference 2606 * counted. return -ENOENT if the block is root of reloc tree. 2607 */ 2608 static noinline_for_stack 2609 struct btrfs_root *select_one_root(struct backref_node *node) 2610 { 2611 struct backref_node *next; 2612 struct btrfs_root *root; 2613 struct btrfs_root *fs_root = NULL; 2614 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2615 int index = 0; 2616 2617 next = node; 2618 while (1) { 2619 cond_resched(); 2620 next = walk_up_backref(next, edges, &index); 2621 root = next->root; 2622 BUG_ON(!root); 2623 2624 /* no other choice for non-references counted tree */ 2625 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 2626 return root; 2627 2628 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2629 fs_root = root; 2630 2631 if (next != node) 2632 return NULL; 2633 2634 next = walk_down_backref(edges, &index); 2635 if (!next || next->level <= node->level) 2636 break; 2637 } 2638 2639 if (!fs_root) 2640 return ERR_PTR(-ENOENT); 2641 return fs_root; 2642 } 2643 2644 static noinline_for_stack 2645 u64 calcu_metadata_size(struct reloc_control *rc, 2646 struct backref_node *node, int reserve) 2647 { 2648 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2649 struct backref_node *next = node; 2650 struct backref_edge *edge; 2651 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2652 u64 num_bytes = 0; 2653 int index = 0; 2654 2655 BUG_ON(reserve && node->processed); 2656 2657 while (next) { 2658 cond_resched(); 2659 while (1) { 2660 if (next->processed && (reserve || next != node)) 2661 break; 2662 2663 num_bytes += fs_info->nodesize; 2664 2665 if (list_empty(&next->upper)) 2666 break; 2667 2668 edge = list_entry(next->upper.next, 2669 struct backref_edge, list[LOWER]); 2670 edges[index++] = edge; 2671 next = edge->node[UPPER]; 2672 } 2673 next = walk_down_backref(edges, &index); 2674 } 2675 return num_bytes; 2676 } 2677 2678 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2679 struct reloc_control *rc, 2680 struct backref_node *node) 2681 { 2682 struct btrfs_root *root = rc->extent_root; 2683 struct btrfs_fs_info *fs_info = root->fs_info; 2684 u64 num_bytes; 2685 int ret; 2686 u64 tmp; 2687 2688 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2689 2690 trans->block_rsv = rc->block_rsv; 2691 rc->reserved_bytes += num_bytes; 2692 2693 /* 2694 * We are under a transaction here so we can only do limited flushing. 2695 * If we get an enospc just kick back -EAGAIN so we know to drop the 2696 * transaction and try to refill when we can flush all the things. 2697 */ 2698 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, 2699 BTRFS_RESERVE_FLUSH_LIMIT); 2700 if (ret) { 2701 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2702 while (tmp <= rc->reserved_bytes) 2703 tmp <<= 1; 2704 /* 2705 * only one thread can access block_rsv at this point, 2706 * so we don't need hold lock to protect block_rsv. 2707 * we expand more reservation size here to allow enough 2708 * space for relocation and we will return earlier in 2709 * enospc case. 2710 */ 2711 rc->block_rsv->size = tmp + fs_info->nodesize * 2712 RELOCATION_RESERVED_NODES; 2713 return -EAGAIN; 2714 } 2715 2716 return 0; 2717 } 2718 2719 /* 2720 * relocate a block tree, and then update pointers in upper level 2721 * blocks that reference the block to point to the new location. 2722 * 2723 * if called by link_to_upper, the block has already been relocated. 2724 * in that case this function just updates pointers. 2725 */ 2726 static int do_relocation(struct btrfs_trans_handle *trans, 2727 struct reloc_control *rc, 2728 struct backref_node *node, 2729 struct btrfs_key *key, 2730 struct btrfs_path *path, int lowest) 2731 { 2732 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2733 struct backref_node *upper; 2734 struct backref_edge *edge; 2735 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2736 struct btrfs_root *root; 2737 struct extent_buffer *eb; 2738 u32 blocksize; 2739 u64 bytenr; 2740 u64 generation; 2741 int slot; 2742 int ret; 2743 int err = 0; 2744 2745 BUG_ON(lowest && node->eb); 2746 2747 path->lowest_level = node->level + 1; 2748 rc->backref_cache.path[node->level] = node; 2749 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2750 struct btrfs_key first_key; 2751 struct btrfs_ref ref = { 0 }; 2752 2753 cond_resched(); 2754 2755 upper = edge->node[UPPER]; 2756 root = select_reloc_root(trans, rc, upper, edges); 2757 BUG_ON(!root); 2758 2759 if (upper->eb && !upper->locked) { 2760 if (!lowest) { 2761 ret = btrfs_bin_search(upper->eb, key, 2762 upper->level, &slot); 2763 if (ret < 0) { 2764 err = ret; 2765 goto next; 2766 } 2767 BUG_ON(ret); 2768 bytenr = btrfs_node_blockptr(upper->eb, slot); 2769 if (node->eb->start == bytenr) 2770 goto next; 2771 } 2772 drop_node_buffer(upper); 2773 } 2774 2775 if (!upper->eb) { 2776 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2777 if (ret) { 2778 if (ret < 0) 2779 err = ret; 2780 else 2781 err = -ENOENT; 2782 2783 btrfs_release_path(path); 2784 break; 2785 } 2786 2787 if (!upper->eb) { 2788 upper->eb = path->nodes[upper->level]; 2789 path->nodes[upper->level] = NULL; 2790 } else { 2791 BUG_ON(upper->eb != path->nodes[upper->level]); 2792 } 2793 2794 upper->locked = 1; 2795 path->locks[upper->level] = 0; 2796 2797 slot = path->slots[upper->level]; 2798 btrfs_release_path(path); 2799 } else { 2800 ret = btrfs_bin_search(upper->eb, key, upper->level, 2801 &slot); 2802 if (ret < 0) { 2803 err = ret; 2804 goto next; 2805 } 2806 BUG_ON(ret); 2807 } 2808 2809 bytenr = btrfs_node_blockptr(upper->eb, slot); 2810 if (lowest) { 2811 if (bytenr != node->bytenr) { 2812 btrfs_err(root->fs_info, 2813 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2814 bytenr, node->bytenr, slot, 2815 upper->eb->start); 2816 err = -EIO; 2817 goto next; 2818 } 2819 } else { 2820 if (node->eb->start == bytenr) 2821 goto next; 2822 } 2823 2824 blocksize = root->fs_info->nodesize; 2825 generation = btrfs_node_ptr_generation(upper->eb, slot); 2826 btrfs_node_key_to_cpu(upper->eb, &first_key, slot); 2827 eb = read_tree_block(fs_info, bytenr, generation, 2828 upper->level - 1, &first_key); 2829 if (IS_ERR(eb)) { 2830 err = PTR_ERR(eb); 2831 goto next; 2832 } else if (!extent_buffer_uptodate(eb)) { 2833 free_extent_buffer(eb); 2834 err = -EIO; 2835 goto next; 2836 } 2837 btrfs_tree_lock(eb); 2838 btrfs_set_lock_blocking_write(eb); 2839 2840 if (!node->eb) { 2841 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2842 slot, &eb); 2843 btrfs_tree_unlock(eb); 2844 free_extent_buffer(eb); 2845 if (ret < 0) { 2846 err = ret; 2847 goto next; 2848 } 2849 BUG_ON(node->eb != eb); 2850 } else { 2851 btrfs_set_node_blockptr(upper->eb, slot, 2852 node->eb->start); 2853 btrfs_set_node_ptr_generation(upper->eb, slot, 2854 trans->transid); 2855 btrfs_mark_buffer_dirty(upper->eb); 2856 2857 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2858 node->eb->start, blocksize, 2859 upper->eb->start); 2860 ref.real_root = root->root_key.objectid; 2861 btrfs_init_tree_ref(&ref, node->level, 2862 btrfs_header_owner(upper->eb)); 2863 ret = btrfs_inc_extent_ref(trans, &ref); 2864 BUG_ON(ret); 2865 2866 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2867 BUG_ON(ret); 2868 } 2869 next: 2870 if (!upper->pending) 2871 drop_node_buffer(upper); 2872 else 2873 unlock_node_buffer(upper); 2874 if (err) 2875 break; 2876 } 2877 2878 if (!err && node->pending) { 2879 drop_node_buffer(node); 2880 list_move_tail(&node->list, &rc->backref_cache.changed); 2881 node->pending = 0; 2882 } 2883 2884 path->lowest_level = 0; 2885 BUG_ON(err == -ENOSPC); 2886 return err; 2887 } 2888 2889 static int link_to_upper(struct btrfs_trans_handle *trans, 2890 struct reloc_control *rc, 2891 struct backref_node *node, 2892 struct btrfs_path *path) 2893 { 2894 struct btrfs_key key; 2895 2896 btrfs_node_key_to_cpu(node->eb, &key, 0); 2897 return do_relocation(trans, rc, node, &key, path, 0); 2898 } 2899 2900 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2901 struct reloc_control *rc, 2902 struct btrfs_path *path, int err) 2903 { 2904 LIST_HEAD(list); 2905 struct backref_cache *cache = &rc->backref_cache; 2906 struct backref_node *node; 2907 int level; 2908 int ret; 2909 2910 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2911 while (!list_empty(&cache->pending[level])) { 2912 node = list_entry(cache->pending[level].next, 2913 struct backref_node, list); 2914 list_move_tail(&node->list, &list); 2915 BUG_ON(!node->pending); 2916 2917 if (!err) { 2918 ret = link_to_upper(trans, rc, node, path); 2919 if (ret < 0) 2920 err = ret; 2921 } 2922 } 2923 list_splice_init(&list, &cache->pending[level]); 2924 } 2925 return err; 2926 } 2927 2928 static void mark_block_processed(struct reloc_control *rc, 2929 u64 bytenr, u32 blocksize) 2930 { 2931 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, 2932 EXTENT_DIRTY); 2933 } 2934 2935 static void __mark_block_processed(struct reloc_control *rc, 2936 struct backref_node *node) 2937 { 2938 u32 blocksize; 2939 if (node->level == 0 || 2940 in_block_group(node->bytenr, rc->block_group)) { 2941 blocksize = rc->extent_root->fs_info->nodesize; 2942 mark_block_processed(rc, node->bytenr, blocksize); 2943 } 2944 node->processed = 1; 2945 } 2946 2947 /* 2948 * mark a block and all blocks directly/indirectly reference the block 2949 * as processed. 2950 */ 2951 static void update_processed_blocks(struct reloc_control *rc, 2952 struct backref_node *node) 2953 { 2954 struct backref_node *next = node; 2955 struct backref_edge *edge; 2956 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2957 int index = 0; 2958 2959 while (next) { 2960 cond_resched(); 2961 while (1) { 2962 if (next->processed) 2963 break; 2964 2965 __mark_block_processed(rc, next); 2966 2967 if (list_empty(&next->upper)) 2968 break; 2969 2970 edge = list_entry(next->upper.next, 2971 struct backref_edge, list[LOWER]); 2972 edges[index++] = edge; 2973 next = edge->node[UPPER]; 2974 } 2975 next = walk_down_backref(edges, &index); 2976 } 2977 } 2978 2979 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2980 { 2981 u32 blocksize = rc->extent_root->fs_info->nodesize; 2982 2983 if (test_range_bit(&rc->processed_blocks, bytenr, 2984 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2985 return 1; 2986 return 0; 2987 } 2988 2989 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2990 struct tree_block *block) 2991 { 2992 struct extent_buffer *eb; 2993 2994 BUG_ON(block->key_ready); 2995 eb = read_tree_block(fs_info, block->bytenr, block->key.offset, 2996 block->level, NULL); 2997 if (IS_ERR(eb)) { 2998 return PTR_ERR(eb); 2999 } else if (!extent_buffer_uptodate(eb)) { 3000 free_extent_buffer(eb); 3001 return -EIO; 3002 } 3003 if (block->level == 0) 3004 btrfs_item_key_to_cpu(eb, &block->key, 0); 3005 else 3006 btrfs_node_key_to_cpu(eb, &block->key, 0); 3007 free_extent_buffer(eb); 3008 block->key_ready = 1; 3009 return 0; 3010 } 3011 3012 /* 3013 * helper function to relocate a tree block 3014 */ 3015 static int relocate_tree_block(struct btrfs_trans_handle *trans, 3016 struct reloc_control *rc, 3017 struct backref_node *node, 3018 struct btrfs_key *key, 3019 struct btrfs_path *path) 3020 { 3021 struct btrfs_root *root; 3022 int ret = 0; 3023 3024 if (!node) 3025 return 0; 3026 3027 BUG_ON(node->processed); 3028 root = select_one_root(node); 3029 if (root == ERR_PTR(-ENOENT)) { 3030 update_processed_blocks(rc, node); 3031 goto out; 3032 } 3033 3034 if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3035 ret = reserve_metadata_space(trans, rc, node); 3036 if (ret) 3037 goto out; 3038 } 3039 3040 if (root) { 3041 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3042 BUG_ON(node->new_bytenr); 3043 BUG_ON(!list_empty(&node->list)); 3044 btrfs_record_root_in_trans(trans, root); 3045 root = root->reloc_root; 3046 node->new_bytenr = root->node->start; 3047 node->root = root; 3048 list_add_tail(&node->list, &rc->backref_cache.changed); 3049 } else { 3050 path->lowest_level = node->level; 3051 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 3052 btrfs_release_path(path); 3053 if (ret > 0) 3054 ret = 0; 3055 } 3056 if (!ret) 3057 update_processed_blocks(rc, node); 3058 } else { 3059 ret = do_relocation(trans, rc, node, key, path, 1); 3060 } 3061 out: 3062 if (ret || node->level == 0 || node->cowonly) 3063 remove_backref_node(&rc->backref_cache, node); 3064 return ret; 3065 } 3066 3067 /* 3068 * relocate a list of blocks 3069 */ 3070 static noinline_for_stack 3071 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 3072 struct reloc_control *rc, struct rb_root *blocks) 3073 { 3074 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3075 struct backref_node *node; 3076 struct btrfs_path *path; 3077 struct tree_block *block; 3078 struct tree_block *next; 3079 int ret; 3080 int err = 0; 3081 3082 path = btrfs_alloc_path(); 3083 if (!path) { 3084 err = -ENOMEM; 3085 goto out_free_blocks; 3086 } 3087 3088 /* Kick in readahead for tree blocks with missing keys */ 3089 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3090 if (!block->key_ready) 3091 readahead_tree_block(fs_info, block->bytenr); 3092 } 3093 3094 /* Get first keys */ 3095 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3096 if (!block->key_ready) { 3097 err = get_tree_block_key(fs_info, block); 3098 if (err) 3099 goto out_free_path; 3100 } 3101 } 3102 3103 /* Do tree relocation */ 3104 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3105 node = build_backref_tree(rc, &block->key, 3106 block->level, block->bytenr); 3107 if (IS_ERR(node)) { 3108 err = PTR_ERR(node); 3109 goto out; 3110 } 3111 3112 ret = relocate_tree_block(trans, rc, node, &block->key, 3113 path); 3114 if (ret < 0) { 3115 if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) 3116 err = ret; 3117 goto out; 3118 } 3119 } 3120 out: 3121 err = finish_pending_nodes(trans, rc, path, err); 3122 3123 out_free_path: 3124 btrfs_free_path(path); 3125 out_free_blocks: 3126 free_block_list(blocks); 3127 return err; 3128 } 3129 3130 static noinline_for_stack 3131 int prealloc_file_extent_cluster(struct inode *inode, 3132 struct file_extent_cluster *cluster) 3133 { 3134 u64 alloc_hint = 0; 3135 u64 start; 3136 u64 end; 3137 u64 offset = BTRFS_I(inode)->index_cnt; 3138 u64 num_bytes; 3139 int nr = 0; 3140 int ret = 0; 3141 u64 prealloc_start = cluster->start - offset; 3142 u64 prealloc_end = cluster->end - offset; 3143 u64 cur_offset; 3144 struct extent_changeset *data_reserved = NULL; 3145 3146 BUG_ON(cluster->start != cluster->boundary[0]); 3147 inode_lock(inode); 3148 3149 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, 3150 prealloc_end + 1 - prealloc_start); 3151 if (ret) 3152 goto out; 3153 3154 cur_offset = prealloc_start; 3155 while (nr < cluster->nr) { 3156 start = cluster->boundary[nr] - offset; 3157 if (nr + 1 < cluster->nr) 3158 end = cluster->boundary[nr + 1] - 1 - offset; 3159 else 3160 end = cluster->end - offset; 3161 3162 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3163 num_bytes = end + 1 - start; 3164 if (cur_offset < start) 3165 btrfs_free_reserved_data_space(inode, data_reserved, 3166 cur_offset, start - cur_offset); 3167 ret = btrfs_prealloc_file_range(inode, 0, start, 3168 num_bytes, num_bytes, 3169 end + 1, &alloc_hint); 3170 cur_offset = end + 1; 3171 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3172 if (ret) 3173 break; 3174 nr++; 3175 } 3176 if (cur_offset < prealloc_end) 3177 btrfs_free_reserved_data_space(inode, data_reserved, 3178 cur_offset, prealloc_end + 1 - cur_offset); 3179 out: 3180 inode_unlock(inode); 3181 extent_changeset_free(data_reserved); 3182 return ret; 3183 } 3184 3185 static noinline_for_stack 3186 int setup_extent_mapping(struct inode *inode, u64 start, u64 end, 3187 u64 block_start) 3188 { 3189 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3190 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3191 struct extent_map *em; 3192 int ret = 0; 3193 3194 em = alloc_extent_map(); 3195 if (!em) 3196 return -ENOMEM; 3197 3198 em->start = start; 3199 em->len = end + 1 - start; 3200 em->block_len = em->len; 3201 em->block_start = block_start; 3202 em->bdev = fs_info->fs_devices->latest_bdev; 3203 set_bit(EXTENT_FLAG_PINNED, &em->flags); 3204 3205 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3206 while (1) { 3207 write_lock(&em_tree->lock); 3208 ret = add_extent_mapping(em_tree, em, 0); 3209 write_unlock(&em_tree->lock); 3210 if (ret != -EEXIST) { 3211 free_extent_map(em); 3212 break; 3213 } 3214 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 3215 } 3216 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3217 return ret; 3218 } 3219 3220 static int relocate_file_extent_cluster(struct inode *inode, 3221 struct file_extent_cluster *cluster) 3222 { 3223 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3224 u64 page_start; 3225 u64 page_end; 3226 u64 offset = BTRFS_I(inode)->index_cnt; 3227 unsigned long index; 3228 unsigned long last_index; 3229 struct page *page; 3230 struct file_ra_state *ra; 3231 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 3232 int nr = 0; 3233 int ret = 0; 3234 3235 if (!cluster->nr) 3236 return 0; 3237 3238 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3239 if (!ra) 3240 return -ENOMEM; 3241 3242 ret = prealloc_file_extent_cluster(inode, cluster); 3243 if (ret) 3244 goto out; 3245 3246 file_ra_state_init(ra, inode->i_mapping); 3247 3248 ret = setup_extent_mapping(inode, cluster->start - offset, 3249 cluster->end - offset, cluster->start); 3250 if (ret) 3251 goto out; 3252 3253 index = (cluster->start - offset) >> PAGE_SHIFT; 3254 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3255 while (index <= last_index) { 3256 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3257 PAGE_SIZE); 3258 if (ret) 3259 goto out; 3260 3261 page = find_lock_page(inode->i_mapping, index); 3262 if (!page) { 3263 page_cache_sync_readahead(inode->i_mapping, 3264 ra, NULL, index, 3265 last_index + 1 - index); 3266 page = find_or_create_page(inode->i_mapping, index, 3267 mask); 3268 if (!page) { 3269 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3270 PAGE_SIZE, true); 3271 ret = -ENOMEM; 3272 goto out; 3273 } 3274 } 3275 3276 if (PageReadahead(page)) { 3277 page_cache_async_readahead(inode->i_mapping, 3278 ra, NULL, page, index, 3279 last_index + 1 - index); 3280 } 3281 3282 if (!PageUptodate(page)) { 3283 btrfs_readpage(NULL, page); 3284 lock_page(page); 3285 if (!PageUptodate(page)) { 3286 unlock_page(page); 3287 put_page(page); 3288 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3289 PAGE_SIZE, true); 3290 btrfs_delalloc_release_extents(BTRFS_I(inode), 3291 PAGE_SIZE, true); 3292 ret = -EIO; 3293 goto out; 3294 } 3295 } 3296 3297 page_start = page_offset(page); 3298 page_end = page_start + PAGE_SIZE - 1; 3299 3300 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3301 3302 set_page_extent_mapped(page); 3303 3304 if (nr < cluster->nr && 3305 page_start + offset == cluster->boundary[nr]) { 3306 set_extent_bits(&BTRFS_I(inode)->io_tree, 3307 page_start, page_end, 3308 EXTENT_BOUNDARY); 3309 nr++; 3310 } 3311 3312 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 3313 NULL, 0); 3314 if (ret) { 3315 unlock_page(page); 3316 put_page(page); 3317 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3318 PAGE_SIZE, true); 3319 btrfs_delalloc_release_extents(BTRFS_I(inode), 3320 PAGE_SIZE, true); 3321 3322 clear_extent_bits(&BTRFS_I(inode)->io_tree, 3323 page_start, page_end, 3324 EXTENT_LOCKED | EXTENT_BOUNDARY); 3325 goto out; 3326 3327 } 3328 set_page_dirty(page); 3329 3330 unlock_extent(&BTRFS_I(inode)->io_tree, 3331 page_start, page_end); 3332 unlock_page(page); 3333 put_page(page); 3334 3335 index++; 3336 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, 3337 false); 3338 balance_dirty_pages_ratelimited(inode->i_mapping); 3339 btrfs_throttle(fs_info); 3340 } 3341 WARN_ON(nr != cluster->nr); 3342 out: 3343 kfree(ra); 3344 return ret; 3345 } 3346 3347 static noinline_for_stack 3348 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3349 struct file_extent_cluster *cluster) 3350 { 3351 int ret; 3352 3353 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3354 ret = relocate_file_extent_cluster(inode, cluster); 3355 if (ret) 3356 return ret; 3357 cluster->nr = 0; 3358 } 3359 3360 if (!cluster->nr) 3361 cluster->start = extent_key->objectid; 3362 else 3363 BUG_ON(cluster->nr >= MAX_EXTENTS); 3364 cluster->end = extent_key->objectid + extent_key->offset - 1; 3365 cluster->boundary[cluster->nr] = extent_key->objectid; 3366 cluster->nr++; 3367 3368 if (cluster->nr >= MAX_EXTENTS) { 3369 ret = relocate_file_extent_cluster(inode, cluster); 3370 if (ret) 3371 return ret; 3372 cluster->nr = 0; 3373 } 3374 return 0; 3375 } 3376 3377 /* 3378 * helper to add a tree block to the list. 3379 * the major work is getting the generation and level of the block 3380 */ 3381 static int add_tree_block(struct reloc_control *rc, 3382 struct btrfs_key *extent_key, 3383 struct btrfs_path *path, 3384 struct rb_root *blocks) 3385 { 3386 struct extent_buffer *eb; 3387 struct btrfs_extent_item *ei; 3388 struct btrfs_tree_block_info *bi; 3389 struct tree_block *block; 3390 struct rb_node *rb_node; 3391 u32 item_size; 3392 int level = -1; 3393 u64 generation; 3394 3395 eb = path->nodes[0]; 3396 item_size = btrfs_item_size_nr(eb, path->slots[0]); 3397 3398 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3399 item_size >= sizeof(*ei) + sizeof(*bi)) { 3400 ei = btrfs_item_ptr(eb, path->slots[0], 3401 struct btrfs_extent_item); 3402 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3403 bi = (struct btrfs_tree_block_info *)(ei + 1); 3404 level = btrfs_tree_block_level(eb, bi); 3405 } else { 3406 level = (int)extent_key->offset; 3407 } 3408 generation = btrfs_extent_generation(eb, ei); 3409 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3410 btrfs_print_v0_err(eb->fs_info); 3411 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3412 return -EINVAL; 3413 } else { 3414 BUG(); 3415 } 3416 3417 btrfs_release_path(path); 3418 3419 BUG_ON(level == -1); 3420 3421 block = kmalloc(sizeof(*block), GFP_NOFS); 3422 if (!block) 3423 return -ENOMEM; 3424 3425 block->bytenr = extent_key->objectid; 3426 block->key.objectid = rc->extent_root->fs_info->nodesize; 3427 block->key.offset = generation; 3428 block->level = level; 3429 block->key_ready = 0; 3430 3431 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); 3432 if (rb_node) 3433 backref_tree_panic(rb_node, -EEXIST, block->bytenr); 3434 3435 return 0; 3436 } 3437 3438 /* 3439 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3440 */ 3441 static int __add_tree_block(struct reloc_control *rc, 3442 u64 bytenr, u32 blocksize, 3443 struct rb_root *blocks) 3444 { 3445 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3446 struct btrfs_path *path; 3447 struct btrfs_key key; 3448 int ret; 3449 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3450 3451 if (tree_block_processed(bytenr, rc)) 3452 return 0; 3453 3454 if (tree_search(blocks, bytenr)) 3455 return 0; 3456 3457 path = btrfs_alloc_path(); 3458 if (!path) 3459 return -ENOMEM; 3460 again: 3461 key.objectid = bytenr; 3462 if (skinny) { 3463 key.type = BTRFS_METADATA_ITEM_KEY; 3464 key.offset = (u64)-1; 3465 } else { 3466 key.type = BTRFS_EXTENT_ITEM_KEY; 3467 key.offset = blocksize; 3468 } 3469 3470 path->search_commit_root = 1; 3471 path->skip_locking = 1; 3472 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3473 if (ret < 0) 3474 goto out; 3475 3476 if (ret > 0 && skinny) { 3477 if (path->slots[0]) { 3478 path->slots[0]--; 3479 btrfs_item_key_to_cpu(path->nodes[0], &key, 3480 path->slots[0]); 3481 if (key.objectid == bytenr && 3482 (key.type == BTRFS_METADATA_ITEM_KEY || 3483 (key.type == BTRFS_EXTENT_ITEM_KEY && 3484 key.offset == blocksize))) 3485 ret = 0; 3486 } 3487 3488 if (ret) { 3489 skinny = false; 3490 btrfs_release_path(path); 3491 goto again; 3492 } 3493 } 3494 if (ret) { 3495 ASSERT(ret == 1); 3496 btrfs_print_leaf(path->nodes[0]); 3497 btrfs_err(fs_info, 3498 "tree block extent item (%llu) is not found in extent tree", 3499 bytenr); 3500 WARN_ON(1); 3501 ret = -EINVAL; 3502 goto out; 3503 } 3504 3505 ret = add_tree_block(rc, &key, path, blocks); 3506 out: 3507 btrfs_free_path(path); 3508 return ret; 3509 } 3510 3511 /* 3512 * helper to check if the block use full backrefs for pointers in it 3513 */ 3514 static int block_use_full_backref(struct reloc_control *rc, 3515 struct extent_buffer *eb) 3516 { 3517 u64 flags; 3518 int ret; 3519 3520 if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || 3521 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3522 return 1; 3523 3524 ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info, 3525 eb->start, btrfs_header_level(eb), 1, 3526 NULL, &flags); 3527 BUG_ON(ret); 3528 3529 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3530 ret = 1; 3531 else 3532 ret = 0; 3533 return ret; 3534 } 3535 3536 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3537 struct btrfs_block_group_cache *block_group, 3538 struct inode *inode, 3539 u64 ino) 3540 { 3541 struct btrfs_key key; 3542 struct btrfs_root *root = fs_info->tree_root; 3543 struct btrfs_trans_handle *trans; 3544 int ret = 0; 3545 3546 if (inode) 3547 goto truncate; 3548 3549 key.objectid = ino; 3550 key.type = BTRFS_INODE_ITEM_KEY; 3551 key.offset = 0; 3552 3553 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3554 if (IS_ERR(inode)) 3555 return -ENOENT; 3556 3557 truncate: 3558 ret = btrfs_check_trunc_cache_free_space(fs_info, 3559 &fs_info->global_block_rsv); 3560 if (ret) 3561 goto out; 3562 3563 trans = btrfs_join_transaction(root); 3564 if (IS_ERR(trans)) { 3565 ret = PTR_ERR(trans); 3566 goto out; 3567 } 3568 3569 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3570 3571 btrfs_end_transaction(trans); 3572 btrfs_btree_balance_dirty(fs_info); 3573 out: 3574 iput(inode); 3575 return ret; 3576 } 3577 3578 /* 3579 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY 3580 * this function scans fs tree to find blocks reference the data extent 3581 */ 3582 static int find_data_references(struct reloc_control *rc, 3583 struct btrfs_key *extent_key, 3584 struct extent_buffer *leaf, 3585 struct btrfs_extent_data_ref *ref, 3586 struct rb_root *blocks) 3587 { 3588 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3589 struct btrfs_path *path; 3590 struct tree_block *block; 3591 struct btrfs_root *root; 3592 struct btrfs_file_extent_item *fi; 3593 struct rb_node *rb_node; 3594 struct btrfs_key key; 3595 u64 ref_root; 3596 u64 ref_objectid; 3597 u64 ref_offset; 3598 u32 ref_count; 3599 u32 nritems; 3600 int err = 0; 3601 int added = 0; 3602 int counted; 3603 int ret; 3604 3605 ref_root = btrfs_extent_data_ref_root(leaf, ref); 3606 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3607 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3608 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3609 3610 /* 3611 * This is an extent belonging to the free space cache, lets just delete 3612 * it and redo the search. 3613 */ 3614 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3615 ret = delete_block_group_cache(fs_info, rc->block_group, 3616 NULL, ref_objectid); 3617 if (ret != -ENOENT) 3618 return ret; 3619 ret = 0; 3620 } 3621 3622 path = btrfs_alloc_path(); 3623 if (!path) 3624 return -ENOMEM; 3625 path->reada = READA_FORWARD; 3626 3627 root = read_fs_root(fs_info, ref_root); 3628 if (IS_ERR(root)) { 3629 err = PTR_ERR(root); 3630 goto out; 3631 } 3632 3633 key.objectid = ref_objectid; 3634 key.type = BTRFS_EXTENT_DATA_KEY; 3635 if (ref_offset > ((u64)-1 << 32)) 3636 key.offset = 0; 3637 else 3638 key.offset = ref_offset; 3639 3640 path->search_commit_root = 1; 3641 path->skip_locking = 1; 3642 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3643 if (ret < 0) { 3644 err = ret; 3645 goto out; 3646 } 3647 3648 leaf = path->nodes[0]; 3649 nritems = btrfs_header_nritems(leaf); 3650 /* 3651 * the references in tree blocks that use full backrefs 3652 * are not counted in 3653 */ 3654 if (block_use_full_backref(rc, leaf)) 3655 counted = 0; 3656 else 3657 counted = 1; 3658 rb_node = tree_search(blocks, leaf->start); 3659 if (rb_node) { 3660 if (counted) 3661 added = 1; 3662 else 3663 path->slots[0] = nritems; 3664 } 3665 3666 while (ref_count > 0) { 3667 while (path->slots[0] >= nritems) { 3668 ret = btrfs_next_leaf(root, path); 3669 if (ret < 0) { 3670 err = ret; 3671 goto out; 3672 } 3673 if (WARN_ON(ret > 0)) 3674 goto out; 3675 3676 leaf = path->nodes[0]; 3677 nritems = btrfs_header_nritems(leaf); 3678 added = 0; 3679 3680 if (block_use_full_backref(rc, leaf)) 3681 counted = 0; 3682 else 3683 counted = 1; 3684 rb_node = tree_search(blocks, leaf->start); 3685 if (rb_node) { 3686 if (counted) 3687 added = 1; 3688 else 3689 path->slots[0] = nritems; 3690 } 3691 } 3692 3693 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3694 if (WARN_ON(key.objectid != ref_objectid || 3695 key.type != BTRFS_EXTENT_DATA_KEY)) 3696 break; 3697 3698 fi = btrfs_item_ptr(leaf, path->slots[0], 3699 struct btrfs_file_extent_item); 3700 3701 if (btrfs_file_extent_type(leaf, fi) == 3702 BTRFS_FILE_EXTENT_INLINE) 3703 goto next; 3704 3705 if (btrfs_file_extent_disk_bytenr(leaf, fi) != 3706 extent_key->objectid) 3707 goto next; 3708 3709 key.offset -= btrfs_file_extent_offset(leaf, fi); 3710 if (key.offset != ref_offset) 3711 goto next; 3712 3713 if (counted) 3714 ref_count--; 3715 if (added) 3716 goto next; 3717 3718 if (!tree_block_processed(leaf->start, rc)) { 3719 block = kmalloc(sizeof(*block), GFP_NOFS); 3720 if (!block) { 3721 err = -ENOMEM; 3722 break; 3723 } 3724 block->bytenr = leaf->start; 3725 btrfs_item_key_to_cpu(leaf, &block->key, 0); 3726 block->level = 0; 3727 block->key_ready = 1; 3728 rb_node = tree_insert(blocks, block->bytenr, 3729 &block->rb_node); 3730 if (rb_node) 3731 backref_tree_panic(rb_node, -EEXIST, 3732 block->bytenr); 3733 } 3734 if (counted) 3735 added = 1; 3736 else 3737 path->slots[0] = nritems; 3738 next: 3739 path->slots[0]++; 3740 3741 } 3742 out: 3743 btrfs_free_path(path); 3744 return err; 3745 } 3746 3747 /* 3748 * helper to find all tree blocks that reference a given data extent 3749 */ 3750 static noinline_for_stack 3751 int add_data_references(struct reloc_control *rc, 3752 struct btrfs_key *extent_key, 3753 struct btrfs_path *path, 3754 struct rb_root *blocks) 3755 { 3756 struct btrfs_key key; 3757 struct extent_buffer *eb; 3758 struct btrfs_extent_data_ref *dref; 3759 struct btrfs_extent_inline_ref *iref; 3760 unsigned long ptr; 3761 unsigned long end; 3762 u32 blocksize = rc->extent_root->fs_info->nodesize; 3763 int ret = 0; 3764 int err = 0; 3765 3766 eb = path->nodes[0]; 3767 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3768 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3769 ptr += sizeof(struct btrfs_extent_item); 3770 3771 while (ptr < end) { 3772 iref = (struct btrfs_extent_inline_ref *)ptr; 3773 key.type = btrfs_get_extent_inline_ref_type(eb, iref, 3774 BTRFS_REF_TYPE_DATA); 3775 if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 3776 key.offset = btrfs_extent_inline_ref_offset(eb, iref); 3777 ret = __add_tree_block(rc, key.offset, blocksize, 3778 blocks); 3779 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 3780 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 3781 ret = find_data_references(rc, extent_key, 3782 eb, dref, blocks); 3783 } else { 3784 ret = -EUCLEAN; 3785 btrfs_err(rc->extent_root->fs_info, 3786 "extent %llu slot %d has an invalid inline ref type", 3787 eb->start, path->slots[0]); 3788 } 3789 if (ret) { 3790 err = ret; 3791 goto out; 3792 } 3793 ptr += btrfs_extent_inline_ref_size(key.type); 3794 } 3795 WARN_ON(ptr > end); 3796 3797 while (1) { 3798 cond_resched(); 3799 eb = path->nodes[0]; 3800 if (path->slots[0] >= btrfs_header_nritems(eb)) { 3801 ret = btrfs_next_leaf(rc->extent_root, path); 3802 if (ret < 0) { 3803 err = ret; 3804 break; 3805 } 3806 if (ret > 0) 3807 break; 3808 eb = path->nodes[0]; 3809 } 3810 3811 btrfs_item_key_to_cpu(eb, &key, path->slots[0]); 3812 if (key.objectid != extent_key->objectid) 3813 break; 3814 3815 if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 3816 ret = __add_tree_block(rc, key.offset, blocksize, 3817 blocks); 3818 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 3819 dref = btrfs_item_ptr(eb, path->slots[0], 3820 struct btrfs_extent_data_ref); 3821 ret = find_data_references(rc, extent_key, 3822 eb, dref, blocks); 3823 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 3824 btrfs_print_v0_err(eb->fs_info); 3825 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3826 ret = -EINVAL; 3827 } else { 3828 ret = 0; 3829 } 3830 if (ret) { 3831 err = ret; 3832 break; 3833 } 3834 path->slots[0]++; 3835 } 3836 out: 3837 btrfs_release_path(path); 3838 if (err) 3839 free_block_list(blocks); 3840 return err; 3841 } 3842 3843 /* 3844 * helper to find next unprocessed extent 3845 */ 3846 static noinline_for_stack 3847 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3848 struct btrfs_key *extent_key) 3849 { 3850 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3851 struct btrfs_key key; 3852 struct extent_buffer *leaf; 3853 u64 start, end, last; 3854 int ret; 3855 3856 last = rc->block_group->key.objectid + rc->block_group->key.offset; 3857 while (1) { 3858 cond_resched(); 3859 if (rc->search_start >= last) { 3860 ret = 1; 3861 break; 3862 } 3863 3864 key.objectid = rc->search_start; 3865 key.type = BTRFS_EXTENT_ITEM_KEY; 3866 key.offset = 0; 3867 3868 path->search_commit_root = 1; 3869 path->skip_locking = 1; 3870 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3871 0, 0); 3872 if (ret < 0) 3873 break; 3874 next: 3875 leaf = path->nodes[0]; 3876 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3877 ret = btrfs_next_leaf(rc->extent_root, path); 3878 if (ret != 0) 3879 break; 3880 leaf = path->nodes[0]; 3881 } 3882 3883 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3884 if (key.objectid >= last) { 3885 ret = 1; 3886 break; 3887 } 3888 3889 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3890 key.type != BTRFS_METADATA_ITEM_KEY) { 3891 path->slots[0]++; 3892 goto next; 3893 } 3894 3895 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3896 key.objectid + key.offset <= rc->search_start) { 3897 path->slots[0]++; 3898 goto next; 3899 } 3900 3901 if (key.type == BTRFS_METADATA_ITEM_KEY && 3902 key.objectid + fs_info->nodesize <= 3903 rc->search_start) { 3904 path->slots[0]++; 3905 goto next; 3906 } 3907 3908 ret = find_first_extent_bit(&rc->processed_blocks, 3909 key.objectid, &start, &end, 3910 EXTENT_DIRTY, NULL); 3911 3912 if (ret == 0 && start <= key.objectid) { 3913 btrfs_release_path(path); 3914 rc->search_start = end + 1; 3915 } else { 3916 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3917 rc->search_start = key.objectid + key.offset; 3918 else 3919 rc->search_start = key.objectid + 3920 fs_info->nodesize; 3921 memcpy(extent_key, &key, sizeof(key)); 3922 return 0; 3923 } 3924 } 3925 btrfs_release_path(path); 3926 return ret; 3927 } 3928 3929 static void set_reloc_control(struct reloc_control *rc) 3930 { 3931 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3932 3933 mutex_lock(&fs_info->reloc_mutex); 3934 fs_info->reloc_ctl = rc; 3935 mutex_unlock(&fs_info->reloc_mutex); 3936 } 3937 3938 static void unset_reloc_control(struct reloc_control *rc) 3939 { 3940 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3941 3942 mutex_lock(&fs_info->reloc_mutex); 3943 fs_info->reloc_ctl = NULL; 3944 mutex_unlock(&fs_info->reloc_mutex); 3945 } 3946 3947 static int check_extent_flags(u64 flags) 3948 { 3949 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3950 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3951 return 1; 3952 if (!(flags & BTRFS_EXTENT_FLAG_DATA) && 3953 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3954 return 1; 3955 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3956 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 3957 return 1; 3958 return 0; 3959 } 3960 3961 static noinline_for_stack 3962 int prepare_to_relocate(struct reloc_control *rc) 3963 { 3964 struct btrfs_trans_handle *trans; 3965 int ret; 3966 3967 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3968 BTRFS_BLOCK_RSV_TEMP); 3969 if (!rc->block_rsv) 3970 return -ENOMEM; 3971 3972 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3973 rc->search_start = rc->block_group->key.objectid; 3974 rc->extents_found = 0; 3975 rc->nodes_relocated = 0; 3976 rc->merging_rsv_size = 0; 3977 rc->reserved_bytes = 0; 3978 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3979 RELOCATION_RESERVED_NODES; 3980 ret = btrfs_block_rsv_refill(rc->extent_root, 3981 rc->block_rsv, rc->block_rsv->size, 3982 BTRFS_RESERVE_FLUSH_ALL); 3983 if (ret) 3984 return ret; 3985 3986 rc->create_reloc_tree = 1; 3987 set_reloc_control(rc); 3988 3989 trans = btrfs_join_transaction(rc->extent_root); 3990 if (IS_ERR(trans)) { 3991 unset_reloc_control(rc); 3992 /* 3993 * extent tree is not a ref_cow tree and has no reloc_root to 3994 * cleanup. And callers are responsible to free the above 3995 * block rsv. 3996 */ 3997 return PTR_ERR(trans); 3998 } 3999 btrfs_commit_transaction(trans); 4000 return 0; 4001 } 4002 4003 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 4004 { 4005 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 4006 struct rb_root blocks = RB_ROOT; 4007 struct btrfs_key key; 4008 struct btrfs_trans_handle *trans = NULL; 4009 struct btrfs_path *path; 4010 struct btrfs_extent_item *ei; 4011 u64 flags; 4012 u32 item_size; 4013 int ret; 4014 int err = 0; 4015 int progress = 0; 4016 4017 path = btrfs_alloc_path(); 4018 if (!path) 4019 return -ENOMEM; 4020 path->reada = READA_FORWARD; 4021 4022 ret = prepare_to_relocate(rc); 4023 if (ret) { 4024 err = ret; 4025 goto out_free; 4026 } 4027 4028 while (1) { 4029 rc->reserved_bytes = 0; 4030 ret = btrfs_block_rsv_refill(rc->extent_root, 4031 rc->block_rsv, rc->block_rsv->size, 4032 BTRFS_RESERVE_FLUSH_ALL); 4033 if (ret) { 4034 err = ret; 4035 break; 4036 } 4037 progress++; 4038 trans = btrfs_start_transaction(rc->extent_root, 0); 4039 if (IS_ERR(trans)) { 4040 err = PTR_ERR(trans); 4041 trans = NULL; 4042 break; 4043 } 4044 restart: 4045 if (update_backref_cache(trans, &rc->backref_cache)) { 4046 btrfs_end_transaction(trans); 4047 trans = NULL; 4048 continue; 4049 } 4050 4051 ret = find_next_extent(rc, path, &key); 4052 if (ret < 0) 4053 err = ret; 4054 if (ret != 0) 4055 break; 4056 4057 rc->extents_found++; 4058 4059 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 4060 struct btrfs_extent_item); 4061 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 4062 if (item_size >= sizeof(*ei)) { 4063 flags = btrfs_extent_flags(path->nodes[0], ei); 4064 ret = check_extent_flags(flags); 4065 BUG_ON(ret); 4066 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 4067 err = -EINVAL; 4068 btrfs_print_v0_err(trans->fs_info); 4069 btrfs_abort_transaction(trans, err); 4070 break; 4071 } else { 4072 BUG(); 4073 } 4074 4075 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 4076 ret = add_tree_block(rc, &key, path, &blocks); 4077 } else if (rc->stage == UPDATE_DATA_PTRS && 4078 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4079 ret = add_data_references(rc, &key, path, &blocks); 4080 } else { 4081 btrfs_release_path(path); 4082 ret = 0; 4083 } 4084 if (ret < 0) { 4085 err = ret; 4086 break; 4087 } 4088 4089 if (!RB_EMPTY_ROOT(&blocks)) { 4090 ret = relocate_tree_blocks(trans, rc, &blocks); 4091 if (ret < 0) { 4092 /* 4093 * if we fail to relocate tree blocks, force to update 4094 * backref cache when committing transaction. 4095 */ 4096 rc->backref_cache.last_trans = trans->transid - 1; 4097 4098 if (ret != -EAGAIN) { 4099 err = ret; 4100 break; 4101 } 4102 rc->extents_found--; 4103 rc->search_start = key.objectid; 4104 } 4105 } 4106 4107 btrfs_end_transaction_throttle(trans); 4108 btrfs_btree_balance_dirty(fs_info); 4109 trans = NULL; 4110 4111 if (rc->stage == MOVE_DATA_EXTENTS && 4112 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4113 rc->found_file_extent = 1; 4114 ret = relocate_data_extent(rc->data_inode, 4115 &key, &rc->cluster); 4116 if (ret < 0) { 4117 err = ret; 4118 break; 4119 } 4120 } 4121 } 4122 if (trans && progress && err == -ENOSPC) { 4123 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 4124 if (ret == 1) { 4125 err = 0; 4126 progress = 0; 4127 goto restart; 4128 } 4129 } 4130 4131 btrfs_release_path(path); 4132 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 4133 4134 if (trans) { 4135 btrfs_end_transaction_throttle(trans); 4136 btrfs_btree_balance_dirty(fs_info); 4137 } 4138 4139 if (!err) { 4140 ret = relocate_file_extent_cluster(rc->data_inode, 4141 &rc->cluster); 4142 if (ret < 0) 4143 err = ret; 4144 } 4145 4146 rc->create_reloc_tree = 0; 4147 set_reloc_control(rc); 4148 4149 backref_cache_cleanup(&rc->backref_cache); 4150 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); 4151 4152 err = prepare_to_merge(rc, err); 4153 4154 merge_reloc_roots(rc); 4155 4156 rc->merge_reloc_tree = 0; 4157 unset_reloc_control(rc); 4158 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); 4159 4160 /* get rid of pinned extents */ 4161 trans = btrfs_join_transaction(rc->extent_root); 4162 if (IS_ERR(trans)) { 4163 err = PTR_ERR(trans); 4164 goto out_free; 4165 } 4166 btrfs_commit_transaction(trans); 4167 ret = clean_dirty_subvols(rc); 4168 if (ret < 0 && !err) 4169 err = ret; 4170 out_free: 4171 btrfs_free_block_rsv(fs_info, rc->block_rsv); 4172 btrfs_free_path(path); 4173 return err; 4174 } 4175 4176 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 4177 struct btrfs_root *root, u64 objectid) 4178 { 4179 struct btrfs_path *path; 4180 struct btrfs_inode_item *item; 4181 struct extent_buffer *leaf; 4182 int ret; 4183 4184 path = btrfs_alloc_path(); 4185 if (!path) 4186 return -ENOMEM; 4187 4188 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 4189 if (ret) 4190 goto out; 4191 4192 leaf = path->nodes[0]; 4193 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 4194 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 4195 btrfs_set_inode_generation(leaf, item, 1); 4196 btrfs_set_inode_size(leaf, item, 0); 4197 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 4198 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4199 BTRFS_INODE_PREALLOC); 4200 btrfs_mark_buffer_dirty(leaf); 4201 out: 4202 btrfs_free_path(path); 4203 return ret; 4204 } 4205 4206 /* 4207 * helper to create inode for data relocation. 4208 * the inode is in data relocation tree and its link count is 0 4209 */ 4210 static noinline_for_stack 4211 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 4212 struct btrfs_block_group_cache *group) 4213 { 4214 struct inode *inode = NULL; 4215 struct btrfs_trans_handle *trans; 4216 struct btrfs_root *root; 4217 struct btrfs_key key; 4218 u64 objectid; 4219 int err = 0; 4220 4221 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); 4222 if (IS_ERR(root)) 4223 return ERR_CAST(root); 4224 4225 trans = btrfs_start_transaction(root, 6); 4226 if (IS_ERR(trans)) 4227 return ERR_CAST(trans); 4228 4229 err = btrfs_find_free_objectid(root, &objectid); 4230 if (err) 4231 goto out; 4232 4233 err = __insert_orphan_inode(trans, root, objectid); 4234 BUG_ON(err); 4235 4236 key.objectid = objectid; 4237 key.type = BTRFS_INODE_ITEM_KEY; 4238 key.offset = 0; 4239 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 4240 BUG_ON(IS_ERR(inode)); 4241 BTRFS_I(inode)->index_cnt = group->key.objectid; 4242 4243 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4244 out: 4245 btrfs_end_transaction(trans); 4246 btrfs_btree_balance_dirty(fs_info); 4247 if (err) { 4248 if (inode) 4249 iput(inode); 4250 inode = ERR_PTR(err); 4251 } 4252 return inode; 4253 } 4254 4255 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 4256 { 4257 struct reloc_control *rc; 4258 4259 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4260 if (!rc) 4261 return NULL; 4262 4263 INIT_LIST_HEAD(&rc->reloc_roots); 4264 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 4265 backref_cache_init(&rc->backref_cache); 4266 mapping_tree_init(&rc->reloc_root_tree); 4267 extent_io_tree_init(fs_info, &rc->processed_blocks, 4268 IO_TREE_RELOC_BLOCKS, NULL); 4269 return rc; 4270 } 4271 4272 /* 4273 * Print the block group being relocated 4274 */ 4275 static void describe_relocation(struct btrfs_fs_info *fs_info, 4276 struct btrfs_block_group_cache *block_group) 4277 { 4278 char buf[128] = {'\0'}; 4279 4280 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 4281 4282 btrfs_info(fs_info, 4283 "relocating block group %llu flags %s", 4284 block_group->key.objectid, buf); 4285 } 4286 4287 /* 4288 * function to relocate all extents in a block group. 4289 */ 4290 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 4291 { 4292 struct btrfs_block_group_cache *bg; 4293 struct btrfs_root *extent_root = fs_info->extent_root; 4294 struct reloc_control *rc; 4295 struct inode *inode; 4296 struct btrfs_path *path; 4297 int ret; 4298 int rw = 0; 4299 int err = 0; 4300 4301 bg = btrfs_lookup_block_group(fs_info, group_start); 4302 if (!bg) 4303 return -ENOENT; 4304 4305 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 4306 btrfs_put_block_group(bg); 4307 return -ETXTBSY; 4308 } 4309 4310 rc = alloc_reloc_control(fs_info); 4311 if (!rc) { 4312 btrfs_put_block_group(bg); 4313 return -ENOMEM; 4314 } 4315 4316 rc->extent_root = extent_root; 4317 rc->block_group = bg; 4318 4319 ret = btrfs_inc_block_group_ro(rc->block_group); 4320 if (ret) { 4321 err = ret; 4322 goto out; 4323 } 4324 rw = 1; 4325 4326 path = btrfs_alloc_path(); 4327 if (!path) { 4328 err = -ENOMEM; 4329 goto out; 4330 } 4331 4332 inode = lookup_free_space_inode(rc->block_group, path); 4333 btrfs_free_path(path); 4334 4335 if (!IS_ERR(inode)) 4336 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4337 else 4338 ret = PTR_ERR(inode); 4339 4340 if (ret && ret != -ENOENT) { 4341 err = ret; 4342 goto out; 4343 } 4344 4345 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4346 if (IS_ERR(rc->data_inode)) { 4347 err = PTR_ERR(rc->data_inode); 4348 rc->data_inode = NULL; 4349 goto out; 4350 } 4351 4352 describe_relocation(fs_info, rc->block_group); 4353 4354 btrfs_wait_block_group_reservations(rc->block_group); 4355 btrfs_wait_nocow_writers(rc->block_group); 4356 btrfs_wait_ordered_roots(fs_info, U64_MAX, 4357 rc->block_group->key.objectid, 4358 rc->block_group->key.offset); 4359 4360 while (1) { 4361 mutex_lock(&fs_info->cleaner_mutex); 4362 ret = relocate_block_group(rc); 4363 mutex_unlock(&fs_info->cleaner_mutex); 4364 if (ret < 0) 4365 err = ret; 4366 4367 /* 4368 * We may have gotten ENOSPC after we already dirtied some 4369 * extents. If writeout happens while we're relocating a 4370 * different block group we could end up hitting the 4371 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 4372 * btrfs_reloc_cow_block. Make sure we write everything out 4373 * properly so we don't trip over this problem, and then break 4374 * out of the loop if we hit an error. 4375 */ 4376 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 4377 ret = btrfs_wait_ordered_range(rc->data_inode, 0, 4378 (u64)-1); 4379 if (ret) 4380 err = ret; 4381 invalidate_mapping_pages(rc->data_inode->i_mapping, 4382 0, -1); 4383 rc->stage = UPDATE_DATA_PTRS; 4384 } 4385 4386 if (err < 0) 4387 goto out; 4388 4389 if (rc->extents_found == 0) 4390 break; 4391 4392 btrfs_info(fs_info, "found %llu extents", rc->extents_found); 4393 4394 } 4395 4396 WARN_ON(rc->block_group->pinned > 0); 4397 WARN_ON(rc->block_group->reserved > 0); 4398 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 4399 out: 4400 if (err && rw) 4401 btrfs_dec_block_group_ro(rc->block_group); 4402 iput(rc->data_inode); 4403 btrfs_put_block_group(rc->block_group); 4404 kfree(rc); 4405 return err; 4406 } 4407 4408 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) 4409 { 4410 struct btrfs_fs_info *fs_info = root->fs_info; 4411 struct btrfs_trans_handle *trans; 4412 int ret, err; 4413 4414 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4415 if (IS_ERR(trans)) 4416 return PTR_ERR(trans); 4417 4418 memset(&root->root_item.drop_progress, 0, 4419 sizeof(root->root_item.drop_progress)); 4420 root->root_item.drop_level = 0; 4421 btrfs_set_root_refs(&root->root_item, 0); 4422 ret = btrfs_update_root(trans, fs_info->tree_root, 4423 &root->root_key, &root->root_item); 4424 4425 err = btrfs_end_transaction(trans); 4426 if (err) 4427 return err; 4428 return ret; 4429 } 4430 4431 /* 4432 * recover relocation interrupted by system crash. 4433 * 4434 * this function resumes merging reloc trees with corresponding fs trees. 4435 * this is important for keeping the sharing of tree blocks 4436 */ 4437 int btrfs_recover_relocation(struct btrfs_root *root) 4438 { 4439 struct btrfs_fs_info *fs_info = root->fs_info; 4440 LIST_HEAD(reloc_roots); 4441 struct btrfs_key key; 4442 struct btrfs_root *fs_root; 4443 struct btrfs_root *reloc_root; 4444 struct btrfs_path *path; 4445 struct extent_buffer *leaf; 4446 struct reloc_control *rc = NULL; 4447 struct btrfs_trans_handle *trans; 4448 int ret; 4449 int err = 0; 4450 4451 path = btrfs_alloc_path(); 4452 if (!path) 4453 return -ENOMEM; 4454 path->reada = READA_BACK; 4455 4456 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 4457 key.type = BTRFS_ROOT_ITEM_KEY; 4458 key.offset = (u64)-1; 4459 4460 while (1) { 4461 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, 4462 path, 0, 0); 4463 if (ret < 0) { 4464 err = ret; 4465 goto out; 4466 } 4467 if (ret > 0) { 4468 if (path->slots[0] == 0) 4469 break; 4470 path->slots[0]--; 4471 } 4472 leaf = path->nodes[0]; 4473 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4474 btrfs_release_path(path); 4475 4476 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || 4477 key.type != BTRFS_ROOT_ITEM_KEY) 4478 break; 4479 4480 reloc_root = btrfs_read_fs_root(root, &key); 4481 if (IS_ERR(reloc_root)) { 4482 err = PTR_ERR(reloc_root); 4483 goto out; 4484 } 4485 4486 list_add(&reloc_root->root_list, &reloc_roots); 4487 4488 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 4489 fs_root = read_fs_root(fs_info, 4490 reloc_root->root_key.offset); 4491 if (IS_ERR(fs_root)) { 4492 ret = PTR_ERR(fs_root); 4493 if (ret != -ENOENT) { 4494 err = ret; 4495 goto out; 4496 } 4497 ret = mark_garbage_root(reloc_root); 4498 if (ret < 0) { 4499 err = ret; 4500 goto out; 4501 } 4502 } 4503 } 4504 4505 if (key.offset == 0) 4506 break; 4507 4508 key.offset--; 4509 } 4510 btrfs_release_path(path); 4511 4512 if (list_empty(&reloc_roots)) 4513 goto out; 4514 4515 rc = alloc_reloc_control(fs_info); 4516 if (!rc) { 4517 err = -ENOMEM; 4518 goto out; 4519 } 4520 4521 rc->extent_root = fs_info->extent_root; 4522 4523 set_reloc_control(rc); 4524 4525 trans = btrfs_join_transaction(rc->extent_root); 4526 if (IS_ERR(trans)) { 4527 unset_reloc_control(rc); 4528 err = PTR_ERR(trans); 4529 goto out_free; 4530 } 4531 4532 rc->merge_reloc_tree = 1; 4533 4534 while (!list_empty(&reloc_roots)) { 4535 reloc_root = list_entry(reloc_roots.next, 4536 struct btrfs_root, root_list); 4537 list_del(&reloc_root->root_list); 4538 4539 if (btrfs_root_refs(&reloc_root->root_item) == 0) { 4540 list_add_tail(&reloc_root->root_list, 4541 &rc->reloc_roots); 4542 continue; 4543 } 4544 4545 fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); 4546 if (IS_ERR(fs_root)) { 4547 err = PTR_ERR(fs_root); 4548 goto out_free; 4549 } 4550 4551 err = __add_reloc_root(reloc_root); 4552 BUG_ON(err < 0); /* -ENOMEM or logic error */ 4553 fs_root->reloc_root = reloc_root; 4554 } 4555 4556 err = btrfs_commit_transaction(trans); 4557 if (err) 4558 goto out_free; 4559 4560 merge_reloc_roots(rc); 4561 4562 unset_reloc_control(rc); 4563 4564 trans = btrfs_join_transaction(rc->extent_root); 4565 if (IS_ERR(trans)) { 4566 err = PTR_ERR(trans); 4567 goto out_free; 4568 } 4569 err = btrfs_commit_transaction(trans); 4570 4571 ret = clean_dirty_subvols(rc); 4572 if (ret < 0 && !err) 4573 err = ret; 4574 out_free: 4575 kfree(rc); 4576 out: 4577 if (!list_empty(&reloc_roots)) 4578 free_reloc_roots(&reloc_roots); 4579 4580 btrfs_free_path(path); 4581 4582 if (err == 0) { 4583 /* cleanup orphan inode in data relocation tree */ 4584 fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); 4585 if (IS_ERR(fs_root)) 4586 err = PTR_ERR(fs_root); 4587 else 4588 err = btrfs_orphan_cleanup(fs_root); 4589 } 4590 return err; 4591 } 4592 4593 /* 4594 * helper to add ordered checksum for data relocation. 4595 * 4596 * cloning checksum properly handles the nodatasum extents. 4597 * it also saves CPU time to re-calculate the checksum. 4598 */ 4599 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 4600 { 4601 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4602 struct btrfs_ordered_sum *sums; 4603 struct btrfs_ordered_extent *ordered; 4604 int ret; 4605 u64 disk_bytenr; 4606 u64 new_bytenr; 4607 LIST_HEAD(list); 4608 4609 ordered = btrfs_lookup_ordered_extent(inode, file_pos); 4610 BUG_ON(ordered->file_offset != file_pos || ordered->len != len); 4611 4612 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; 4613 ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, 4614 disk_bytenr + len - 1, &list, 0); 4615 if (ret) 4616 goto out; 4617 4618 while (!list_empty(&list)) { 4619 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4620 list_del_init(&sums->list); 4621 4622 /* 4623 * We need to offset the new_bytenr based on where the csum is. 4624 * We need to do this because we will read in entire prealloc 4625 * extents but we may have written to say the middle of the 4626 * prealloc extent, so we need to make sure the csum goes with 4627 * the right disk offset. 4628 * 4629 * We can do this because the data reloc inode refers strictly 4630 * to the on disk bytes, so we don't have to worry about 4631 * disk_len vs real len like with real inodes since it's all 4632 * disk length. 4633 */ 4634 new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); 4635 sums->bytenr = new_bytenr; 4636 4637 btrfs_add_ordered_sum(ordered, sums); 4638 } 4639 out: 4640 btrfs_put_ordered_extent(ordered); 4641 return ret; 4642 } 4643 4644 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4645 struct btrfs_root *root, struct extent_buffer *buf, 4646 struct extent_buffer *cow) 4647 { 4648 struct btrfs_fs_info *fs_info = root->fs_info; 4649 struct reloc_control *rc; 4650 struct backref_node *node; 4651 int first_cow = 0; 4652 int level; 4653 int ret = 0; 4654 4655 rc = fs_info->reloc_ctl; 4656 if (!rc) 4657 return 0; 4658 4659 BUG_ON(rc->stage == UPDATE_DATA_PTRS && 4660 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); 4661 4662 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 4663 if (buf == root->node) 4664 __update_reloc_root(root, cow->start); 4665 } 4666 4667 level = btrfs_header_level(buf); 4668 if (btrfs_header_generation(buf) <= 4669 btrfs_root_last_snapshot(&root->root_item)) 4670 first_cow = 1; 4671 4672 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && 4673 rc->create_reloc_tree) { 4674 WARN_ON(!first_cow && level == 0); 4675 4676 node = rc->backref_cache.path[level]; 4677 BUG_ON(node->bytenr != buf->start && 4678 node->new_bytenr != buf->start); 4679 4680 drop_node_buffer(node); 4681 extent_buffer_get(cow); 4682 node->eb = cow; 4683 node->new_bytenr = cow->start; 4684 4685 if (!node->pending) { 4686 list_move_tail(&node->list, 4687 &rc->backref_cache.pending[level]); 4688 node->pending = 1; 4689 } 4690 4691 if (first_cow) 4692 __mark_block_processed(rc, node); 4693 4694 if (first_cow && level > 0) 4695 rc->nodes_relocated += buf->len; 4696 } 4697 4698 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) 4699 ret = replace_file_extents(trans, rc, root, cow); 4700 return ret; 4701 } 4702 4703 /* 4704 * called before creating snapshot. it calculates metadata reservation 4705 * required for relocating tree blocks in the snapshot 4706 */ 4707 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, 4708 u64 *bytes_to_reserve) 4709 { 4710 struct btrfs_root *root = pending->root; 4711 struct reloc_control *rc = root->fs_info->reloc_ctl; 4712 4713 if (!root->reloc_root || !rc) 4714 return; 4715 4716 if (!rc->merge_reloc_tree) 4717 return; 4718 4719 root = root->reloc_root; 4720 BUG_ON(btrfs_root_refs(&root->root_item) == 0); 4721 /* 4722 * relocation is in the stage of merging trees. the space 4723 * used by merging a reloc tree is twice the size of 4724 * relocated tree nodes in the worst case. half for cowing 4725 * the reloc tree, half for cowing the fs tree. the space 4726 * used by cowing the reloc tree will be freed after the 4727 * tree is dropped. if we create snapshot, cowing the fs 4728 * tree may use more space than it frees. so we need 4729 * reserve extra space. 4730 */ 4731 *bytes_to_reserve += rc->nodes_relocated; 4732 } 4733 4734 /* 4735 * called after snapshot is created. migrate block reservation 4736 * and create reloc root for the newly created snapshot 4737 */ 4738 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, 4739 struct btrfs_pending_snapshot *pending) 4740 { 4741 struct btrfs_root *root = pending->root; 4742 struct btrfs_root *reloc_root; 4743 struct btrfs_root *new_root; 4744 struct reloc_control *rc = root->fs_info->reloc_ctl; 4745 int ret; 4746 4747 if (!root->reloc_root || !rc) 4748 return 0; 4749 4750 rc = root->fs_info->reloc_ctl; 4751 rc->merging_rsv_size += rc->nodes_relocated; 4752 4753 if (rc->merge_reloc_tree) { 4754 ret = btrfs_block_rsv_migrate(&pending->block_rsv, 4755 rc->block_rsv, 4756 rc->nodes_relocated, true); 4757 if (ret) 4758 return ret; 4759 } 4760 4761 new_root = pending->snap; 4762 reloc_root = create_reloc_root(trans, root->reloc_root, 4763 new_root->root_key.objectid); 4764 if (IS_ERR(reloc_root)) 4765 return PTR_ERR(reloc_root); 4766 4767 ret = __add_reloc_root(reloc_root); 4768 BUG_ON(ret < 0); 4769 new_root->reloc_root = reloc_root; 4770 4771 if (rc->create_reloc_tree) 4772 ret = clone_backref_node(trans, rc, root, reloc_root); 4773 return ret; 4774 } 4775