// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"

/*
 * backref_node, mapping_node and tree_block start with this
 */
struct tree_entry {
	struct rb_node rb_node;
	u64 bytenr;
};

/*
 * represents a tree block in the backref cache
 */
struct backref_node {
	struct rb_node rb_node;
	u64 bytenr;

	u64 new_bytenr;
	/* objectid of tree block owner, may be not up to date */
	u64 owner;
	/* link to pending, changed or detached list */
	struct list_head list;
	/* list of upper level blocks that reference this block */
	struct list_head upper;
	/* list of child blocks in the cache */
	struct list_head lower;
	/* NULL if this node is not a tree root */
	struct btrfs_root *root;
	/* extent buffer got by COWing the block */
	struct extent_buffer *eb;
	/* level of the tree block */
	unsigned int level:8;
	/* is the block in a non-reference counted tree */
	unsigned int cowonly:1;
	/* 1 if no child node is in the cache */
	unsigned int lowest:1;
	/* is the extent buffer locked */
	unsigned int locked:1;
	/* has the block been processed */
	unsigned int processed:1;
	/* have backrefs of this block been checked */
	unsigned int checked:1;
	/*
	 * 1 if the corresponding block has been COWed but some upper
	 * level block pointers may not point to the new location
	 */
	unsigned int pending:1;
	/*
	 * 1 if the backref node isn't connected to any other
	 * backref node.
	 */
	unsigned int detached:1;
};

/*
 * represents a block pointer in the backref cache
 */
struct backref_edge {
	struct list_head list[2];
	struct backref_node *node[2];
};

#define LOWER	0
#define UPPER	1
#define RELOCATION_RESERVED_NODES	256

struct backref_cache {
	/* red black tree of all backref nodes in the cache */
	struct rb_root rb_root;
	/* for passing backref nodes to btrfs_reloc_cow_block */
	struct backref_node *path[BTRFS_MAX_LEVEL];
	/*
	 * list of blocks that have been COWed but some block
	 * pointers in upper level blocks may not reflect the
	 * new location
	 */
	struct list_head pending[BTRFS_MAX_LEVEL];
	/* list of backref nodes with no child node */
	struct list_head leaves;
	/* list of blocks that have been COWed in current transaction */
	struct list_head changed;
	/* list of detached backref nodes */
	struct list_head detached;

	u64 last_trans;

	int nr_nodes;
	int nr_edges;
};
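/*
 * Typical backref cache lifecycle, as implemented by the helpers below:
 * backref_cache_init() sets up the rb-tree and the lists,
 * build_backref_tree() populates the cache while a block group is being
 * relocated, update_backref_cache() re-keys nodes after each transaction
 * commit, and backref_cache_cleanup() asserts that every node and edge
 * has been accounted for.
 */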
/*
 * maps address of tree root to tree
 */
struct mapping_node {
	struct rb_node rb_node;
	u64 bytenr;
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * represents a tree block to process
 */
struct tree_block {
	struct rb_node rb_node;
	u64 bytenr;
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group_cache *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks that have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1

static void remove_backref_node(struct backref_cache *cache,
				struct backref_node *node);
static void __mark_block_processed(struct reloc_control *rc,
				   struct backref_node *node);

static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

static void backref_cache_init(struct backref_cache *cache)
{
	int i;
	cache->rb_root = RB_ROOT;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		INIT_LIST_HEAD(&cache->pending[i]);
	INIT_LIST_HEAD(&cache->changed);
	INIT_LIST_HEAD(&cache->detached);
	INIT_LIST_HEAD(&cache->leaves);
}

static void backref_cache_cleanup(struct backref_cache *cache)
{
	struct backref_node *node;
	int i;

	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->leaves)) {
		node = list_entry(cache->leaves.next,
				  struct backref_node, lower);
		remove_backref_node(cache, node);
	}

	cache->last_trans = 0;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		ASSERT(list_empty(&cache->pending[i]));
	ASSERT(list_empty(&cache->changed));
	ASSERT(list_empty(&cache->detached));
	ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
	ASSERT(!cache->nr_nodes);
	ASSERT(!cache->nr_edges);
}

static struct backref_node *alloc_backref_node(struct backref_cache *cache)
{
	struct backref_node *node;

	node = kzalloc(sizeof(*node), GFP_NOFS);
	if (node) {
		INIT_LIST_HEAD(&node->list);
		INIT_LIST_HEAD(&node->upper);
		INIT_LIST_HEAD(&node->lower);
		RB_CLEAR_NODE(&node->rb_node);
		cache->nr_nodes++;
	}
	return node;
}

static void free_backref_node(struct backref_cache *cache,
			      struct backref_node *node)
{
	if (node) {
		cache->nr_nodes--;
		kfree(node);
	}
}

static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
{
	struct backref_edge *edge;

	edge = kzalloc(sizeof(*edge), GFP_NOFS);
	if (edge)
		cache->nr_edges++;
	return edge;
}

static void free_backref_edge(struct backref_cache *cache,
			      struct backref_edge *edge)
{
	if (edge) {
		cache->nr_edges--;
		kfree(edge);
	}
}
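/*
 * Generic rb-tree helpers for the structures above: they only assume that
 * the embedded rb_node is immediately followed by a u64 bytenr (the
 * tree_entry layout), so the same two functions serve backref nodes,
 * mapping nodes and tree blocks.  On a duplicate bytenr, tree_insert()
 * returns the conflicting node instead of failing; callers turn that into
 * a panic via backref_tree_panic().
 */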
static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (bytenr < entry->bytenr)
			p = &(*p)->rb_left;
		else if (bytenr > entry->bytenr)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
{
	struct rb_node *n = root->rb_node;
	struct tree_entry *entry;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return n;
	}
	return NULL;
}

static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = NULL;
	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
					      rb_node);
	if (bnode->root)
		fs_info = bnode->root->fs_info;
	btrfs_panic(fs_info, errno,
		    "Inconsistency in backref cache found at offset %llu",
		    bytenr);
}

/*
 * walk up backref nodes until we reach the node that represents a tree root
 */
static struct backref_node *walk_up_backref(struct backref_node *node,
					    struct backref_edge *edges[],
					    int *index)
{
	struct backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct backref_node *walk_down_backref(struct backref_edge *edges[],
					      int *index)
{
	struct backref_edge *edge;
	struct backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
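/*
 * Together, walk_up_backref() and walk_down_backref() enumerate every
 * root-to-block reference path in the cache: walking up always follows
 * the first upper edge of each node, while walking down backtracks to the
 * deepest node that still has an unvisited sibling edge.  Repeated
 * up/down calls therefore amount to a depth-first traversal of the
 * backref graph.
 */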
static void unlock_node_buffer(struct backref_node *node)
{
	if (node->locked) {
		btrfs_tree_unlock(node->eb);
		node->locked = 0;
	}
}

static void drop_node_buffer(struct backref_node *node)
{
	if (node->eb) {
		unlock_node_buffer(node);
		free_extent_buffer(node->eb);
		node->eb = NULL;
	}
}

static void drop_backref_node(struct backref_cache *tree,
			      struct backref_node *node)
{
	BUG_ON(!list_empty(&node->upper));

	drop_node_buffer(node);
	list_del(&node->list);
	list_del(&node->lower);
	if (!RB_EMPTY_NODE(&node->rb_node))
		rb_erase(&node->rb_node, &tree->rb_root);
	free_backref_node(tree, node);
}

/*
 * remove a backref node from the backref cache
 */
static void remove_backref_node(struct backref_cache *cache,
				struct backref_node *node)
{
	struct backref_node *upper;
	struct backref_edge *edge;

	if (!node)
		return;

	BUG_ON(!node->lowest && !node->detached);
	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next, struct backref_edge,
				  list[LOWER]);
		upper = edge->node[UPPER];
		list_del(&edge->list[LOWER]);
		list_del(&edge->list[UPPER]);
		free_backref_edge(cache, edge);

		if (RB_EMPTY_NODE(&upper->rb_node)) {
			BUG_ON(!list_empty(&node->upper));
			drop_backref_node(cache, node);
			node = upper;
			node->lowest = 1;
			continue;
		}
		/*
		 * add the node to the leaf node list if no other
		 * child block is cached.
		 */
		if (list_empty(&upper->lower)) {
			list_add_tail(&upper->lower, &cache->leaves);
			upper->lowest = 1;
		}
	}

	drop_backref_node(cache, node);
}

static void update_backref_node(struct backref_cache *cache,
				struct backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;
	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, bytenr);
}
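/*
 * Backref nodes are keyed by the on-disk bytenr of the block they
 * describe, so once a block has been COWed to a new location, the cached
 * node must be re-inserted under new_bytenr.  update_backref_cache()
 * below does that for all changed and pending nodes after a transaction
 * commit.
 */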
/*
 * update the backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct backref_cache *cache)
{
	struct backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * detached nodes are used to avoid unnecessary backref
	 * lookup. transaction commit changes the extent tree,
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct backref_node, list);
		remove_backref_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * some nodes can be left in the pending list if there were
	 * errors during processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static int should_ignore_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		return 0;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
	    root->fs_info->running_transaction->transid - 1)
		return 0;
	/*
	 * if there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return 1;
}

/*
 * find reloc tree by address of tree root
 */
static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
					  u64 bytenr)
{
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = (struct btrfs_root *)node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return root;
}

static int is_cowonly_root(u64 root_objectid)
{
	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
		return 1;
	return 0;
}

static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
				       u64 root_objectid)
{
	struct btrfs_key key;

	key.objectid = root_objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	if (is_cowonly_root(root_objectid))
		key.offset = 0;
	else
		key.offset = (u64)-1;

	return btrfs_get_fs_root(fs_info, &key, false);
}
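/*
 * Note the key.offset convention in read_fs_root() above: COW-only trees
 * are stored with offset 0 in the root tree, while subvolume roots are
 * looked up with offset (u64)-1 so that btrfs_get_fs_root() finds the
 * highest matching root item.
 */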
static noinline_for_stack
int find_inline_backref(struct extent_buffer *leaf, int slot,
			unsigned long *ptr, unsigned long *end)
{
	struct btrfs_key key;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	u32 item_size;

	btrfs_item_key_to_cpu(leaf, &key, slot);

	item_size = btrfs_item_size_nr(leaf, slot);
	if (item_size < sizeof(*ei)) {
		btrfs_print_v0_err(leaf->fs_info);
		btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL);
		return 1;
	}
	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
		  BTRFS_EXTENT_FLAG_TREE_BLOCK));

	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
	    item_size <= sizeof(*ei) + sizeof(*bi)) {
		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
		return 1;
	}
	if (key.type == BTRFS_METADATA_ITEM_KEY &&
	    item_size <= sizeof(*ei)) {
		WARN_ON(item_size < sizeof(*ei));
		return 1;
	}

	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		*ptr = (unsigned long)(bi + 1);
	} else {
		*ptr = (unsigned long)(ei + 1);
	}
	*end = (unsigned long)ei + item_size;
	return 0;
}
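/*
 * find_inline_backref() only delimits the inline reference area of an
 * extent item: *ptr is advanced past the btrfs_tree_block_info header
 * for EXTENT_ITEM keys (METADATA_ITEM keys carry the level in the key
 * itself) and *end marks the end of the item, so the caller can step
 * through the inline refs one btrfs_extent_inline_ref at a time.
 */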
/*
 * build backref tree for a given tree block. the root of the backref tree
 * corresponds to the tree block, the leaves of the backref tree correspond
 * to the roots of b-trees that reference the tree block.
 *
 * the basic idea of this function is to check backrefs of a given block
 * to find upper level blocks that reference the block, and then check
 * backrefs of these upper level blocks recursively. the recursion stops
 * when a tree root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs
 * for all upper level blocks that directly/indirectly reference the
 * block are also cached.
 */
static noinline_for_stack
struct backref_node *build_backref_tree(struct reloc_control *rc,
					struct btrfs_key *node_key,
					int level, u64 bytenr)
{
	struct backref_cache *cache = &rc->backref_cache;
	struct btrfs_path *path1; /* For searching extent root */
	struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
	struct extent_buffer *eb;
	struct btrfs_root *root;
	struct backref_node *cur;
	struct backref_node *upper;
	struct backref_node *lower;
	struct backref_node *node = NULL;
	struct backref_node *exist = NULL;
	struct backref_edge *edge;
	struct rb_node *rb_node;
	struct btrfs_key key;
	unsigned long end;
	unsigned long ptr;
	LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
	LIST_HEAD(useless);
	int cowonly;
	int ret;
	int err = 0;
	bool need_check = true;

	path1 = btrfs_alloc_path();
	path2 = btrfs_alloc_path();
	if (!path1 || !path2) {
		err = -ENOMEM;
		goto out;
	}
	path1->reada = READA_FORWARD;
	path2->reada = READA_FORWARD;

	node = alloc_backref_node(cache);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->bytenr = bytenr;
	node->level = level;
	node->lowest = 1;
	cur = node;
again:
	end = 0;
	ptr = 0;
	key.objectid = cur->bytenr;
	key.type = BTRFS_METADATA_ITEM_KEY;
	key.offset = (u64)-1;

	path1->search_commit_root = 1;
	path1->skip_locking = 1;
	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
				0, 0);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	ASSERT(ret);
	ASSERT(path1->slots[0]);

	path1->slots[0]--;

	WARN_ON(cur->checked);
	if (!list_empty(&cur->upper)) {
		/*
		 * the backref was added previously when processing
		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
		 */
		ASSERT(list_is_singular(&cur->upper));
		edge = list_entry(cur->upper.next, struct backref_edge,
				  list[LOWER]);
		ASSERT(list_empty(&edge->list[UPPER]));
		exist = edge->node[UPPER];
		/*
		 * add the upper level block to the pending list if we need
		 * to check its backrefs
		 */
		if (!exist->checked)
			list_add_tail(&edge->list[UPPER], &list);
	} else {
		exist = NULL;
	}

	while (1) {
		cond_resched();
		eb = path1->nodes[0];

		if (ptr >= end) {
			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
				ret = btrfs_next_leaf(rc->extent_root, path1);
				if (ret < 0) {
					err = ret;
					goto out;
				}
				if (ret > 0)
					break;
				eb = path1->nodes[0];
			}

			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
			if (key.objectid != cur->bytenr) {
				WARN_ON(exist);
				break;
			}

			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
			    key.type == BTRFS_METADATA_ITEM_KEY) {
				ret = find_inline_backref(eb, path1->slots[0],
							  &ptr, &end);
				if (ret)
					goto next;
			}
		}

		if (ptr < end) {
			/* update key for inline backref */
			struct btrfs_extent_inline_ref *iref;
			int type;
			iref = (struct btrfs_extent_inline_ref *)ptr;
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID) {
				err = -EUCLEAN;
				goto out;
			}
			key.type = type;
			key.offset = btrfs_extent_inline_ref_offset(eb, iref);

			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
		}

		/*
		 * Parent node found and matches current inline ref, no need to
		 * rebuild this node for this inline ref.
		 */
		if (exist &&
		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
		      exist->owner == key.offset) ||
		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
		      exist->bytenr == key.offset))) {
			exist = NULL;
			goto next;
		}

		/* SHARED_BLOCK_REF means key.offset is the parent bytenr */
		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
			if (key.objectid == key.offset) {
				/*
				 * Only root blocks of reloc trees use a
				 * backref pointing to itself.
				 */
				root = find_reloc_root(rc, cur->bytenr);
				ASSERT(root);
				cur->root = root;
				break;
			}

			edge = alloc_backref_edge(cache);
			if (!edge) {
				err = -ENOMEM;
				goto out;
			}
			rb_node = tree_search(&cache->rb_root, key.offset);
			if (!rb_node) {
				upper = alloc_backref_node(cache);
				if (!upper) {
					free_backref_edge(cache, edge);
					err = -ENOMEM;
					goto out;
				}
				upper->bytenr = key.offset;
				upper->level = cur->level + 1;
				/*
				 * backrefs for the upper level block aren't
				 * cached, add the block to the pending list
				 */
				list_add_tail(&edge->list[UPPER], &list);
			} else {
				upper = rb_entry(rb_node, struct backref_node,
						 rb_node);
				ASSERT(upper->checked);
				INIT_LIST_HEAD(&edge->list[UPPER]);
			}
			list_add_tail(&edge->list[LOWER], &cur->upper);
			edge->node[LOWER] = cur;
			edge->node[UPPER] = upper;

			goto next;
		} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
			err = -EINVAL;
			btrfs_print_v0_err(rc->extent_root->fs_info);
			btrfs_handle_fs_error(rc->extent_root->fs_info, err,
					      NULL);
			goto out;
		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
			goto next;
		}

		/*
		 * key.type == BTRFS_TREE_BLOCK_REF_KEY, the inline ref offset
		 * means the root objectid. We need to search the tree to get
		 * its parent bytenr.
		 */
		root = read_fs_root(rc->extent_root->fs_info, key.offset);
		if (IS_ERR(root)) {
			err = PTR_ERR(root);
			goto out;
		}

		if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
			cur->cowonly = 1;

		if (btrfs_root_level(&root->root_item) == cur->level) {
			/* tree root */
			ASSERT(btrfs_root_bytenr(&root->root_item) ==
			       cur->bytenr);
			if (should_ignore_root(root))
				list_add(&cur->list, &useless);
			else
				cur->root = root;
			break;
		}

		level = cur->level + 1;

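		/*
		 * For a keyed TREE_BLOCK_REF we only know the owning root,
		 * not the parent block, so walk that root down from the top
		 * using node_key until we are one level above cur; every
		 * node on the resulting path becomes an upper node in the
		 * backref tree.
		 */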
		/* Search the tree to find parent blocks referring the block. */
		path2->search_commit_root = 1;
		path2->skip_locking = 1;
		path2->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
		path2->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0 && path2->slots[level] > 0)
			path2->slots[level]--;

		eb = path2->nodes[level];
		if (btrfs_node_blockptr(eb, path2->slots[level]) !=
		    cur->bytenr) {
			btrfs_err(root->fs_info,
	"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
				  cur->bytenr, level - 1,
				  root->root_key.objectid,
				  node_key->objectid, node_key->type,
				  node_key->offset);
			err = -ENOENT;
			goto out;
		}
		lower = cur;
		need_check = true;

		/* Add all nodes and edges in the path */
		for (; level < BTRFS_MAX_LEVEL; level++) {
			if (!path2->nodes[level]) {
				ASSERT(btrfs_root_bytenr(&root->root_item) ==
				       lower->bytenr);
				if (should_ignore_root(root))
					list_add(&lower->list, &useless);
				else
					lower->root = root;
				break;
			}

			edge = alloc_backref_edge(cache);
			if (!edge) {
				err = -ENOMEM;
				goto out;
			}

			eb = path2->nodes[level];
			rb_node = tree_search(&cache->rb_root, eb->start);
			if (!rb_node) {
				upper = alloc_backref_node(cache);
				if (!upper) {
					free_backref_edge(cache, edge);
					err = -ENOMEM;
					goto out;
				}
				upper->bytenr = eb->start;
				upper->owner = btrfs_header_owner(eb);
				upper->level = lower->level + 1;
				if (!test_bit(BTRFS_ROOT_REF_COWS,
					      &root->state))
					upper->cowonly = 1;

				/*
				 * if we know the block isn't shared
				 * we can avoid checking its backrefs.
				 */
				if (btrfs_block_can_be_shared(root, eb))
					upper->checked = 0;
				else
					upper->checked = 1;

				/*
				 * add the block to the pending list if we
				 * need to check its backrefs, we only do this
				 * once while walking up a tree as we will
				 * catch anything else later on.
				 */
				if (!upper->checked && need_check) {
					need_check = false;
					list_add_tail(&edge->list[UPPER],
						      &list);
				} else {
					if (upper->checked)
						need_check = true;
					INIT_LIST_HEAD(&edge->list[UPPER]);
				}
			} else {
				upper = rb_entry(rb_node, struct backref_node,
						 rb_node);
				ASSERT(upper->checked);
				INIT_LIST_HEAD(&edge->list[UPPER]);
				if (!upper->owner)
					upper->owner = btrfs_header_owner(eb);
			}
			list_add_tail(&edge->list[LOWER], &lower->upper);
			edge->node[LOWER] = lower;
			edge->node[UPPER] = upper;

			if (rb_node)
				break;
			lower = upper;
			upper = NULL;
		}
		btrfs_release_path(path2);
next:
		if (ptr < end) {
			ptr += btrfs_extent_inline_ref_size(key.type);
			if (ptr >= end) {
				WARN_ON(ptr > end);
				ptr = 0;
				end = 0;
			}
		}
		if (ptr >= end)
			path1->slots[0]++;
	}
	btrfs_release_path(path1);

	cur->checked = 1;
	WARN_ON(exist);

	/* the pending list isn't empty, take the first block to process */
	if (!list_empty(&list)) {
		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		cur = edge->node[UPPER];
		goto again;
	}

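	/*
	 * The expansion is effectively breadth-first and is now done: every
	 * block reachable from the target has a backref node with its
	 * backrefs checked, so the linking phase below runs purely on
	 * cached nodes without touching disk again.
	 */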
	/*
	 * everything goes well, connect backref nodes and insert backref nodes
	 * into the cache.
	 */
	ASSERT(node->checked);
	cowonly = node->cowonly;
	if (!cowonly) {
		rb_node = tree_insert(&cache->rb_root, node->bytenr,
				      &node->rb_node);
		if (rb_node)
			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
		list_add_tail(&node->lower, &cache->leaves);
	}

	list_for_each_entry(edge, &node->upper, list[LOWER])
		list_add_tail(&edge->list[UPPER], &list);

	while (!list_empty(&list)) {
		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
		list_del_init(&edge->list[UPPER]);
		upper = edge->node[UPPER];
		if (upper->detached) {
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			free_backref_edge(cache, edge);
			if (list_empty(&lower->upper))
				list_add(&lower->list, &useless);
			continue;
		}

		if (!RB_EMPTY_NODE(&upper->rb_node)) {
			if (upper->lowest) {
				list_del_init(&upper->lower);
				upper->lowest = 0;
			}

			list_add_tail(&edge->list[UPPER], &upper->lower);
			continue;
		}

		if (!upper->checked) {
			/*
			 * Still want to blow up for developers since this is a
			 * logic bug.
			 */
			ASSERT(0);
			err = -EINVAL;
			goto out;
		}
		if (cowonly != upper->cowonly) {
			ASSERT(0);
			err = -EINVAL;
			goto out;
		}

		if (!cowonly) {
			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
					      &upper->rb_node);
			if (rb_node)
				backref_tree_panic(rb_node, -EEXIST,
						   upper->bytenr);
		}

		list_add_tail(&edge->list[UPPER], &upper->lower);

		list_for_each_entry(edge, &upper->upper, list[LOWER])
			list_add_tail(&edge->list[UPPER], &list);
	}
	/*
	 * process useless backref nodes. backref nodes for tree leaves
	 * are deleted from the cache. backref nodes for upper level
	 * tree blocks are left in the cache to avoid unnecessary backref
	 * lookup.
	 */
	while (!list_empty(&useless)) {
		upper = list_entry(useless.next, struct backref_node, list);
		list_del_init(&upper->list);
		ASSERT(list_empty(&upper->upper));
		if (upper == node)
			node = NULL;
		if (upper->lowest) {
			list_del_init(&upper->lower);
			upper->lowest = 0;
		}
		while (!list_empty(&upper->lower)) {
			edge = list_entry(upper->lower.next,
					  struct backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			free_backref_edge(cache, edge);

			if (list_empty(&lower->upper))
				list_add(&lower->list, &useless);
		}
		__mark_block_processed(rc, upper);
		if (upper->level > 0) {
			list_add(&upper->list, &cache->detached);
			upper->detached = 1;
		} else {
			rb_erase(&upper->rb_node, &cache->rb_root);
			free_backref_node(cache, upper);
		}
	}

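	/*
	 * The error path below unwinds a partially built graph: every
	 * queued edge is freed, and any node left with no upper edges and
	 * no cache rb-tree membership is collected on the useless list and
	 * freed as well.
	 */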
out:
	btrfs_free_path(path1);
	btrfs_free_path(path2);
	if (err) {
		while (!list_empty(&useless)) {
			lower = list_entry(useless.next,
					   struct backref_node, list);
			list_del_init(&lower->list);
		}
		while (!list_empty(&list)) {
			edge = list_first_entry(&list, struct backref_edge,
						list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			upper = edge->node[UPPER];
			free_backref_edge(cache, edge);

			/*
			 * Lower is no longer linked to any upper backref nodes
			 * and isn't in the cache, we can free it ourselves.
			 */
			if (list_empty(&lower->upper) &&
			    RB_EMPTY_NODE(&lower->rb_node))
				list_add(&lower->list, &useless);

			if (!RB_EMPTY_NODE(&upper->rb_node))
				continue;

			/* Add this guy's upper edges to the list to process */
			list_for_each_entry(edge, &upper->upper, list[LOWER])
				list_add_tail(&edge->list[UPPER], &list);
			if (list_empty(&upper->upper))
				list_add(&upper->list, &useless);
		}

		while (!list_empty(&useless)) {
			lower = list_entry(useless.next,
					   struct backref_node, list);
			list_del_init(&lower->list);
			if (lower == node)
				node = NULL;
			free_backref_node(cache, lower);
		}

		free_backref_node(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	return node;
}

/*
 * helper to add a backref node for the newly created snapshot.
 * the backref node is created by cloning the backref node that
 * corresponds to the root of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct backref_cache *cache = &rc->backref_cache;
	struct backref_node *node = NULL;
	struct backref_node *new_node;
	struct backref_edge *edge;
	struct backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = tree_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = tree_search(&cache->rb_root,
				      reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = alloc_backref_node(cache);
	if (!new_node)
		return -ENOMEM;

	new_node->bytenr = dest->node->start;
	new_node->level = node->level;
	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = dest;

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = alloc_backref_edge(cache);
			if (!new_edge)
				goto fail;

			new_edge->node[UPPER] = new_node;
			new_edge->node[LOWER] = edge->node[LOWER];
			list_add_tail(&new_edge->list[UPPER],
				      &new_node->lower);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
			      &new_node->rb_node);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		free_backref_edge(cache, new_edge);
	}
	free_backref_node(cache, new_node);
	return -ENOMEM;
}
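/*
 * A new snapshot shares all the lower blocks of its source root, so its
 * root block has to appear in the backref cache too; cloning the cached
 * node (and its lower edges) above is cheaper than rebuilding it from the
 * extent tree.
 */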
/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->node->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
			      node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_panic(fs_info, -EEXIST,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = tree_search(&rc->reloc_root_tree.rb_root,
				      root->node->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		if (!node)
			return;
		BUG_ON((struct btrfs_root *)node->data != root);
	}

	spin_lock(&fs_info->trans_lock);
	list_del_init(&root->root_list);
	spin_unlock(&fs_info->trans_lock);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
			      root->node->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = new_bytenr;
	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
			      node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
	return 0;
}

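/*
 * The reloc tree created below is a special snapshot: it shares the fs
 * tree's blocks at creation time, uses objectid BTRFS_TREE_RELOC_OBJECTID,
 * and records the source tree's objectid in its root key offset so the two
 * trees can be matched up again at merge time.
 */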
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	BUG_ON(!root_item);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have the RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		BUG_ON(ret);
	}

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		root_item->drop_level = 0;
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	BUG_ON(ret);
	kfree(root_item);

	reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
	BUG_ON(IS_ERR(reloc_root));
	reloc_root->last_trans = trans->transid;
	return reloc_root;
}

/*
 * create reloc tree for a given fs tree. the reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	if (!rc || !rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	root->reloc_root = reloc_root;
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
	    !root->reloc_root)
		goto out;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	BUG_ON(ret);

out:
	return 0;
}

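/*
 * The helpers below support the UPDATE_DATA_PTRS stage: file extent items
 * that point into the relocated block group have to be switched to the new
 * data location, and any in-memory extent state of the affected inodes has
 * to be found and invalidated first.
 */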
/*
 * helper to find the first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

static int in_block_group(u64 bytenr,
			  struct btrfs_block_group_cache *block_group)
{
	if (bytenr >= block_group->key.objectid &&
	    bytenr < block_group->key.objectid + block_group->key.offset)
		return 1;
	return 0;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_block_group(bytenr, rc->block_group))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for
		 * readpage to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end);
				if (!ret)
					continue;

				btrfs_drop_extent_cache(BTRFS_I(inode),
							key.offset, end, 1);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		ref.real_root = root->root_key.objectid;
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

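/*
 * For every file extent rewritten above, the new data extent gains a
 * reference before the reference to the old extent is dropped; both
 * changes go through the delayed-ref machinery, so they land in the same
 * transaction as the modified leaf.
 */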
static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	btrfs_set_lock_blocking_write(eb);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
		BUG_ON(ret);
	}
	btrfs_set_lock_blocking_write(eb);

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		struct btrfs_key first_key;

		level = btrfs_header_level(parent);
		BUG_ON(level < lowest_level);

		ret = btrfs_bin_search(parent, &key, level, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(parent, &first_key, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
					     level - 1, &first_key);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			} else if (!extent_buffer_uptodate(eb)) {
				ret = -EIO;
				free_extent_buffer(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb);
				BUG_ON(ret);
			}
			btrfs_set_lock_blocking_write(eb);

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		path->lowest_level = 0;
		BUG_ON(ret);

		/*
		 * Info qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		ref.skip_qgroup = true;
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ret = btrfs_inc_extent_ref(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
		ref.skip_qgroup = true;
		ret = btrfs_free_extent(trans, &ref);
		BUG_ON(ret);

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

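/*
 * walk_up_reloc_tree() above and walk_down_reloc_tree() below both prune
 * on the node pointer generation: a pointer with generation <=
 * last_snapshot leads into a subtree that is still shared with the fs
 * tree, so nothing under it was relocated and the walk can skip it
 * entirely.
 */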
/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *eb = NULL;
	int i;
	u64 bytenr;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		struct btrfs_key first_key;

		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
		btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
		eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
				     &first_key);
		if (IS_ERR(eb)) {
			return PTR_ERR(eb);
		} else if (!extent_buffer_uptodate(eb)) {
			free_extent_buffer(eb);
			return -EIO;
		}
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose key is in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   struct btrfs_key *min_key,
				   struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		cond_resched();
		iput(inode);

		if (objectid > max_key->objectid)
			break;

		inode = find_next_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(BTRFS_I(inode));

		if (ino > max_key->objectid) {
			iput(inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for readpage to complete */
		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
	}
	return 0;
}

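/*
 * After tree blocks are swapped between the fs tree and the reloc tree,
 * extent maps cached for inodes in the affected key range may describe
 * the old blocks; invalidate_extent_cache() above drops them so they get
 * rebuilt from the updated leaves.
 */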
insert_dirty_subvol(struct btrfs_trans_handle *trans, 2152 struct reloc_control *rc, 2153 struct btrfs_root *root) 2154 { 2155 struct btrfs_root *reloc_root = root->reloc_root; 2156 struct btrfs_root_item *reloc_root_item; 2157 2158 /* @root must be a subvolume tree root with a valid reloc tree */ 2159 ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 2160 ASSERT(reloc_root); 2161 2162 reloc_root_item = &reloc_root->root_item; 2163 memset(&reloc_root_item->drop_progress, 0, 2164 sizeof(reloc_root_item->drop_progress)); 2165 reloc_root_item->drop_level = 0; 2166 btrfs_set_root_refs(reloc_root_item, 0); 2167 btrfs_update_reloc_root(trans, root); 2168 2169 if (list_empty(&root->reloc_dirty_list)) { 2170 btrfs_grab_fs_root(root); 2171 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 2172 } 2173 } 2174 2175 static int clean_dirty_subvols(struct reloc_control *rc) 2176 { 2177 struct btrfs_root *root; 2178 struct btrfs_root *next; 2179 int ret = 0; 2180 2181 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 2182 reloc_dirty_list) { 2183 struct btrfs_root *reloc_root = root->reloc_root; 2184 2185 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 2186 list_del_init(&root->reloc_dirty_list); 2187 root->reloc_root = NULL; 2188 if (reloc_root) { 2189 int ret2; 2190 2191 ret2 = btrfs_drop_snapshot(reloc_root, NULL, 0, 1); 2192 if (ret2 < 0 && !ret) 2193 ret = ret2; 2194 } 2195 btrfs_put_fs_root(root); 2196 } 2197 return ret; 2198 } 2199 2200 /* 2201 * merge the relocated tree blocks in reloc tree with corresponding 2202 * fs tree. 2203 */ 2204 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 2205 struct btrfs_root *root) 2206 { 2207 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2208 struct btrfs_key key; 2209 struct btrfs_key next_key; 2210 struct btrfs_trans_handle *trans = NULL; 2211 struct btrfs_root *reloc_root; 2212 struct btrfs_root_item *root_item; 2213 struct btrfs_path *path; 2214 struct extent_buffer *leaf; 2215 int level; 2216 int max_level; 2217 int replaced = 0; 2218 int ret; 2219 int err = 0; 2220 u32 min_reserved; 2221 2222 path = btrfs_alloc_path(); 2223 if (!path) 2224 return -ENOMEM; 2225 path->reada = READA_FORWARD; 2226 2227 reloc_root = root->reloc_root; 2228 root_item = &reloc_root->root_item; 2229 2230 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 2231 level = btrfs_root_level(root_item); 2232 extent_buffer_get(reloc_root->node); 2233 path->nodes[level] = reloc_root->node; 2234 path->slots[level] = 0; 2235 } else { 2236 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 2237 2238 level = root_item->drop_level; 2239 BUG_ON(level == 0); 2240 path->lowest_level = level; 2241 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 2242 path->lowest_level = 0; 2243 if (ret < 0) { 2244 btrfs_free_path(path); 2245 return ret; 2246 } 2247 2248 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 2249 path->slots[level]); 2250 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 2251 2252 btrfs_unlock_up_safe(path, 0); 2253 } 2254 2255 min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2256 memset(&next_key, 0, sizeof(next_key)); 2257 2258 while (1) { 2259 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, 2260 BTRFS_RESERVE_FLUSH_ALL); 2261 if (ret) { 2262 err = ret; 2263 goto out; 2264 } 2265 trans = btrfs_start_transaction(root, 0); 2266 if (IS_ERR(trans)) { 2267 err = PTR_ERR(trans); 2268 trans = NULL; 2269 goto out; 2270 } 2271 trans->block_rsv = rc->block_rsv; 
2272 2273 replaced = 0; 2274 max_level = level; 2275 2276 ret = walk_down_reloc_tree(reloc_root, path, &level); 2277 if (ret < 0) { 2278 err = ret; 2279 goto out; 2280 } 2281 if (ret > 0) 2282 break; 2283 2284 if (!find_next_key(path, level, &key) && 2285 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2286 ret = 0; 2287 } else { 2288 ret = replace_path(trans, rc, root, reloc_root, path, 2289 &next_key, level, max_level); 2290 } 2291 if (ret < 0) { 2292 err = ret; 2293 goto out; 2294 } 2295 2296 if (ret > 0) { 2297 level = ret; 2298 btrfs_node_key_to_cpu(path->nodes[level], &key, 2299 path->slots[level]); 2300 replaced = 1; 2301 } 2302 2303 ret = walk_up_reloc_tree(reloc_root, path, &level); 2304 if (ret > 0) 2305 break; 2306 2307 BUG_ON(level == 0); 2308 /* 2309 * save the merging progress in the drop_progress. 2310 * this is OK since root refs == 1 in this case. 2311 */ 2312 btrfs_node_key(path->nodes[level], &root_item->drop_progress, 2313 path->slots[level]); 2314 root_item->drop_level = level; 2315 2316 btrfs_end_transaction_throttle(trans); 2317 trans = NULL; 2318 2319 btrfs_btree_balance_dirty(fs_info); 2320 2321 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2322 invalidate_extent_cache(root, &key, &next_key); 2323 } 2324 2325 /* 2326 * handle the case where only one block in the fs tree needs to be 2327 * relocated and the block is the tree root. 2328 */ 2329 leaf = btrfs_lock_root_node(root); 2330 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); 2331 btrfs_tree_unlock(leaf); 2332 free_extent_buffer(leaf); 2333 if (ret < 0) 2334 err = ret; 2335 out: 2336 btrfs_free_path(path); 2337 2338 if (err == 0) 2339 insert_dirty_subvol(trans, rc, root); 2340 2341 if (trans) 2342 btrfs_end_transaction_throttle(trans); 2343 2344 btrfs_btree_balance_dirty(fs_info); 2345 2346 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2347 invalidate_extent_cache(root, &key, &next_key); 2348 2349 return err; 2350 } 2351 2352 static noinline_for_stack 2353 int prepare_to_merge(struct reloc_control *rc, int err) 2354 { 2355 struct btrfs_root *root = rc->extent_root; 2356 struct btrfs_fs_info *fs_info = root->fs_info; 2357 struct btrfs_root *reloc_root; 2358 struct btrfs_trans_handle *trans; 2359 LIST_HEAD(reloc_roots); 2360 u64 num_bytes = 0; 2361 int ret; 2362 2363 mutex_lock(&fs_info->reloc_mutex); 2364 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2365 rc->merging_rsv_size += rc->nodes_relocated * 2; 2366 mutex_unlock(&fs_info->reloc_mutex); 2367 2368 again: 2369 if (!err) { 2370 num_bytes = rc->merging_rsv_size; 2371 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2372 BTRFS_RESERVE_FLUSH_ALL); 2373 if (ret) 2374 err = ret; 2375 } 2376 2377 trans = btrfs_join_transaction(rc->extent_root); 2378 if (IS_ERR(trans)) { 2379 if (!err) 2380 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2381 num_bytes); 2382 return PTR_ERR(trans); 2383 } 2384 2385 if (!err) { 2386 if (num_bytes != rc->merging_rsv_size) { 2387 btrfs_end_transaction(trans); 2388 btrfs_block_rsv_release(fs_info, rc->block_rsv, 2389 num_bytes); 2390 goto again; 2391 } 2392 } 2393 2394 rc->merge_reloc_tree = 1; 2395 2396 while (!list_empty(&rc->reloc_roots)) { 2397 reloc_root = list_entry(rc->reloc_roots.next, 2398 struct btrfs_root, root_list); 2399 list_del_init(&reloc_root->root_list); 2400 2401 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2402 BUG_ON(IS_ERR(root)); 2403 BUG_ON(root->reloc_root != reloc_root); 2404 2405 /* 2406 * set reference count to 1, so btrfs_recover_relocation 2407 * knows it
should resume merging 2408 */ 2409 if (!err) 2410 btrfs_set_root_refs(&reloc_root->root_item, 1); 2411 btrfs_update_reloc_root(trans, root); 2412 2413 list_add(&reloc_root->root_list, &reloc_roots); 2414 } 2415 2416 list_splice(&reloc_roots, &rc->reloc_roots); 2417 2418 if (!err) 2419 btrfs_commit_transaction(trans); 2420 else 2421 btrfs_end_transaction(trans); 2422 return err; 2423 } 2424 2425 static noinline_for_stack 2426 void free_reloc_roots(struct list_head *list) 2427 { 2428 struct btrfs_root *reloc_root; 2429 2430 while (!list_empty(list)) { 2431 reloc_root = list_entry(list->next, struct btrfs_root, 2432 root_list); 2433 __del_reloc_root(reloc_root); 2434 free_extent_buffer(reloc_root->node); 2435 free_extent_buffer(reloc_root->commit_root); 2436 reloc_root->node = NULL; 2437 reloc_root->commit_root = NULL; 2438 } 2439 } 2440 2441 static noinline_for_stack 2442 void merge_reloc_roots(struct reloc_control *rc) 2443 { 2444 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2445 struct btrfs_root *root; 2446 struct btrfs_root *reloc_root; 2447 LIST_HEAD(reloc_roots); 2448 int found = 0; 2449 int ret = 0; 2450 again: 2451 root = rc->extent_root; 2452 2453 /* 2454 * this serializes us with btrfs_record_root_in_transaction, 2455 * we have to make sure nobody is in the middle of 2456 * adding their roots to the list while we are 2457 * doing this splice 2458 */ 2459 mutex_lock(&fs_info->reloc_mutex); 2460 list_splice_init(&rc->reloc_roots, &reloc_roots); 2461 mutex_unlock(&fs_info->reloc_mutex); 2462 2463 while (!list_empty(&reloc_roots)) { 2464 found = 1; 2465 reloc_root = list_entry(reloc_roots.next, 2466 struct btrfs_root, root_list); 2467 2468 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 2469 root = read_fs_root(fs_info, 2470 reloc_root->root_key.offset); 2471 BUG_ON(IS_ERR(root)); 2472 BUG_ON(root->reloc_root != reloc_root); 2473 2474 ret = merge_reloc_root(rc, root); 2475 if (ret) { 2476 if (list_empty(&reloc_root->root_list)) 2477 list_add_tail(&reloc_root->root_list, 2478 &reloc_roots); 2479 goto out; 2480 } 2481 } else { 2482 list_del_init(&reloc_root->root_list); 2483 } 2484 } 2485 2486 if (found) { 2487 found = 0; 2488 goto again; 2489 } 2490 out: 2491 if (ret) { 2492 btrfs_handle_fs_error(fs_info, ret, NULL); 2493 if (!list_empty(&reloc_roots)) 2494 free_reloc_roots(&reloc_roots); 2495 2496 /* new reloc root may be added */ 2497 mutex_lock(&fs_info->reloc_mutex); 2498 list_splice_init(&rc->reloc_roots, &reloc_roots); 2499 mutex_unlock(&fs_info->reloc_mutex); 2500 if (!list_empty(&reloc_roots)) 2501 free_reloc_roots(&reloc_roots); 2502 } 2503 2504 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2505 } 2506 2507 static void free_block_list(struct rb_root *blocks) 2508 { 2509 struct tree_block *block; 2510 struct rb_node *rb_node; 2511 while ((rb_node = rb_first(blocks))) { 2512 block = rb_entry(rb_node, struct tree_block, rb_node); 2513 rb_erase(rb_node, blocks); 2514 kfree(block); 2515 } 2516 } 2517 2518 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2519 struct btrfs_root *reloc_root) 2520 { 2521 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2522 struct btrfs_root *root; 2523 2524 if (reloc_root->last_trans == trans->transid) 2525 return 0; 2526 2527 root = read_fs_root(fs_info, reloc_root->root_key.offset); 2528 BUG_ON(IS_ERR(root)); 2529 BUG_ON(root->reloc_root != reloc_root); 2530 2531 return btrfs_record_root_in_trans(trans, root); 2532 } 2533 2534 static noinline_for_stack 2535 struct btrfs_root
*select_reloc_root(struct btrfs_trans_handle *trans, 2536 struct reloc_control *rc, 2537 struct backref_node *node, 2538 struct backref_edge *edges[]) 2539 { 2540 struct backref_node *next; 2541 struct btrfs_root *root; 2542 int index = 0; 2543 2544 next = node; 2545 while (1) { 2546 cond_resched(); 2547 next = walk_up_backref(next, edges, &index); 2548 root = next->root; 2549 BUG_ON(!root); 2550 BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); 2551 2552 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2553 record_reloc_root_in_trans(trans, root); 2554 break; 2555 } 2556 2557 btrfs_record_root_in_trans(trans, root); 2558 root = root->reloc_root; 2559 2560 if (next->new_bytenr != root->node->start) { 2561 BUG_ON(next->new_bytenr); 2562 BUG_ON(!list_empty(&next->list)); 2563 next->new_bytenr = root->node->start; 2564 next->root = root; 2565 list_add_tail(&next->list, 2566 &rc->backref_cache.changed); 2567 __mark_block_processed(rc, next); 2568 break; 2569 } 2570 2571 WARN_ON(1); 2572 root = NULL; 2573 next = walk_down_backref(edges, &index); 2574 if (!next || next->level <= node->level) 2575 break; 2576 } 2577 if (!root) 2578 return NULL; 2579 2580 next = node; 2581 /* setup backref node path for btrfs_reloc_cow_block */ 2582 while (1) { 2583 rc->backref_cache.path[next->level] = next; 2584 if (--index < 0) 2585 break; 2586 next = edges[index]->node[UPPER]; 2587 } 2588 return root; 2589 } 2590 2591 /* 2592 * select a tree root for relocation. return NULL if the block 2593 * is reference counted. we should use do_relocation() in this 2594 * case. return a tree root pointer if the block isn't reference 2595 * counted. return -ENOENT if the block is the root of a reloc tree. 2596 */ 2597 static noinline_for_stack 2598 struct btrfs_root *select_one_root(struct backref_node *node) 2599 { 2600 struct backref_node *next; 2601 struct btrfs_root *root; 2602 struct btrfs_root *fs_root = NULL; 2603 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2604 int index = 0; 2605 2606 next = node; 2607 while (1) { 2608 cond_resched(); 2609 next = walk_up_backref(next, edges, &index); 2610 root = next->root; 2611 BUG_ON(!root); 2612 2613 /* no other choice for a non-reference counted tree */ 2614 if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 2615 return root; 2616 2617 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) 2618 fs_root = root; 2619 2620 if (next != node) 2621 return NULL; 2622 2623 next = walk_down_backref(edges, &index); 2624 if (!next || next->level <= node->level) 2625 break; 2626 } 2627 2628 if (!fs_root) 2629 return ERR_PTR(-ENOENT); 2630 return fs_root; 2631 } 2632 2633 static noinline_for_stack 2634 u64 calcu_metadata_size(struct reloc_control *rc, 2635 struct backref_node *node, int reserve) 2636 { 2637 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2638 struct backref_node *next = node; 2639 struct backref_edge *edge; 2640 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2641 u64 num_bytes = 0; 2642 int index = 0; 2643 2644 BUG_ON(reserve && node->processed); 2645 2646 while (next) { 2647 cond_resched(); 2648 while (1) { 2649 if (next->processed && (reserve || next != node)) 2650 break; 2651 2652 num_bytes += fs_info->nodesize; 2653 2654 if (list_empty(&next->upper)) 2655 break; 2656 2657 edge = list_entry(next->upper.next, 2658 struct backref_edge, list[LOWER]); 2659 edges[index++] = edge; 2660 next = edge->node[UPPER]; 2661 } 2662 next = walk_down_backref(edges, &index); 2663 } 2664 return num_bytes; 2665 } 2666 2667 static int reserve_metadata_space(struct
btrfs_trans_handle *trans, 2668 struct reloc_control *rc, 2669 struct backref_node *node) 2670 { 2671 struct btrfs_root *root = rc->extent_root; 2672 struct btrfs_fs_info *fs_info = root->fs_info; 2673 u64 num_bytes; 2674 int ret; 2675 u64 tmp; 2676 2677 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2678 2679 trans->block_rsv = rc->block_rsv; 2680 rc->reserved_bytes += num_bytes; 2681 2682 /* 2683 * We are under a transaction here so we can only do limited flushing. 2684 * If we get an enospc just kick back -EAGAIN so we know to drop the 2685 * transaction and try to refill when we can flush all the things. 2686 */ 2687 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, 2688 BTRFS_RESERVE_FLUSH_LIMIT); 2689 if (ret) { 2690 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2691 while (tmp <= rc->reserved_bytes) 2692 tmp <<= 1; 2693 /* 2694 * only one thread can access block_rsv at this point, 2695 * so we don't need to hold a lock to protect it. 2696 * we expand the reservation size here to allow enough 2697 * space for relocation, and we will return early in 2698 * the enospc case. 2699 */ 2700 rc->block_rsv->size = tmp + fs_info->nodesize * 2701 RELOCATION_RESERVED_NODES; 2702 return -EAGAIN; 2703 } 2704 2705 return 0; 2706 } 2707 2708 /* 2709 * relocate a block tree, and then update pointers in upper level 2710 * blocks that reference the block to point to the new location. 2711 * 2712 * if called by link_to_upper, the block has already been relocated. 2713 * in that case this function just updates pointers. 2714 */ 2715 static int do_relocation(struct btrfs_trans_handle *trans, 2716 struct reloc_control *rc, 2717 struct backref_node *node, 2718 struct btrfs_key *key, 2719 struct btrfs_path *path, int lowest) 2720 { 2721 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2722 struct backref_node *upper; 2723 struct backref_edge *edge; 2724 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2725 struct btrfs_root *root; 2726 struct extent_buffer *eb; 2727 u32 blocksize; 2728 u64 bytenr; 2729 u64 generation; 2730 int slot; 2731 int ret; 2732 int err = 0; 2733 2734 BUG_ON(lowest && node->eb); 2735 2736 path->lowest_level = node->level + 1; 2737 rc->backref_cache.path[node->level] = node; 2738 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2739 struct btrfs_key first_key; 2740 struct btrfs_ref ref = { 0 }; 2741 2742 cond_resched(); 2743 2744 upper = edge->node[UPPER]; 2745 root = select_reloc_root(trans, rc, upper, edges); 2746 BUG_ON(!root); 2747 2748 if (upper->eb && !upper->locked) { 2749 if (!lowest) { 2750 ret = btrfs_bin_search(upper->eb, key, 2751 upper->level, &slot); 2752 if (ret < 0) { 2753 err = ret; 2754 goto next; 2755 } 2756 BUG_ON(ret); 2757 bytenr = btrfs_node_blockptr(upper->eb, slot); 2758 if (node->eb->start == bytenr) 2759 goto next; 2760 } 2761 drop_node_buffer(upper); 2762 } 2763 2764 if (!upper->eb) { 2765 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2766 if (ret) { 2767 if (ret < 0) 2768 err = ret; 2769 else 2770 err = -ENOENT; 2771 2772 btrfs_release_path(path); 2773 break; 2774 } 2775 2776 if (!upper->eb) { 2777 upper->eb = path->nodes[upper->level]; 2778 path->nodes[upper->level] = NULL; 2779 } else { 2780 BUG_ON(upper->eb != path->nodes[upper->level]); 2781 } 2782 2783 upper->locked = 1; 2784 path->locks[upper->level] = 0; 2785 2786 slot = path->slots[upper->level]; 2787 btrfs_release_path(path); 2788 } else { 2789 ret = btrfs_bin_search(upper->eb, key, upper->level, 2790 &slot); 2791 if (ret < 0) { 2792 err = ret; 2793 goto
next; 2794 } 2795 BUG_ON(ret); 2796 } 2797 2798 bytenr = btrfs_node_blockptr(upper->eb, slot); 2799 if (lowest) { 2800 if (bytenr != node->bytenr) { 2801 btrfs_err(root->fs_info, 2802 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2803 bytenr, node->bytenr, slot, 2804 upper->eb->start); 2805 err = -EIO; 2806 goto next; 2807 } 2808 } else { 2809 if (node->eb->start == bytenr) 2810 goto next; 2811 } 2812 2813 blocksize = root->fs_info->nodesize; 2814 generation = btrfs_node_ptr_generation(upper->eb, slot); 2815 btrfs_node_key_to_cpu(upper->eb, &first_key, slot); 2816 eb = read_tree_block(fs_info, bytenr, generation, 2817 upper->level - 1, &first_key); 2818 if (IS_ERR(eb)) { 2819 err = PTR_ERR(eb); 2820 goto next; 2821 } else if (!extent_buffer_uptodate(eb)) { 2822 free_extent_buffer(eb); 2823 err = -EIO; 2824 goto next; 2825 } 2826 btrfs_tree_lock(eb); 2827 btrfs_set_lock_blocking_write(eb); 2828 2829 if (!node->eb) { 2830 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2831 slot, &eb); 2832 btrfs_tree_unlock(eb); 2833 free_extent_buffer(eb); 2834 if (ret < 0) { 2835 err = ret; 2836 goto next; 2837 } 2838 BUG_ON(node->eb != eb); 2839 } else { 2840 btrfs_set_node_blockptr(upper->eb, slot, 2841 node->eb->start); 2842 btrfs_set_node_ptr_generation(upper->eb, slot, 2843 trans->transid); 2844 btrfs_mark_buffer_dirty(upper->eb); 2845 2846 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 2847 node->eb->start, blocksize, 2848 upper->eb->start); 2849 ref.real_root = root->root_key.objectid; 2850 btrfs_init_tree_ref(&ref, node->level, 2851 btrfs_header_owner(upper->eb)); 2852 ret = btrfs_inc_extent_ref(trans, &ref); 2853 BUG_ON(ret); 2854 2855 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2856 BUG_ON(ret); 2857 } 2858 next: 2859 if (!upper->pending) 2860 drop_node_buffer(upper); 2861 else 2862 unlock_node_buffer(upper); 2863 if (err) 2864 break; 2865 } 2866 2867 if (!err && node->pending) { 2868 drop_node_buffer(node); 2869 list_move_tail(&node->list, &rc->backref_cache.changed); 2870 node->pending = 0; 2871 } 2872 2873 path->lowest_level = 0; 2874 BUG_ON(err == -ENOSPC); 2875 return err; 2876 } 2877 2878 static int link_to_upper(struct btrfs_trans_handle *trans, 2879 struct reloc_control *rc, 2880 struct backref_node *node, 2881 struct btrfs_path *path) 2882 { 2883 struct btrfs_key key; 2884 2885 btrfs_node_key_to_cpu(node->eb, &key, 0); 2886 return do_relocation(trans, rc, node, &key, path, 0); 2887 } 2888 2889 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2890 struct reloc_control *rc, 2891 struct btrfs_path *path, int err) 2892 { 2893 LIST_HEAD(list); 2894 struct backref_cache *cache = &rc->backref_cache; 2895 struct backref_node *node; 2896 int level; 2897 int ret; 2898 2899 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2900 while (!list_empty(&cache->pending[level])) { 2901 node = list_entry(cache->pending[level].next, 2902 struct backref_node, list); 2903 list_move_tail(&node->list, &list); 2904 BUG_ON(!node->pending); 2905 2906 if (!err) { 2907 ret = link_to_upper(trans, rc, node, path); 2908 if (ret < 0) 2909 err = ret; 2910 } 2911 } 2912 list_splice_init(&list, &cache->pending[level]); 2913 } 2914 return err; 2915 } 2916 2917 static void mark_block_processed(struct reloc_control *rc, 2918 u64 bytenr, u32 blocksize) 2919 { 2920 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, 2921 EXTENT_DIRTY); 2922 } 2923 2924 static void __mark_block_processed(struct reloc_control *rc, 2925 struct backref_node 
*node) 2926 { 2927 u32 blocksize; 2928 if (node->level == 0 || 2929 in_block_group(node->bytenr, rc->block_group)) { 2930 blocksize = rc->extent_root->fs_info->nodesize; 2931 mark_block_processed(rc, node->bytenr, blocksize); 2932 } 2933 node->processed = 1; 2934 } 2935 2936 /* 2937 * mark a block and all blocks that directly/indirectly reference the 2938 * block as processed. 2939 */ 2940 static void update_processed_blocks(struct reloc_control *rc, 2941 struct backref_node *node) 2942 { 2943 struct backref_node *next = node; 2944 struct backref_edge *edge; 2945 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2946 int index = 0; 2947 2948 while (next) { 2949 cond_resched(); 2950 while (1) { 2951 if (next->processed) 2952 break; 2953 2954 __mark_block_processed(rc, next); 2955 2956 if (list_empty(&next->upper)) 2957 break; 2958 2959 edge = list_entry(next->upper.next, 2960 struct backref_edge, list[LOWER]); 2961 edges[index++] = edge; 2962 next = edge->node[UPPER]; 2963 } 2964 next = walk_down_backref(edges, &index); 2965 } 2966 } 2967 2968 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2969 { 2970 u32 blocksize = rc->extent_root->fs_info->nodesize; 2971 2972 if (test_range_bit(&rc->processed_blocks, bytenr, 2973 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2974 return 1; 2975 return 0; 2976 } 2977 2978 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2979 struct tree_block *block) 2980 { 2981 struct extent_buffer *eb; 2982 2983 BUG_ON(block->key_ready); 2984 eb = read_tree_block(fs_info, block->bytenr, block->key.offset, 2985 block->level, NULL); 2986 if (IS_ERR(eb)) { 2987 return PTR_ERR(eb); 2988 } else if (!extent_buffer_uptodate(eb)) { 2989 free_extent_buffer(eb); 2990 return -EIO; 2991 } 2992 if (block->level == 0) 2993 btrfs_item_key_to_cpu(eb, &block->key, 0); 2994 else 2995 btrfs_node_key_to_cpu(eb, &block->key, 0); 2996 free_extent_buffer(eb); 2997 block->key_ready = 1; 2998 return 0; 2999 } 3000 3001 /* 3002 * helper function to relocate a tree block 3003 */ 3004 static int relocate_tree_block(struct btrfs_trans_handle *trans, 3005 struct reloc_control *rc, 3006 struct backref_node *node, 3007 struct btrfs_key *key, 3008 struct btrfs_path *path) 3009 { 3010 struct btrfs_root *root; 3011 int ret = 0; 3012 3013 if (!node) 3014 return 0; 3015 3016 BUG_ON(node->processed); 3017 root = select_one_root(node); 3018 if (root == ERR_PTR(-ENOENT)) { 3019 update_processed_blocks(rc, node); 3020 goto out; 3021 } 3022 3023 if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3024 ret = reserve_metadata_space(trans, rc, node); 3025 if (ret) 3026 goto out; 3027 } 3028 3029 if (root) { 3030 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { 3031 BUG_ON(node->new_bytenr); 3032 BUG_ON(!list_empty(&node->list)); 3033 btrfs_record_root_in_trans(trans, root); 3034 root = root->reloc_root; 3035 node->new_bytenr = root->node->start; 3036 node->root = root; 3037 list_add_tail(&node->list, &rc->backref_cache.changed); 3038 } else { 3039 path->lowest_level = node->level; 3040 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 3041 btrfs_release_path(path); 3042 if (ret > 0) 3043 ret = 0; 3044 } 3045 if (!ret) 3046 update_processed_blocks(rc, node); 3047 } else { 3048 ret = do_relocation(trans, rc, node, key, path, 1); 3049 } 3050 out: 3051 if (ret || node->level == 0 || node->cowonly) 3052 remove_backref_node(&rc->backref_cache, node); 3053 return ret; 3054 } 3055 3056 /* 3057 * relocate a list of blocks 3058 */ 3059 static noinline_for_stack 3060 int
relocate_tree_blocks(struct btrfs_trans_handle *trans, 3061 struct reloc_control *rc, struct rb_root *blocks) 3062 { 3063 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3064 struct backref_node *node; 3065 struct btrfs_path *path; 3066 struct tree_block *block; 3067 struct tree_block *next; 3068 int ret; 3069 int err = 0; 3070 3071 path = btrfs_alloc_path(); 3072 if (!path) { 3073 err = -ENOMEM; 3074 goto out_free_blocks; 3075 } 3076 3077 /* Kick in readahead for tree blocks with missing keys */ 3078 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3079 if (!block->key_ready) 3080 readahead_tree_block(fs_info, block->bytenr); 3081 } 3082 3083 /* Get first keys */ 3084 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3085 if (!block->key_ready) { 3086 err = get_tree_block_key(fs_info, block); 3087 if (err) 3088 goto out_free_path; 3089 } 3090 } 3091 3092 /* Do tree relocation */ 3093 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 3094 node = build_backref_tree(rc, &block->key, 3095 block->level, block->bytenr); 3096 if (IS_ERR(node)) { 3097 err = PTR_ERR(node); 3098 goto out; 3099 } 3100 3101 ret = relocate_tree_block(trans, rc, node, &block->key, 3102 path); 3103 if (ret < 0) { 3104 if (ret != -EAGAIN || &block->rb_node == rb_first(blocks)) 3105 err = ret; 3106 goto out; 3107 } 3108 } 3109 out: 3110 err = finish_pending_nodes(trans, rc, path, err); 3111 3112 out_free_path: 3113 btrfs_free_path(path); 3114 out_free_blocks: 3115 free_block_list(blocks); 3116 return err; 3117 } 3118 3119 static noinline_for_stack 3120 int prealloc_file_extent_cluster(struct inode *inode, 3121 struct file_extent_cluster *cluster) 3122 { 3123 u64 alloc_hint = 0; 3124 u64 start; 3125 u64 end; 3126 u64 offset = BTRFS_I(inode)->index_cnt; 3127 u64 num_bytes; 3128 int nr = 0; 3129 int ret = 0; 3130 u64 prealloc_start = cluster->start - offset; 3131 u64 prealloc_end = cluster->end - offset; 3132 u64 cur_offset; 3133 struct extent_changeset *data_reserved = NULL; 3134 3135 BUG_ON(cluster->start != cluster->boundary[0]); 3136 inode_lock(inode); 3137 3138 ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start, 3139 prealloc_end + 1 - prealloc_start); 3140 if (ret) 3141 goto out; 3142 3143 cur_offset = prealloc_start; 3144 while (nr < cluster->nr) { 3145 start = cluster->boundary[nr] - offset; 3146 if (nr + 1 < cluster->nr) 3147 end = cluster->boundary[nr + 1] - 1 - offset; 3148 else 3149 end = cluster->end - offset; 3150 3151 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3152 num_bytes = end + 1 - start; 3153 if (cur_offset < start) 3154 btrfs_free_reserved_data_space(inode, data_reserved, 3155 cur_offset, start - cur_offset); 3156 ret = btrfs_prealloc_file_range(inode, 0, start, 3157 num_bytes, num_bytes, 3158 end + 1, &alloc_hint); 3159 cur_offset = end + 1; 3160 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3161 if (ret) 3162 break; 3163 nr++; 3164 } 3165 if (cur_offset < prealloc_end) 3166 btrfs_free_reserved_data_space(inode, data_reserved, 3167 cur_offset, prealloc_end + 1 - cur_offset); 3168 out: 3169 inode_unlock(inode); 3170 extent_changeset_free(data_reserved); 3171 return ret; 3172 } 3173 3174 static noinline_for_stack 3175 int setup_extent_mapping(struct inode *inode, u64 start, u64 end, 3176 u64 block_start) 3177 { 3178 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3179 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 3180 struct extent_map *em; 3181 int ret = 0; 3182 3183 em = 
alloc_extent_map(); 3184 if (!em) 3185 return -ENOMEM; 3186 3187 em->start = start; 3188 em->len = end + 1 - start; 3189 em->block_len = em->len; 3190 em->block_start = block_start; 3191 em->bdev = fs_info->fs_devices->latest_bdev; 3192 set_bit(EXTENT_FLAG_PINNED, &em->flags); 3193 3194 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3195 while (1) { 3196 write_lock(&em_tree->lock); 3197 ret = add_extent_mapping(em_tree, em, 0); 3198 write_unlock(&em_tree->lock); 3199 if (ret != -EEXIST) { 3200 free_extent_map(em); 3201 break; 3202 } 3203 btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); 3204 } 3205 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3206 return ret; 3207 } 3208 3209 static int relocate_file_extent_cluster(struct inode *inode, 3210 struct file_extent_cluster *cluster) 3211 { 3212 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3213 u64 page_start; 3214 u64 page_end; 3215 u64 offset = BTRFS_I(inode)->index_cnt; 3216 unsigned long index; 3217 unsigned long last_index; 3218 struct page *page; 3219 struct file_ra_state *ra; 3220 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 3221 int nr = 0; 3222 int ret = 0; 3223 3224 if (!cluster->nr) 3225 return 0; 3226 3227 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3228 if (!ra) 3229 return -ENOMEM; 3230 3231 ret = prealloc_file_extent_cluster(inode, cluster); 3232 if (ret) 3233 goto out; 3234 3235 file_ra_state_init(ra, inode->i_mapping); 3236 3237 ret = setup_extent_mapping(inode, cluster->start - offset, 3238 cluster->end - offset, cluster->start); 3239 if (ret) 3240 goto out; 3241 3242 index = (cluster->start - offset) >> PAGE_SHIFT; 3243 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3244 while (index <= last_index) { 3245 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3246 PAGE_SIZE); 3247 if (ret) 3248 goto out; 3249 3250 page = find_lock_page(inode->i_mapping, index); 3251 if (!page) { 3252 page_cache_sync_readahead(inode->i_mapping, 3253 ra, NULL, index, 3254 last_index + 1 - index); 3255 page = find_or_create_page(inode->i_mapping, index, 3256 mask); 3257 if (!page) { 3258 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3259 PAGE_SIZE, true); 3260 ret = -ENOMEM; 3261 goto out; 3262 } 3263 } 3264 3265 if (PageReadahead(page)) { 3266 page_cache_async_readahead(inode->i_mapping, 3267 ra, NULL, page, index, 3268 last_index + 1 - index); 3269 } 3270 3271 if (!PageUptodate(page)) { 3272 btrfs_readpage(NULL, page); 3273 lock_page(page); 3274 if (!PageUptodate(page)) { 3275 unlock_page(page); 3276 put_page(page); 3277 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3278 PAGE_SIZE, true); 3279 btrfs_delalloc_release_extents(BTRFS_I(inode), 3280 PAGE_SIZE, true); 3281 ret = -EIO; 3282 goto out; 3283 } 3284 } 3285 3286 page_start = page_offset(page); 3287 page_end = page_start + PAGE_SIZE - 1; 3288 3289 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); 3290 3291 set_page_extent_mapped(page); 3292 3293 if (nr < cluster->nr && 3294 page_start + offset == cluster->boundary[nr]) { 3295 set_extent_bits(&BTRFS_I(inode)->io_tree, 3296 page_start, page_end, 3297 EXTENT_BOUNDARY); 3298 nr++; 3299 } 3300 3301 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, 3302 NULL, 0); 3303 if (ret) { 3304 unlock_page(page); 3305 put_page(page); 3306 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3307 PAGE_SIZE, true); 3308 btrfs_delalloc_release_extents(BTRFS_I(inode), 3309 PAGE_SIZE, true); 3310 3311 clear_extent_bits(&BTRFS_I(inode)->io_tree, 3312 page_start, page_end, 3313 EXTENT_LOCKED | 
EXTENT_BOUNDARY); 3314 goto out; 3315 3316 } 3317 set_page_dirty(page); 3318 3319 unlock_extent(&BTRFS_I(inode)->io_tree, 3320 page_start, page_end); 3321 unlock_page(page); 3322 put_page(page); 3323 3324 index++; 3325 btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, 3326 false); 3327 balance_dirty_pages_ratelimited(inode->i_mapping); 3328 btrfs_throttle(fs_info); 3329 } 3330 WARN_ON(nr != cluster->nr); 3331 out: 3332 kfree(ra); 3333 return ret; 3334 } 3335 3336 static noinline_for_stack 3337 int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, 3338 struct file_extent_cluster *cluster) 3339 { 3340 int ret; 3341 3342 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3343 ret = relocate_file_extent_cluster(inode, cluster); 3344 if (ret) 3345 return ret; 3346 cluster->nr = 0; 3347 } 3348 3349 if (!cluster->nr) 3350 cluster->start = extent_key->objectid; 3351 else 3352 BUG_ON(cluster->nr >= MAX_EXTENTS); 3353 cluster->end = extent_key->objectid + extent_key->offset - 1; 3354 cluster->boundary[cluster->nr] = extent_key->objectid; 3355 cluster->nr++; 3356 3357 if (cluster->nr >= MAX_EXTENTS) { 3358 ret = relocate_file_extent_cluster(inode, cluster); 3359 if (ret) 3360 return ret; 3361 cluster->nr = 0; 3362 } 3363 return 0; 3364 } 3365 3366 /* 3367 * helper to add a tree block to the list. 3368 * the major work is getting the generation and level of the block 3369 */ 3370 static int add_tree_block(struct reloc_control *rc, 3371 struct btrfs_key *extent_key, 3372 struct btrfs_path *path, 3373 struct rb_root *blocks) 3374 { 3375 struct extent_buffer *eb; 3376 struct btrfs_extent_item *ei; 3377 struct btrfs_tree_block_info *bi; 3378 struct tree_block *block; 3379 struct rb_node *rb_node; 3380 u32 item_size; 3381 int level = -1; 3382 u64 generation; 3383 3384 eb = path->nodes[0]; 3385 item_size = btrfs_item_size_nr(eb, path->slots[0]); 3386 3387 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3388 item_size >= sizeof(*ei) + sizeof(*bi)) { 3389 ei = btrfs_item_ptr(eb, path->slots[0], 3390 struct btrfs_extent_item); 3391 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3392 bi = (struct btrfs_tree_block_info *)(ei + 1); 3393 level = btrfs_tree_block_level(eb, bi); 3394 } else { 3395 level = (int)extent_key->offset; 3396 } 3397 generation = btrfs_extent_generation(eb, ei); 3398 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 3399 btrfs_print_v0_err(eb->fs_info); 3400 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3401 return -EINVAL; 3402 } else { 3403 BUG(); 3404 } 3405 3406 btrfs_release_path(path); 3407 3408 BUG_ON(level == -1); 3409 3410 block = kmalloc(sizeof(*block), GFP_NOFS); 3411 if (!block) 3412 return -ENOMEM; 3413 3414 block->bytenr = extent_key->objectid; 3415 block->key.objectid = rc->extent_root->fs_info->nodesize; 3416 block->key.offset = generation; 3417 block->level = level; 3418 block->key_ready = 0; 3419 3420 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); 3421 if (rb_node) 3422 backref_tree_panic(rb_node, -EEXIST, block->bytenr); 3423 3424 return 0; 3425 } 3426 3427 /* 3428 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3429 */ 3430 static int __add_tree_block(struct reloc_control *rc, 3431 u64 bytenr, u32 blocksize, 3432 struct rb_root *blocks) 3433 { 3434 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3435 struct btrfs_path *path; 3436 struct btrfs_key key; 3437 int ret; 3438 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3439 3440 if 
(tree_block_processed(bytenr, rc)) 3441 return 0; 3442 3443 if (tree_search(blocks, bytenr)) 3444 return 0; 3445 3446 path = btrfs_alloc_path(); 3447 if (!path) 3448 return -ENOMEM; 3449 again: 3450 key.objectid = bytenr; 3451 if (skinny) { 3452 key.type = BTRFS_METADATA_ITEM_KEY; 3453 key.offset = (u64)-1; 3454 } else { 3455 key.type = BTRFS_EXTENT_ITEM_KEY; 3456 key.offset = blocksize; 3457 } 3458 3459 path->search_commit_root = 1; 3460 path->skip_locking = 1; 3461 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3462 if (ret < 0) 3463 goto out; 3464 3465 if (ret > 0 && skinny) { 3466 if (path->slots[0]) { 3467 path->slots[0]--; 3468 btrfs_item_key_to_cpu(path->nodes[0], &key, 3469 path->slots[0]); 3470 if (key.objectid == bytenr && 3471 (key.type == BTRFS_METADATA_ITEM_KEY || 3472 (key.type == BTRFS_EXTENT_ITEM_KEY && 3473 key.offset == blocksize))) 3474 ret = 0; 3475 } 3476 3477 if (ret) { 3478 skinny = false; 3479 btrfs_release_path(path); 3480 goto again; 3481 } 3482 } 3483 if (ret) { 3484 ASSERT(ret == 1); 3485 btrfs_print_leaf(path->nodes[0]); 3486 btrfs_err(fs_info, 3487 "tree block extent item (%llu) is not found in extent tree", 3488 bytenr); 3489 WARN_ON(1); 3490 ret = -EINVAL; 3491 goto out; 3492 } 3493 3494 ret = add_tree_block(rc, &key, path, blocks); 3495 out: 3496 btrfs_free_path(path); 3497 return ret; 3498 } 3499 3500 /* 3501 * helper to check if the block uses full backrefs for the pointers in it 3502 */ 3503 static int block_use_full_backref(struct reloc_control *rc, 3504 struct extent_buffer *eb) 3505 { 3506 u64 flags; 3507 int ret; 3508 3509 if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || 3510 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3511 return 1; 3512 3513 ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info, 3514 eb->start, btrfs_header_level(eb), 1, 3515 NULL, &flags); 3516 BUG_ON(ret); 3517 3518 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3519 ret = 1; 3520 else 3521 ret = 0; 3522 return ret; 3523 } 3524 3525 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3526 struct btrfs_block_group_cache *block_group, 3527 struct inode *inode, 3528 u64 ino) 3529 { 3530 struct btrfs_key key; 3531 struct btrfs_root *root = fs_info->tree_root; 3532 struct btrfs_trans_handle *trans; 3533 int ret = 0; 3534 3535 if (inode) 3536 goto truncate; 3537 3538 key.objectid = ino; 3539 key.type = BTRFS_INODE_ITEM_KEY; 3540 key.offset = 0; 3541 3542 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 3543 if (IS_ERR(inode)) 3544 return -ENOENT; 3545 3546 truncate: 3547 ret = btrfs_check_trunc_cache_free_space(fs_info, 3548 &fs_info->global_block_rsv); 3549 if (ret) 3550 goto out; 3551 3552 trans = btrfs_join_transaction(root); 3553 if (IS_ERR(trans)) { 3554 ret = PTR_ERR(trans); 3555 goto out; 3556 } 3557 3558 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3559 3560 btrfs_end_transaction(trans); 3561 btrfs_btree_balance_dirty(fs_info); 3562 out: 3563 iput(inode); 3564 return ret; 3565 } 3566 3567 /* 3568 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY 3569 * this function scans the fs tree to find blocks that reference the data extent 3570 */ 3571 static int find_data_references(struct reloc_control *rc, 3572 struct btrfs_key *extent_key, 3573 struct extent_buffer *leaf, 3574 struct btrfs_extent_data_ref *ref, 3575 struct rb_root *blocks) 3576 { 3577 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3578 struct btrfs_path *path; 3579 struct tree_block *block; 3580 struct btrfs_root *root;
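	/*
	 * Illustrative example (editor's note, hypothetical numbers): for a
	 * data backref item (root=257, objectid=258, offset=0, count=3) the
	 * loop below starts at key (258, BTRFS_EXTENT_DATA_KEY, 0) in root
	 * 257 and walks forward until all three references to the data
	 * extent have been accounted for, queueing each leaf that holds one.
	 */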
3581 struct btrfs_file_extent_item *fi; 3582 struct rb_node *rb_node; 3583 struct btrfs_key key; 3584 u64 ref_root; 3585 u64 ref_objectid; 3586 u64 ref_offset; 3587 u32 ref_count; 3588 u32 nritems; 3589 int err = 0; 3590 int added = 0; 3591 int counted; 3592 int ret; 3593 3594 ref_root = btrfs_extent_data_ref_root(leaf, ref); 3595 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3596 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3597 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3598 3599 /* 3600 * This is an extent belonging to the free space cache, let's just delete 3601 * it and redo the search. 3602 */ 3603 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3604 ret = delete_block_group_cache(fs_info, rc->block_group, 3605 NULL, ref_objectid); 3606 if (ret != -ENOENT) 3607 return ret; 3608 ret = 0; 3609 } 3610 3611 path = btrfs_alloc_path(); 3612 if (!path) 3613 return -ENOMEM; 3614 path->reada = READA_FORWARD; 3615 3616 root = read_fs_root(fs_info, ref_root); 3617 if (IS_ERR(root)) { 3618 err = PTR_ERR(root); 3619 goto out; 3620 } 3621 3622 key.objectid = ref_objectid; 3623 key.type = BTRFS_EXTENT_DATA_KEY; 3624 if (ref_offset > ((u64)-1 << 32)) 3625 key.offset = 0; 3626 else 3627 key.offset = ref_offset; 3628 3629 path->search_commit_root = 1; 3630 path->skip_locking = 1; 3631 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3632 if (ret < 0) { 3633 err = ret; 3634 goto out; 3635 } 3636 3637 leaf = path->nodes[0]; 3638 nritems = btrfs_header_nritems(leaf); 3639 /* 3640 * the references in tree blocks that use full backrefs 3641 * are not counted 3642 */ 3643 if (block_use_full_backref(rc, leaf)) 3644 counted = 0; 3645 else 3646 counted = 1; 3647 rb_node = tree_search(blocks, leaf->start); 3648 if (rb_node) { 3649 if (counted) 3650 added = 1; 3651 else 3652 path->slots[0] = nritems; 3653 } 3654 3655 while (ref_count > 0) { 3656 while (path->slots[0] >= nritems) { 3657 ret = btrfs_next_leaf(root, path); 3658 if (ret < 0) { 3659 err = ret; 3660 goto out; 3661 } 3662 if (WARN_ON(ret > 0)) 3663 goto out; 3664 3665 leaf = path->nodes[0]; 3666 nritems = btrfs_header_nritems(leaf); 3667 added = 0; 3668 3669 if (block_use_full_backref(rc, leaf)) 3670 counted = 0; 3671 else 3672 counted = 1; 3673 rb_node = tree_search(blocks, leaf->start); 3674 if (rb_node) { 3675 if (counted) 3676 added = 1; 3677 else 3678 path->slots[0] = nritems; 3679 } 3680 } 3681 3682 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3683 if (WARN_ON(key.objectid != ref_objectid || 3684 key.type != BTRFS_EXTENT_DATA_KEY)) 3685 break; 3686 3687 fi = btrfs_item_ptr(leaf, path->slots[0], 3688 struct btrfs_file_extent_item); 3689 3690 if (btrfs_file_extent_type(leaf, fi) == 3691 BTRFS_FILE_EXTENT_INLINE) 3692 goto next; 3693 3694 if (btrfs_file_extent_disk_bytenr(leaf, fi) != 3695 extent_key->objectid) 3696 goto next; 3697 3698 key.offset -= btrfs_file_extent_offset(leaf, fi); 3699 if (key.offset != ref_offset) 3700 goto next; 3701 3702 if (counted) 3703 ref_count--; 3704 if (added) 3705 goto next; 3706 3707 if (!tree_block_processed(leaf->start, rc)) { 3708 block = kmalloc(sizeof(*block), GFP_NOFS); 3709 if (!block) { 3710 err = -ENOMEM; 3711 break; 3712 } 3713 block->bytenr = leaf->start; 3714 btrfs_item_key_to_cpu(leaf, &block->key, 0); 3715 block->level = 0; 3716 block->key_ready = 1; 3717 rb_node = tree_insert(blocks, block->bytenr, 3718 &block->rb_node); 3719 if (rb_node) 3720 backref_tree_panic(rb_node, -EEXIST, 3721 block->bytenr); 3722 } 3723 if (counted) 3724 added = 1; 3725 else 3726
path->slots[0] = nritems; 3727 next: 3728 path->slots[0]++; 3729 3730 } 3731 out: 3732 btrfs_free_path(path); 3733 return err; 3734 } 3735 3736 /* 3737 * helper to find all tree blocks that reference a given data extent 3738 */ 3739 static noinline_for_stack 3740 int add_data_references(struct reloc_control *rc, 3741 struct btrfs_key *extent_key, 3742 struct btrfs_path *path, 3743 struct rb_root *blocks) 3744 { 3745 struct btrfs_key key; 3746 struct extent_buffer *eb; 3747 struct btrfs_extent_data_ref *dref; 3748 struct btrfs_extent_inline_ref *iref; 3749 unsigned long ptr; 3750 unsigned long end; 3751 u32 blocksize = rc->extent_root->fs_info->nodesize; 3752 int ret = 0; 3753 int err = 0; 3754 3755 eb = path->nodes[0]; 3756 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3757 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3758 ptr += sizeof(struct btrfs_extent_item); 3759 3760 while (ptr < end) { 3761 iref = (struct btrfs_extent_inline_ref *)ptr; 3762 key.type = btrfs_get_extent_inline_ref_type(eb, iref, 3763 BTRFS_REF_TYPE_DATA); 3764 if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 3765 key.offset = btrfs_extent_inline_ref_offset(eb, iref); 3766 ret = __add_tree_block(rc, key.offset, blocksize, 3767 blocks); 3768 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 3769 dref = (struct btrfs_extent_data_ref *)(&iref->offset); 3770 ret = find_data_references(rc, extent_key, 3771 eb, dref, blocks); 3772 } else { 3773 ret = -EUCLEAN; 3774 btrfs_err(rc->extent_root->fs_info, 3775 "extent %llu slot %d has an invalid inline ref type", 3776 eb->start, path->slots[0]); 3777 } 3778 if (ret) { 3779 err = ret; 3780 goto out; 3781 } 3782 ptr += btrfs_extent_inline_ref_size(key.type); 3783 } 3784 WARN_ON(ptr > end); 3785 3786 while (1) { 3787 cond_resched(); 3788 eb = path->nodes[0]; 3789 if (path->slots[0] >= btrfs_header_nritems(eb)) { 3790 ret = btrfs_next_leaf(rc->extent_root, path); 3791 if (ret < 0) { 3792 err = ret; 3793 break; 3794 } 3795 if (ret > 0) 3796 break; 3797 eb = path->nodes[0]; 3798 } 3799 3800 btrfs_item_key_to_cpu(eb, &key, path->slots[0]); 3801 if (key.objectid != extent_key->objectid) 3802 break; 3803 3804 if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 3805 ret = __add_tree_block(rc, key.offset, blocksize, 3806 blocks); 3807 } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 3808 dref = btrfs_item_ptr(eb, path->slots[0], 3809 struct btrfs_extent_data_ref); 3810 ret = find_data_references(rc, extent_key, 3811 eb, dref, blocks); 3812 } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { 3813 btrfs_print_v0_err(eb->fs_info); 3814 btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); 3815 ret = -EINVAL; 3816 } else { 3817 ret = 0; 3818 } 3819 if (ret) { 3820 err = ret; 3821 break; 3822 } 3823 path->slots[0]++; 3824 } 3825 out: 3826 btrfs_release_path(path); 3827 if (err) 3828 free_block_list(blocks); 3829 return err; 3830 } 3831 3832 /* 3833 * helper to find next unprocessed extent 3834 */ 3835 static noinline_for_stack 3836 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3837 struct btrfs_key *extent_key) 3838 { 3839 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3840 struct btrfs_key key; 3841 struct extent_buffer *leaf; 3842 u64 start, end, last; 3843 int ret; 3844 3845 last = rc->block_group->key.objectid + rc->block_group->key.offset; 3846 while (1) { 3847 cond_resched(); 3848 if (rc->search_start >= last) { 3849 ret = 1; 3850 break; 3851 } 3852 3853 key.objectid = rc->search_start; 3854 key.type = BTRFS_EXTENT_ITEM_KEY; 3855 key.offset = 
0; 3856 3857 path->search_commit_root = 1; 3858 path->skip_locking = 1; 3859 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3860 0, 0); 3861 if (ret < 0) 3862 break; 3863 next: 3864 leaf = path->nodes[0]; 3865 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3866 ret = btrfs_next_leaf(rc->extent_root, path); 3867 if (ret != 0) 3868 break; 3869 leaf = path->nodes[0]; 3870 } 3871 3872 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3873 if (key.objectid >= last) { 3874 ret = 1; 3875 break; 3876 } 3877 3878 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3879 key.type != BTRFS_METADATA_ITEM_KEY) { 3880 path->slots[0]++; 3881 goto next; 3882 } 3883 3884 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3885 key.objectid + key.offset <= rc->search_start) { 3886 path->slots[0]++; 3887 goto next; 3888 } 3889 3890 if (key.type == BTRFS_METADATA_ITEM_KEY && 3891 key.objectid + fs_info->nodesize <= 3892 rc->search_start) { 3893 path->slots[0]++; 3894 goto next; 3895 } 3896 3897 ret = find_first_extent_bit(&rc->processed_blocks, 3898 key.objectid, &start, &end, 3899 EXTENT_DIRTY, NULL); 3900 3901 if (ret == 0 && start <= key.objectid) { 3902 btrfs_release_path(path); 3903 rc->search_start = end + 1; 3904 } else { 3905 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3906 rc->search_start = key.objectid + key.offset; 3907 else 3908 rc->search_start = key.objectid + 3909 fs_info->nodesize; 3910 memcpy(extent_key, &key, sizeof(key)); 3911 return 0; 3912 } 3913 } 3914 btrfs_release_path(path); 3915 return ret; 3916 } 3917 3918 static void set_reloc_control(struct reloc_control *rc) 3919 { 3920 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3921 3922 mutex_lock(&fs_info->reloc_mutex); 3923 fs_info->reloc_ctl = rc; 3924 mutex_unlock(&fs_info->reloc_mutex); 3925 } 3926 3927 static void unset_reloc_control(struct reloc_control *rc) 3928 { 3929 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3930 3931 mutex_lock(&fs_info->reloc_mutex); 3932 fs_info->reloc_ctl = NULL; 3933 mutex_unlock(&fs_info->reloc_mutex); 3934 } 3935 3936 static int check_extent_flags(u64 flags) 3937 { 3938 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3939 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3940 return 1; 3941 if (!(flags & BTRFS_EXTENT_FLAG_DATA) && 3942 !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) 3943 return 1; 3944 if ((flags & BTRFS_EXTENT_FLAG_DATA) && 3945 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 3946 return 1; 3947 return 0; 3948 } 3949 3950 static noinline_for_stack 3951 int prepare_to_relocate(struct reloc_control *rc) 3952 { 3953 struct btrfs_trans_handle *trans; 3954 int ret; 3955 3956 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3957 BTRFS_BLOCK_RSV_TEMP); 3958 if (!rc->block_rsv) 3959 return -ENOMEM; 3960 3961 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3962 rc->search_start = rc->block_group->key.objectid; 3963 rc->extents_found = 0; 3964 rc->nodes_relocated = 0; 3965 rc->merging_rsv_size = 0; 3966 rc->reserved_bytes = 0; 3967 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3968 RELOCATION_RESERVED_NODES; 3969 ret = btrfs_block_rsv_refill(rc->extent_root, 3970 rc->block_rsv, rc->block_rsv->size, 3971 BTRFS_RESERVE_FLUSH_ALL); 3972 if (ret) 3973 return ret; 3974 3975 rc->create_reloc_tree = 1; 3976 set_reloc_control(rc); 3977 3978 trans = btrfs_join_transaction(rc->extent_root); 3979 if (IS_ERR(trans)) { 3980 unset_reloc_control(rc); 3981 /* 3982 * extent tree is not a ref_cow tree and has no reloc_root to 3983 * cleanup. And callers are responsible to free the above 3984 * block rsv. 
3985 */ 3986 return PTR_ERR(trans); 3987 } 3988 btrfs_commit_transaction(trans); 3989 return 0; 3990 } 3991 3992 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3993 { 3994 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3995 struct rb_root blocks = RB_ROOT; 3996 struct btrfs_key key; 3997 struct btrfs_trans_handle *trans = NULL; 3998 struct btrfs_path *path; 3999 struct btrfs_extent_item *ei; 4000 u64 flags; 4001 u32 item_size; 4002 int ret; 4003 int err = 0; 4004 int progress = 0; 4005 4006 path = btrfs_alloc_path(); 4007 if (!path) 4008 return -ENOMEM; 4009 path->reada = READA_FORWARD; 4010 4011 ret = prepare_to_relocate(rc); 4012 if (ret) { 4013 err = ret; 4014 goto out_free; 4015 } 4016 4017 while (1) { 4018 rc->reserved_bytes = 0; 4019 ret = btrfs_block_rsv_refill(rc->extent_root, 4020 rc->block_rsv, rc->block_rsv->size, 4021 BTRFS_RESERVE_FLUSH_ALL); 4022 if (ret) { 4023 err = ret; 4024 break; 4025 } 4026 progress++; 4027 trans = btrfs_start_transaction(rc->extent_root, 0); 4028 if (IS_ERR(trans)) { 4029 err = PTR_ERR(trans); 4030 trans = NULL; 4031 break; 4032 } 4033 restart: 4034 if (update_backref_cache(trans, &rc->backref_cache)) { 4035 btrfs_end_transaction(trans); 4036 trans = NULL; 4037 continue; 4038 } 4039 4040 ret = find_next_extent(rc, path, &key); 4041 if (ret < 0) 4042 err = ret; 4043 if (ret != 0) 4044 break; 4045 4046 rc->extents_found++; 4047 4048 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 4049 struct btrfs_extent_item); 4050 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 4051 if (item_size >= sizeof(*ei)) { 4052 flags = btrfs_extent_flags(path->nodes[0], ei); 4053 ret = check_extent_flags(flags); 4054 BUG_ON(ret); 4055 } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { 4056 err = -EINVAL; 4057 btrfs_print_v0_err(trans->fs_info); 4058 btrfs_abort_transaction(trans, err); 4059 break; 4060 } else { 4061 BUG(); 4062 } 4063 4064 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 4065 ret = add_tree_block(rc, &key, path, &blocks); 4066 } else if (rc->stage == UPDATE_DATA_PTRS && 4067 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4068 ret = add_data_references(rc, &key, path, &blocks); 4069 } else { 4070 btrfs_release_path(path); 4071 ret = 0; 4072 } 4073 if (ret < 0) { 4074 err = ret; 4075 break; 4076 } 4077 4078 if (!RB_EMPTY_ROOT(&blocks)) { 4079 ret = relocate_tree_blocks(trans, rc, &blocks); 4080 if (ret < 0) { 4081 /* 4082 * if we fail to relocate tree blocks, force an update of the 4083 * backref cache when committing the transaction.
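				 * Backing last_trans up by one (the
				 * assignment below) makes the cache look out
				 * of date, so the next update_backref_cache()
				 * call in the restart path detaches the
				 * cached nodes and rebuilds them rather than
				 * trusting stale state.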
4084 */ 4085 rc->backref_cache.last_trans = trans->transid - 1; 4086 4087 if (ret != -EAGAIN) { 4088 err = ret; 4089 break; 4090 } 4091 rc->extents_found--; 4092 rc->search_start = key.objectid; 4093 } 4094 } 4095 4096 btrfs_end_transaction_throttle(trans); 4097 btrfs_btree_balance_dirty(fs_info); 4098 trans = NULL; 4099 4100 if (rc->stage == MOVE_DATA_EXTENTS && 4101 (flags & BTRFS_EXTENT_FLAG_DATA)) { 4102 rc->found_file_extent = 1; 4103 ret = relocate_data_extent(rc->data_inode, 4104 &key, &rc->cluster); 4105 if (ret < 0) { 4106 err = ret; 4107 break; 4108 } 4109 } 4110 } 4111 if (trans && progress && err == -ENOSPC) { 4112 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 4113 if (ret == 1) { 4114 err = 0; 4115 progress = 0; 4116 goto restart; 4117 } 4118 } 4119 4120 btrfs_release_path(path); 4121 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 4122 4123 if (trans) { 4124 btrfs_end_transaction_throttle(trans); 4125 btrfs_btree_balance_dirty(fs_info); 4126 } 4127 4128 if (!err) { 4129 ret = relocate_file_extent_cluster(rc->data_inode, 4130 &rc->cluster); 4131 if (ret < 0) 4132 err = ret; 4133 } 4134 4135 rc->create_reloc_tree = 0; 4136 set_reloc_control(rc); 4137 4138 backref_cache_cleanup(&rc->backref_cache); 4139 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); 4140 4141 err = prepare_to_merge(rc, err); 4142 4143 merge_reloc_roots(rc); 4144 4145 rc->merge_reloc_tree = 0; 4146 unset_reloc_control(rc); 4147 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); 4148 4149 /* get rid of pinned extents */ 4150 trans = btrfs_join_transaction(rc->extent_root); 4151 if (IS_ERR(trans)) { 4152 err = PTR_ERR(trans); 4153 goto out_free; 4154 } 4155 btrfs_commit_transaction(trans); 4156 ret = clean_dirty_subvols(rc); 4157 if (ret < 0 && !err) 4158 err = ret; 4159 out_free: 4160 btrfs_free_block_rsv(fs_info, rc->block_rsv); 4161 btrfs_free_path(path); 4162 return err; 4163 } 4164 4165 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 4166 struct btrfs_root *root, u64 objectid) 4167 { 4168 struct btrfs_path *path; 4169 struct btrfs_inode_item *item; 4170 struct extent_buffer *leaf; 4171 int ret; 4172 4173 path = btrfs_alloc_path(); 4174 if (!path) 4175 return -ENOMEM; 4176 4177 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 4178 if (ret) 4179 goto out; 4180 4181 leaf = path->nodes[0]; 4182 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 4183 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 4184 btrfs_set_inode_generation(leaf, item, 1); 4185 btrfs_set_inode_size(leaf, item, 0); 4186 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 4187 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4188 BTRFS_INODE_PREALLOC); 4189 btrfs_mark_buffer_dirty(leaf); 4190 out: 4191 btrfs_free_path(path); 4192 return ret; 4193 } 4194 4195 /* 4196 * helper to create inode for data relocation. 
4197 * the inode is in data relocation tree and its link count is 0 4198 */ 4199 static noinline_for_stack 4200 struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 4201 struct btrfs_block_group_cache *group) 4202 { 4203 struct inode *inode = NULL; 4204 struct btrfs_trans_handle *trans; 4205 struct btrfs_root *root; 4206 struct btrfs_key key; 4207 u64 objectid; 4208 int err = 0; 4209 4210 root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); 4211 if (IS_ERR(root)) 4212 return ERR_CAST(root); 4213 4214 trans = btrfs_start_transaction(root, 6); 4215 if (IS_ERR(trans)) 4216 return ERR_CAST(trans); 4217 4218 err = btrfs_find_free_objectid(root, &objectid); 4219 if (err) 4220 goto out; 4221 4222 err = __insert_orphan_inode(trans, root, objectid); 4223 BUG_ON(err); 4224 4225 key.objectid = objectid; 4226 key.type = BTRFS_INODE_ITEM_KEY; 4227 key.offset = 0; 4228 inode = btrfs_iget(fs_info->sb, &key, root, NULL); 4229 BUG_ON(IS_ERR(inode)); 4230 BTRFS_I(inode)->index_cnt = group->key.objectid; 4231 4232 err = btrfs_orphan_add(trans, BTRFS_I(inode)); 4233 out: 4234 btrfs_end_transaction(trans); 4235 btrfs_btree_balance_dirty(fs_info); 4236 if (err) { 4237 if (inode) 4238 iput(inode); 4239 inode = ERR_PTR(err); 4240 } 4241 return inode; 4242 } 4243 4244 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 4245 { 4246 struct reloc_control *rc; 4247 4248 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4249 if (!rc) 4250 return NULL; 4251 4252 INIT_LIST_HEAD(&rc->reloc_roots); 4253 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 4254 backref_cache_init(&rc->backref_cache); 4255 mapping_tree_init(&rc->reloc_root_tree); 4256 extent_io_tree_init(fs_info, &rc->processed_blocks, 4257 IO_TREE_RELOC_BLOCKS, NULL); 4258 return rc; 4259 } 4260 4261 /* 4262 * Print the block group being relocated 4263 */ 4264 static void describe_relocation(struct btrfs_fs_info *fs_info, 4265 struct btrfs_block_group_cache *block_group) 4266 { 4267 char buf[128] = {'\0'}; 4268 4269 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 4270 4271 btrfs_info(fs_info, 4272 "relocating block group %llu flags %s", 4273 block_group->key.objectid, buf); 4274 } 4275 4276 /* 4277 * function to relocate all extents in a block group. 
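 * Relocation proceeds in two stages (driven by the loop below):
 * MOVE_DATA_EXTENTS copies data into the relocation inode and moves tree
 * blocks, then UPDATE_DATA_PTRS rewrites the file extent pointers to the
 * new locations; the loop repeats until no extent in the block group is
 * left unprocessed.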
/*
 * function to relocate all extents in a block group.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group_cache *bg;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(fs_info, rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX,
				 rc->block_group->key.objectid,
				 rc->block_group->key.offset);

	while (1) {
		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;
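		/*
		 * Added commentary (not from the original source): the first
		 * pass runs with rc->stage == MOVE_DATA_EXTENTS and copies
		 * data into the reloc inode; once file extents have been
		 * found, the block below flips the stage to UPDATE_DATA_PTRS
		 * so the following passes rewrite file extent pointers to the
		 * new locations, looping until no extents remain.
		 */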
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents. If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block. Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
	btrfs_put_block_group(rc->block_group);
	kfree(rc);
	return err;
}

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	root->root_item.drop_level = 0;
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}

/*
 * recover relocation interrupted by system crash.
 *
 * this function resumes merging reloc trees with corresponding fs trees.
 * this is important for preserving the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_fs_root(root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = read_fs_root(fs_info,
					       reloc_root->root_key.offset);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);
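	/*
	 * Added commentary (not from the original source): at this point
	 * reloc_roots holds every reloc tree root found in the tree of tree
	 * roots. Reloc roots whose subvolume no longer exists had their root
	 * refs cleared by mark_garbage_root() above, so the loop below queues
	 * them to be dropped instead of merged.
	 */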
	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	rc->extent_root = fs_info->extent_root;

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		unset_reloc_control(rc);
		err = PTR_ERR(trans);
		goto out_free;
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			goto out_free;
		}

		err = __add_reloc_root(reloc_root);
		BUG_ON(err < 0); /* -ENOMEM or logic error */
		fs_root->reloc_root = reloc_root;
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_free;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}
	err = btrfs_commit_transaction(trans);

	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_free:
	kfree(rc);
out:
	if (!list_empty(&reloc_roots))
		free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* cleanup orphan inode in data relocation tree */
		fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
		if (IS_ERR(fs_root))
			err = PTR_ERR(fs_root);
		else
			err = btrfs_orphan_cleanup(fs_root);
	}
	return err;
}

/*
 * helper to add ordered checksums for data relocation.
 *
 * cloning the existing checksums properly handles the nodatasum extents.
 * it also saves the CPU time that would be spent re-calculating them.
 */
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_sum *sums;
	struct btrfs_ordered_extent *ordered;
	int ret;
	u64 disk_bytenr;
	u64 new_bytenr;
	LIST_HEAD(list);

	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);

	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
				       disk_bytenr + len - 1, &list, 0);
	if (ret)
		goto out;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
		sums->bytenr = new_bytenr;
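		/*
		 * Worked example (added commentary, illustrative numbers):
		 * if the csums were recorded for a prealloc extent at
		 * disk_bytenr 40M and this csum entry covers the bytes at
		 * 40M + 64K, while the relocated copy starts at
		 * ordered->start == 100M, then the entry is rebased to
		 * 100M + (40M + 64K - 40M) == 100M + 64K.
		 */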

		btrfs_add_ordered_sum(ordered, sums);
	}
out:
	btrfs_put_ordered_extent(ordered);
	return ret;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (buf == root->node)
			__update_reloc_root(root, cow->start);
	}

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		drop_node_buffer(node);
		extent_buffer_get(cow);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			__mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * called before creating snapshot. it calculates metadata reservation
 * required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!root->reloc_root || !rc)
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * relocation is in the stage of merging trees. the space
	 * used by merging a reloc tree is twice the size of
	 * relocated tree nodes in the worst case. half for cowing
	 * the reloc tree, half for cowing the fs tree. the space
	 * used by cowing the reloc tree will be freed after the
	 * tree is dropped. if we create snapshot, cowing the fs
	 * tree may use more space than it frees. so we need to
	 * reserve extra space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}
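/*
 * Worked example (added commentary, illustrative numbers): with
 * rc->nodes_relocated == 16M, merging a reloc tree may temporarily need up
 * to 32M of metadata space (16M to COW the reloc tree, 16M to COW the fs
 * tree), so a snapshot created while the merge is pending reserves an
 * extra 16M above for the fs tree COWs that the snapshot can trigger.
 */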
/*
 * called after snapshot is created. migrate block reservation
 * and create reloc root for the newly created snapshot.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!root->reloc_root || !rc)
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root,
				       new_root->root_key.objectid);
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	BUG_ON(ret < 0);
	new_root->reloc_root = reloc_root;

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}
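/*
 * Sketch of how the two snapshot hooks above are driven (added commentary,
 * not from the original source; simplified from create_pending_snapshot()
 * in transaction.c):
 *
 *	btrfs_reloc_pre_snapshot(pending, &to_reserve);
 *	... reserve to_reserve extra bytes in pending->block_rsv ...
 *	... create and commit the new snapshot root ...
 *	ret = btrfs_reloc_post_snapshot(trans, pending);
 */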