// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. A caller that expects
 * to record tree modifications must therefore make sure that elem->seq is
 * zero before calling btrfs_get_tree_mod_seq().
 * Returns a fresh, unused tree mod log sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

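/*
 * Sketch of typical usage of the two functions above, following the shape of
 * existing callers (e.g. the backref walking code): take a sequence number to
 * block pruning of the log, do the time-travel work at that point in time,
 * then drop the blocker again.
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	(... search old roots / rewind buffers at @seq ...)
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */
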
/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise it
 * returns false with the tree_mod_log_lock write locked; the caller must keep
 * holding the lock until all tree mod log insertions are recorded in the rb
 * tree and then write unlock fs_info::tree_mod_log_lock.
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log(), but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

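/*
 * Note on the pattern used by the insertion helpers in this file: tree mod
 * elements are allocated *before* calling tree_mod_dont_log(), because
 * fs_info->tree_mod_log_lock is a (spinning) rwlock and the GFP_NOFS
 * allocations may sleep, so they must not happen with the lock held. The
 * allocation may therefore turn out to be unnecessary, in which case the
 * element is simply freed again, as in btrfs_tree_mod_log_insert_key() above.
 */
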
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = true;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

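/*
 * Rollback note for the free_tms error paths in
 * btrfs_tree_mod_log_insert_move() above and btrfs_tree_mod_log_eb_copy()
 * below: alloc_tree_mod_elem() initializes the rb_node with RB_CLEAR_NODE(),
 * so the cleanup code can use RB_EMPTY_NODE() to distinguish elements that
 * were already inserted into the rb tree (and must be erased again) from
 * elements that were only allocated (and can be freed directly).
 */
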
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}

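/*
 * Example of the two search flavours above: if the log holds elements for a
 * given block with sequence numbers 5, 7 and 9, then with min_seq == 6
 * tree_mod_log_search_oldest() returns the element with seq 7 (the smallest
 * seq that is not below min_seq), while tree_mod_log_search() returns the
 * element with seq 9 (the largest one). The element with seq 5 is ignored by
 * both.
 */
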
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = true;

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

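/*
 * Example of what btrfs_tree_mod_log_eb_copy() records (made-up numbers):
 * copying nr_items == 2 items from src slots 3-4 to dst slots 0-1 logs, in
 * order, KEY_REMOVE for src slot 3, KEY_ADD for dst slot 0, KEY_REMOVE for
 * src slot 4 and KEY_ADD for dst slot 1. A later rewind undoes the ADDs on
 * the destination node and the REMOVEs on the source node, restoring both
 * pre-copy states.
 */
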
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

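/*
 * Illustration of the walk above (made-up chain): if root A was replaced by
 * B and B later by C, the log holds ROOT_REPLACE elements keyed at C (with
 * old_root = B) and at B (with old_root = A). Starting at C the loop hops
 * C -> B -> A and stops at A, returning either A's oldest non-replace
 * element or, if A has no elements at all (e.g. an old root at level 0), the
 * last ROOT_REPLACE element that was found.
 */
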
/*
 * @first_tm points to the first operation to rewind within @eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

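/*
 * Concrete example of the inversion done above (made-up element): a logged
 * BTRFS_MOD_LOG_KEY_ADD for slot 5 is undone by decrementing the item count,
 * while a logged BTRFS_MOD_LOG_KEY_REMOVE for slot 5 is undone by writing
 * the saved key, block pointer and generation back into slot 5 and
 * incrementing the item count. A MOVE_KEYS element is undone by moving the
 * items from dst_slot back to the original source slot.
 */
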
/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

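/*
 * Sketch of how the rewind is consumed, modeled on the time-travel search in
 * ctree.c (btrfs_search_old_slot()): after reading and locking a node while
 * walking down the tree, the caller swaps it for its rewound version before
 * inspecting its items:
 *
 *	eb = btrfs_tree_mod_log_rewind(fs_info, path, eb, time_seq);
 *	if (!eb)
 *		return -ENOMEM;
 *	path->nodes[level] = eb;
 */
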
/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there
 * are no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}

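/*
 * Sketch of how the lowest sequence number is consumed elsewhere (modeled on
 * the delayed ref code): an operation tagged with sequence number @seq must
 * be postponed while any tree mod log user holds an older (lower) sequence
 * number, since that user may still need to see the pre-operation state:
 *
 *	u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
 *
 *	if (min_seq != 0 && seq >= min_seq)
 *		(postpone the operation)
 */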