// SPDX-License-Identifier: GPL-2.0

#include "tree-mod-log.h"
#include "disk-io.h"

/*
 * Snapshot of a root node taken when it is replaced: just enough (logical
 * address and level) to locate and rewind the old root later.
 */
struct tree_mod_root {
	u64 logical;
	u8 level;
};

/*
 * One logged modification of a tree block. Which fields are meaningful
 * depends on @op, as noted per field below. Elements live in the
 * fs_info->tree_mod_log rb-tree, keyed by (logical, seq).
 */
struct tree_mod_elem {
	struct rb_node node;
	/* Logical address of the affected block (the *new* root for root replaces). */
	u64 logical;
	/* Sequence number assigned when the element is inserted into the log. */
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		/* First call for this blocker: register it and flag users. */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

/*
 * Drop the blocker @elem. If no blockers remain, clear the users flag.
 * Otherwise, if @elem was the oldest blocker, free every log element whose
 * sequence number is below the now-oldest blocker's, since nobody can rewind
 * past that point anymore.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	/* A zero seq means this blocker was never registered. */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Key order of the log:
 * node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 *
 * Assigns @tm a fresh sequence number and links it into the rb-tree. Must be
 * called with fs_info->tree_mod_log_lock held for writing.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			/* Should not happen: @tm's seq was freshly taken above. */
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	/* No users of the log, nothing to record. */
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	/* Level 0 (leaf) buffers are never logged. */
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		/* The last user went away between the test_bit and here. */
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	/* Returning with tree_mod_log_lock held for writing. */
	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}

/*
 * Allocate and fill a log element for a single key pointer operation @op on
 * @slot of node @eb. For anything other than a key addition, the current key
 * and block pointer are recorded so the operation can be undone on rewind.
 * Returns NULL on allocation failure.
 */
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

/*
 * Log a single key pointer operation @op on @slot of node @eb.
 * Returns 0 on success (including when no logging is needed) and a negative
 * errno on failure.
 */
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret;

	/* Lockless pre-check to avoid the allocation in the common case. */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		return -ENOMEM;

	/* Re-check under the lock; the last log user may be gone by now. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

/*
 * Log the move of @nr_items key pointers from @src_slot to @dst_slot in node
 * @eb. Slots that the move overwrites (only possible when moving towards the
 * beginning of the buffer) additionally get a removal logged so their content
 * can be restored on rewind.
 * Returns 0 on success and a negative errno on failure.
 */
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;

	/* Pre-allocate removal elements for the slots the move overwrites. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = true;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unlink any elements already inserted and free everything. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

/*
 * Insert the pre-allocated removal elements for a whole node into the log,
 * last slot first. On failure, elements already inserted are unlinked from
 * the rb-tree again (freeing them is left to the caller).
 * Must be called with tree_mod_log_lock held for writing.
 */
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

/*
 * Log the replacement of root node @old_root by @new_root. If @log_removal is
 * true and the old root is a node (level > 0), the removal of all of its key
 * pointers is logged as well, so a rewind can reconstruct the old root.
 * Returns 0 on success and a negative errno on failure.
 */
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* Keyed by the *new* root so lookups starting there find it. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

/*
 * Search the log for an element with logical address @start and a sequence
 * number of at least @min_seq. With @smallest true, the matching element with
 * the smallest sequence is returned; otherwise the one with the largest.
 * Returns NULL if there is no match.
 */
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact match on (start, min_seq). */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's
 * in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}

/*
 * Log the copy of @nr_items key pointers from node @src (starting at
 * @src_offset) to node @dst (starting at @dst_offset): one removal element per
 * source slot and one addition element per destination slot.
 * Returns 0 on success and a negative errno on failure.
 */
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf-to-leaf copies are not logged. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array, first half additions, second half removals. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = true;

	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unlink any elements already inserted and free everything. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

/*
 * Log the removal of all key pointers of node @eb, using
 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING elements.
 * Returns 0 on success and a negative errno on failure.
 */
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/* tree_mod_log_free_eb() already unlinked anything it inserted. */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operation for the oldest root, we simply
		 * return it. This should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		/* Follow the replacement chain one step further back. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			/* Undo a removal: put the saved key pointer back. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			/* Undo a replace: restore the previous key pointer. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			/* Undo a move by moving the items back. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. In the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. We simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Stop once we leave this block's run of log elements. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	/* Leaves are never logged, nothing to rewind. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* The whole node was freed; rebuild it on a dummy buffer. */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind starts from the old root. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* The old root block still exists on disk; read and clone it. */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, root->root_key.objectid,
				      0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So lookup for a more recent one to make sure the number
			 * of mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		/* Old root was fully freed; reconstruct it on a dummy buffer. */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

/*
 * Return the level @root's root node had at sequence number @time_seq,
 * accounting for logged root replacements.
 */
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there are
 * no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}