1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include "ctree.h" 21 #include "transaction.h" 22 #include "disk-io.h" 23 #include "locking.h" 24 #include "print-tree.h" 25 #include "compat.h" 26 #include "tree-log.h" 27 28 /* magic values for the inode_only field in btrfs_log_inode: 29 * 30 * LOG_INODE_ALL means to log everything 31 * LOG_INODE_EXISTS means to log just enough to recreate the inode 32 * during log replay 33 */ 34 #define LOG_INODE_ALL 0 35 #define LOG_INODE_EXISTS 1 36 37 /* 38 * stages for the tree walking. The first 39 * stage (0) is to only pin down the blocks we find 40 * the second stage (1) is to make sure that all the inodes 41 * we find in the log are created in the subvolume. 
42 * 43 * The last stage is to deal with directories and links and extents 44 * and all the other fun semantics 45 */ 46 #define LOG_WALK_PIN_ONLY 0 47 #define LOG_WALK_REPLAY_INODES 1 48 #define LOG_WALK_REPLAY_ALL 2 49 50 static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 51 struct btrfs_root *root, struct inode *inode, 52 int inode_only); 53 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 54 struct btrfs_root *root, 55 struct btrfs_path *path, u64 objectid); 56 57 /* 58 * tree logging is a special write ahead log used to make sure that 59 * fsyncs and O_SYNCs can happen without doing full tree commits. 60 * 61 * Full tree commits are expensive because they require commonly 62 * modified blocks to be recowed, creating many dirty pages in the 63 * extent tree and a 4x-6x higher write load than ext3. 64 * 65 * Instead of doing a tree commit on every fsync, we use the 66 * key ranges and transaction ids to find items for a given file or directory 67 * that have changed in this transaction. Those items are copied into 68 * a special tree (one per subvolume root), that tree is written to disk 69 * and then the fsync is considered complete. 70 * 71 * After a crash, items are copied out of the log-tree back into the 72 * subvolume tree. Any file data extents found are recorded in the extent 73 * allocation tree, and the log-tree freed. 74 * 75 * The log tree is read three times, once to pin down all the extents it is 76 * using in ram, once to create all the inodes logged in the tree 77 * and once to do all the other items. 78 */ 79 80 /* 81 * btrfs_add_log_tree adds a new per-subvolume log tree into the 82 * tree of log tree roots. This must be called with a tree log transaction 83 * running (see start_log_trans). 
84 */ 85 static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86 struct btrfs_root *root) 87 { 88 struct btrfs_key key; 89 struct btrfs_root_item root_item; 90 struct btrfs_inode_item *inode_item; 91 struct extent_buffer *leaf; 92 struct btrfs_root *new_root = root; 93 int ret; 94 u64 objectid = root->root_key.objectid; 95 96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 97 BTRFS_TREE_LOG_OBJECTID, 98 trans->transid, 0, 0, 0); 99 if (IS_ERR(leaf)) { 100 ret = PTR_ERR(leaf); 101 return ret; 102 } 103 104 btrfs_set_header_nritems(leaf, 0); 105 btrfs_set_header_level(leaf, 0); 106 btrfs_set_header_bytenr(leaf, leaf->start); 107 btrfs_set_header_generation(leaf, trans->transid); 108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); 109 110 write_extent_buffer(leaf, root->fs_info->fsid, 111 (unsigned long)btrfs_header_fsid(leaf), 112 BTRFS_FSID_SIZE); 113 btrfs_mark_buffer_dirty(leaf); 114 115 inode_item = &root_item.inode; 116 memset(inode_item, 0, sizeof(*inode_item)); 117 inode_item->generation = cpu_to_le64(1); 118 inode_item->size = cpu_to_le64(3); 119 inode_item->nlink = cpu_to_le32(1); 120 inode_item->nbytes = cpu_to_le64(root->leafsize); 121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 122 123 btrfs_set_root_bytenr(&root_item, leaf->start); 124 btrfs_set_root_generation(&root_item, trans->transid); 125 btrfs_set_root_level(&root_item, 0); 126 btrfs_set_root_refs(&root_item, 0); 127 btrfs_set_root_used(&root_item, 0); 128 129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); 130 root_item.drop_level = 0; 131 132 btrfs_tree_unlock(leaf); 133 free_extent_buffer(leaf); 134 leaf = NULL; 135 136 btrfs_set_root_dirid(&root_item, 0); 137 138 key.objectid = BTRFS_TREE_LOG_OBJECTID; 139 key.offset = objectid; 140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, 142 &root_item); 143 if (ret) 144 goto fail; 145 146 new_root = 
btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, 147 &key); 148 BUG_ON(!new_root); 149 150 WARN_ON(root->log_root); 151 root->log_root = new_root; 152 153 /* 154 * log trees do not get reference counted because they go away 155 * before a real commit is actually done. They do store pointers 156 * to file data extents, and those reference counts still get 157 * updated (along with back refs to the log tree). 158 */ 159 new_root->ref_cows = 0; 160 new_root->last_trans = trans->transid; 161 fail: 162 return ret; 163 } 164 165 /* 166 * start a sub transaction and setup the log tree 167 * this increments the log tree writer count to make the people 168 * syncing the tree wait for us to finish 169 */ 170 static int start_log_trans(struct btrfs_trans_handle *trans, 171 struct btrfs_root *root) 172 { 173 int ret; 174 mutex_lock(&root->fs_info->tree_log_mutex); 175 if (!root->fs_info->log_root_tree) { 176 ret = btrfs_init_log_root_tree(trans, root->fs_info); 177 BUG_ON(ret); 178 } 179 if (!root->log_root) { 180 ret = btrfs_add_log_tree(trans, root); 181 BUG_ON(ret); 182 } 183 atomic_inc(&root->fs_info->tree_log_writers); 184 root->fs_info->tree_log_batch++; 185 mutex_unlock(&root->fs_info->tree_log_mutex); 186 return 0; 187 } 188 189 /* 190 * returns 0 if there was a log transaction running and we were able 191 * to join, or returns -ENOENT if there were not transactions 192 * in progress 193 */ 194 static int join_running_log_trans(struct btrfs_root *root) 195 { 196 int ret = -ENOENT; 197 198 smp_mb(); 199 if (!root->log_root) 200 return -ENOENT; 201 202 mutex_lock(&root->fs_info->tree_log_mutex); 203 if (root->log_root) { 204 ret = 0; 205 atomic_inc(&root->fs_info->tree_log_writers); 206 root->fs_info->tree_log_batch++; 207 } 208 mutex_unlock(&root->fs_info->tree_log_mutex); 209 return ret; 210 } 211 212 /* 213 * indicate we're done making changes to the log tree 214 * and wake up anyone waiting to do a sync 215 */ 216 static int end_log_trans(struct btrfs_root 
*root) 217 { 218 atomic_dec(&root->fs_info->tree_log_writers); 219 smp_mb(); 220 if (waitqueue_active(&root->fs_info->tree_log_wait)) 221 wake_up(&root->fs_info->tree_log_wait); 222 return 0; 223 } 224 225 226 /* 227 * the walk control struct is used to pass state down the chain when 228 * processing the log tree. The stage field tells us which part 229 * of the log tree processing we are currently doing. The others 230 * are state fields used for that specific part 231 */ 232 struct walk_control { 233 /* should we free the extent on disk when done? This is used 234 * at transaction commit time while freeing a log tree 235 */ 236 int free; 237 238 /* should we write out the extent buffer? This is used 239 * while flushing the log tree to disk during a sync 240 */ 241 int write; 242 243 /* should we wait for the extent buffer io to finish? Also used 244 * while flushing the log tree to disk for a sync 245 */ 246 int wait; 247 248 /* pin only walk, we record which extents on disk belong to the 249 * log trees 250 */ 251 int pin; 252 253 /* what stage of the replay code we're currently in */ 254 int stage; 255 256 /* the root we are currently replaying */ 257 struct btrfs_root *replay_dest; 258 259 /* the trans handle for the current replay */ 260 struct btrfs_trans_handle *trans; 261 262 /* the function that gets used to process blocks we find in the 263 * tree. 
Note the extent_buffer might not be up to date when it is 264 * passed in, and it must be checked or read if you need the data 265 * inside it 266 */ 267 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 268 struct walk_control *wc, u64 gen); 269 }; 270 271 /* 272 * process_func used to pin down extents, write them or wait on them 273 */ 274 static int process_one_buffer(struct btrfs_root *log, 275 struct extent_buffer *eb, 276 struct walk_control *wc, u64 gen) 277 { 278 if (wc->pin) { 279 mutex_lock(&log->fs_info->pinned_mutex); 280 btrfs_update_pinned_extents(log->fs_info->extent_root, 281 eb->start, eb->len, 1); 282 mutex_unlock(&log->fs_info->pinned_mutex); 283 } 284 285 if (btrfs_buffer_uptodate(eb, gen)) { 286 if (wc->write) 287 btrfs_write_tree_block(eb); 288 if (wc->wait) 289 btrfs_wait_tree_block_writeback(eb); 290 } 291 return 0; 292 } 293 294 /* 295 * Item overwrite used by replay and tree logging. eb, slot and key all refer 296 * to the src data we are copying out. 297 * 298 * root is the tree we are copying into, and path is a scratch 299 * path for use in this function (it should be released on entry and 300 * will be released on exit). 301 * 302 * If the key is already in the destination tree the existing item is 303 * overwritten. If the existing item isn't big enough, it is extended. 304 * If it is too large, it is truncated. 305 * 306 * If the key isn't in the destination yet, a new item is inserted. 
307 */ 308 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 309 struct btrfs_root *root, 310 struct btrfs_path *path, 311 struct extent_buffer *eb, int slot, 312 struct btrfs_key *key) 313 { 314 int ret; 315 u32 item_size; 316 u64 saved_i_size = 0; 317 int save_old_i_size = 0; 318 unsigned long src_ptr; 319 unsigned long dst_ptr; 320 int overwrite_root = 0; 321 322 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 323 overwrite_root = 1; 324 325 item_size = btrfs_item_size_nr(eb, slot); 326 src_ptr = btrfs_item_ptr_offset(eb, slot); 327 328 /* look for the key in the destination tree */ 329 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 330 if (ret == 0) { 331 char *src_copy; 332 char *dst_copy; 333 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 334 path->slots[0]); 335 if (dst_size != item_size) 336 goto insert; 337 338 if (item_size == 0) { 339 btrfs_release_path(root, path); 340 return 0; 341 } 342 dst_copy = kmalloc(item_size, GFP_NOFS); 343 src_copy = kmalloc(item_size, GFP_NOFS); 344 345 read_extent_buffer(eb, src_copy, src_ptr, item_size); 346 347 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 348 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 349 item_size); 350 ret = memcmp(dst_copy, src_copy, item_size); 351 352 kfree(dst_copy); 353 kfree(src_copy); 354 /* 355 * they have the same contents, just return, this saves 356 * us from cowing blocks in the destination tree and doing 357 * extra writes that may not have been done by a previous 358 * sync 359 */ 360 if (ret == 0) { 361 btrfs_release_path(root, path); 362 return 0; 363 } 364 365 } 366 insert: 367 btrfs_release_path(root, path); 368 /* try to insert the key into the destination tree */ 369 ret = btrfs_insert_empty_item(trans, root, path, 370 key, item_size); 371 372 /* make sure any existing item is the correct size */ 373 if (ret == -EEXIST) { 374 u32 found_size; 375 found_size = btrfs_item_size_nr(path->nodes[0], 376 path->slots[0]); 377 
if (found_size > item_size) { 378 btrfs_truncate_item(trans, root, path, item_size, 1); 379 } else if (found_size < item_size) { 380 ret = btrfs_extend_item(trans, root, path, 381 item_size - found_size); 382 BUG_ON(ret); 383 } 384 } else if (ret) { 385 BUG(); 386 } 387 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 388 path->slots[0]); 389 390 /* don't overwrite an existing inode if the generation number 391 * was logged as zero. This is done when the tree logging code 392 * is just logging an inode to make sure it exists after recovery. 393 * 394 * Also, don't overwrite i_size on directories during replay. 395 * log replay inserts and removes directory items based on the 396 * state of the tree found in the subvolume, and i_size is modified 397 * as it goes 398 */ 399 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 400 struct btrfs_inode_item *src_item; 401 struct btrfs_inode_item *dst_item; 402 403 src_item = (struct btrfs_inode_item *)src_ptr; 404 dst_item = (struct btrfs_inode_item *)dst_ptr; 405 406 if (btrfs_inode_generation(eb, src_item) == 0) 407 goto no_copy; 408 409 if (overwrite_root && 410 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 411 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 412 save_old_i_size = 1; 413 saved_i_size = btrfs_inode_size(path->nodes[0], 414 dst_item); 415 } 416 } 417 418 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 419 src_ptr, item_size); 420 421 if (save_old_i_size) { 422 struct btrfs_inode_item *dst_item; 423 dst_item = (struct btrfs_inode_item *)dst_ptr; 424 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 425 } 426 427 /* make sure the generation is filled in */ 428 if (key->type == BTRFS_INODE_ITEM_KEY) { 429 struct btrfs_inode_item *dst_item; 430 dst_item = (struct btrfs_inode_item *)dst_ptr; 431 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 432 btrfs_set_inode_generation(path->nodes[0], dst_item, 433 trans->transid); 434 } 435 } 436 no_copy: 437 
btrfs_mark_buffer_dirty(path->nodes[0]); 438 btrfs_release_path(root, path); 439 return 0; 440 } 441 442 /* 443 * simple helper to read an inode off the disk from a given root 444 * This can only be called for subvolume roots and not for the log 445 */ 446 static noinline struct inode *read_one_inode(struct btrfs_root *root, 447 u64 objectid) 448 { 449 struct inode *inode; 450 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); 451 if (inode->i_state & I_NEW) { 452 BTRFS_I(inode)->root = root; 453 BTRFS_I(inode)->location.objectid = objectid; 454 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; 455 BTRFS_I(inode)->location.offset = 0; 456 btrfs_read_locked_inode(inode); 457 unlock_new_inode(inode); 458 459 } 460 if (is_bad_inode(inode)) { 461 iput(inode); 462 inode = NULL; 463 } 464 return inode; 465 } 466 467 /* replays a single extent in 'eb' at 'slot' with 'key' into the 468 * subvolume 'root'. path is released on entry and should be released 469 * on exit. 470 * 471 * extents in the log tree have not been allocated out of the extent 472 * tree yet. So, this completes the allocation, taking a reference 473 * as required if the extent already exists or creating a new extent 474 * if it isn't in the extent allocation tree yet. 475 * 476 * The extent is inserted into the file, dropping any existing extents 477 * from the file that overlap the new one. 
478 */ 479 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 480 struct btrfs_root *root, 481 struct btrfs_path *path, 482 struct extent_buffer *eb, int slot, 483 struct btrfs_key *key) 484 { 485 int found_type; 486 u64 mask = root->sectorsize - 1; 487 u64 extent_end; 488 u64 alloc_hint; 489 u64 start = key->offset; 490 u64 saved_nbytes; 491 struct btrfs_file_extent_item *item; 492 struct inode *inode = NULL; 493 unsigned long size; 494 int ret = 0; 495 496 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 497 found_type = btrfs_file_extent_type(eb, item); 498 499 if (found_type == BTRFS_FILE_EXTENT_REG || 500 found_type == BTRFS_FILE_EXTENT_PREALLOC) 501 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 502 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 503 size = btrfs_file_extent_inline_len(eb, item); 504 extent_end = (start + size + mask) & ~mask; 505 } else { 506 ret = 0; 507 goto out; 508 } 509 510 inode = read_one_inode(root, key->objectid); 511 if (!inode) { 512 ret = -EIO; 513 goto out; 514 } 515 516 /* 517 * first check to see if we already have this extent in the 518 * file. This must be done before the btrfs_drop_extents run 519 * so we don't try to drop this extent. 
520 */ 521 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 522 start, 0); 523 524 if (ret == 0 && 525 (found_type == BTRFS_FILE_EXTENT_REG || 526 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 527 struct btrfs_file_extent_item cmp1; 528 struct btrfs_file_extent_item cmp2; 529 struct btrfs_file_extent_item *existing; 530 struct extent_buffer *leaf; 531 532 leaf = path->nodes[0]; 533 existing = btrfs_item_ptr(leaf, path->slots[0], 534 struct btrfs_file_extent_item); 535 536 read_extent_buffer(eb, &cmp1, (unsigned long)item, 537 sizeof(cmp1)); 538 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 539 sizeof(cmp2)); 540 541 /* 542 * we already have a pointer to this exact extent, 543 * we don't have to do anything 544 */ 545 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 546 btrfs_release_path(root, path); 547 goto out; 548 } 549 } 550 btrfs_release_path(root, path); 551 552 saved_nbytes = inode_get_bytes(inode); 553 /* drop any overlapping extents */ 554 ret = btrfs_drop_extents(trans, root, inode, 555 start, extent_end, start, &alloc_hint); 556 BUG_ON(ret); 557 558 if (found_type == BTRFS_FILE_EXTENT_REG || 559 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 560 unsigned long dest_offset; 561 struct btrfs_key ins; 562 563 ret = btrfs_insert_empty_item(trans, root, path, key, 564 sizeof(*item)); 565 BUG_ON(ret); 566 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 567 path->slots[0]); 568 copy_extent_buffer(path->nodes[0], eb, dest_offset, 569 (unsigned long)item, sizeof(*item)); 570 571 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 572 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 573 ins.type = BTRFS_EXTENT_ITEM_KEY; 574 575 if (ins.objectid > 0) { 576 u64 csum_start; 577 u64 csum_end; 578 LIST_HEAD(ordered_sums); 579 /* 580 * is this extent already allocated in the extent 581 * allocation tree? 
If so, just add a reference 582 */ 583 ret = btrfs_lookup_extent(root, ins.objectid, 584 ins.offset); 585 if (ret == 0) { 586 ret = btrfs_inc_extent_ref(trans, root, 587 ins.objectid, ins.offset, 588 path->nodes[0]->start, 589 root->root_key.objectid, 590 trans->transid, key->objectid); 591 } else { 592 /* 593 * insert the extent pointer in the extent 594 * allocation tree 595 */ 596 ret = btrfs_alloc_logged_extent(trans, root, 597 path->nodes[0]->start, 598 root->root_key.objectid, 599 trans->transid, key->objectid, 600 &ins); 601 BUG_ON(ret); 602 } 603 btrfs_release_path(root, path); 604 605 if (btrfs_file_extent_compression(eb, item)) { 606 csum_start = ins.objectid; 607 csum_end = csum_start + ins.offset; 608 } else { 609 csum_start = ins.objectid + 610 btrfs_file_extent_offset(eb, item); 611 csum_end = csum_start + 612 btrfs_file_extent_num_bytes(eb, item); 613 } 614 615 ret = btrfs_lookup_csums_range(root->log_root, 616 csum_start, csum_end - 1, 617 &ordered_sums); 618 BUG_ON(ret); 619 while (!list_empty(&ordered_sums)) { 620 struct btrfs_ordered_sum *sums; 621 sums = list_entry(ordered_sums.next, 622 struct btrfs_ordered_sum, 623 list); 624 ret = btrfs_csum_file_blocks(trans, 625 root->fs_info->csum_root, 626 sums); 627 BUG_ON(ret); 628 list_del(&sums->list); 629 kfree(sums); 630 } 631 } else { 632 btrfs_release_path(root, path); 633 } 634 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 635 /* inline extents are easy, we just overwrite them */ 636 ret = overwrite_item(trans, root, path, eb, slot, key); 637 BUG_ON(ret); 638 } 639 640 inode_set_bytes(inode, saved_nbytes); 641 btrfs_update_inode(trans, root, inode); 642 out: 643 if (inode) 644 iput(inode); 645 return ret; 646 } 647 648 /* 649 * when cleaning up conflicts between the directory names in the 650 * subvolume, directory names in the log and directory names in the 651 * inode back references, we may have to unlink inodes from directories. 
652 * 653 * This is a helper function to do the unlink of a specific directory 654 * item 655 */ 656 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 657 struct btrfs_root *root, 658 struct btrfs_path *path, 659 struct inode *dir, 660 struct btrfs_dir_item *di) 661 { 662 struct inode *inode; 663 char *name; 664 int name_len; 665 struct extent_buffer *leaf; 666 struct btrfs_key location; 667 int ret; 668 669 leaf = path->nodes[0]; 670 671 btrfs_dir_item_key_to_cpu(leaf, di, &location); 672 name_len = btrfs_dir_name_len(leaf, di); 673 name = kmalloc(name_len, GFP_NOFS); 674 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 675 btrfs_release_path(root, path); 676 677 inode = read_one_inode(root, location.objectid); 678 BUG_ON(!inode); 679 680 ret = link_to_fixup_dir(trans, root, path, location.objectid); 681 BUG_ON(ret); 682 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 683 BUG_ON(ret); 684 kfree(name); 685 686 iput(inode); 687 return ret; 688 } 689 690 /* 691 * helper function to see if a given name and sequence number found 692 * in an inode back reference are already in a directory and correctly 693 * point to this inode 694 */ 695 static noinline int inode_in_dir(struct btrfs_root *root, 696 struct btrfs_path *path, 697 u64 dirid, u64 objectid, u64 index, 698 const char *name, int name_len) 699 { 700 struct btrfs_dir_item *di; 701 struct btrfs_key location; 702 int match = 0; 703 704 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 705 index, name, name_len, 0); 706 if (di && !IS_ERR(di)) { 707 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 708 if (location.objectid != objectid) 709 goto out; 710 } else 711 goto out; 712 btrfs_release_path(root, path); 713 714 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 715 if (di && !IS_ERR(di)) { 716 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 717 if (location.objectid != objectid) 718 goto out; 719 } else 
720 goto out; 721 match = 1; 722 out: 723 btrfs_release_path(root, path); 724 return match; 725 } 726 727 /* 728 * helper function to check a log tree for a named back reference in 729 * an inode. This is used to decide if a back reference that is 730 * found in the subvolume conflicts with what we find in the log. 731 * 732 * inode backreferences may have multiple refs in a single item, 733 * during replay we process one reference at a time, and we don't 734 * want to delete valid links to a file from the subvolume if that 735 * link is also in the log. 736 */ 737 static noinline int backref_in_log(struct btrfs_root *log, 738 struct btrfs_key *key, 739 char *name, int namelen) 740 { 741 struct btrfs_path *path; 742 struct btrfs_inode_ref *ref; 743 unsigned long ptr; 744 unsigned long ptr_end; 745 unsigned long name_ptr; 746 int found_name_len; 747 int item_size; 748 int ret; 749 int match = 0; 750 751 path = btrfs_alloc_path(); 752 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 753 if (ret != 0) 754 goto out; 755 756 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 757 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 758 ptr_end = ptr + item_size; 759 while (ptr < ptr_end) { 760 ref = (struct btrfs_inode_ref *)ptr; 761 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 762 if (found_name_len == namelen) { 763 name_ptr = (unsigned long)(ref + 1); 764 ret = memcmp_extent_buffer(path->nodes[0], name, 765 name_ptr, namelen); 766 if (ret == 0) { 767 match = 1; 768 goto out; 769 } 770 } 771 ptr = (unsigned long)(ref + 1) + found_name_len; 772 } 773 out: 774 btrfs_free_path(path); 775 return match; 776 } 777 778 779 /* 780 * replay one inode back reference item found in the log tree. 781 * eb, slot and key refer to the buffer and key found in the log tree. 782 * root is the destination we are replaying into, and path is for temp 783 * use by this function. (it should be released on return). 
784 */ 785 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 786 struct btrfs_root *root, 787 struct btrfs_root *log, 788 struct btrfs_path *path, 789 struct extent_buffer *eb, int slot, 790 struct btrfs_key *key) 791 { 792 struct inode *dir; 793 int ret; 794 struct btrfs_key location; 795 struct btrfs_inode_ref *ref; 796 struct btrfs_dir_item *di; 797 struct inode *inode; 798 char *name; 799 int namelen; 800 unsigned long ref_ptr; 801 unsigned long ref_end; 802 803 location.objectid = key->objectid; 804 location.type = BTRFS_INODE_ITEM_KEY; 805 location.offset = 0; 806 807 /* 808 * it is possible that we didn't log all the parent directories 809 * for a given inode. If we don't find the dir, just don't 810 * copy the back ref in. The link count fixup code will take 811 * care of the rest 812 */ 813 dir = read_one_inode(root, key->offset); 814 if (!dir) 815 return -ENOENT; 816 817 inode = read_one_inode(root, key->objectid); 818 BUG_ON(!dir); 819 820 ref_ptr = btrfs_item_ptr_offset(eb, slot); 821 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 822 823 again: 824 ref = (struct btrfs_inode_ref *)ref_ptr; 825 826 namelen = btrfs_inode_ref_name_len(eb, ref); 827 name = kmalloc(namelen, GFP_NOFS); 828 BUG_ON(!name); 829 830 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 831 832 /* if we already have a perfect match, we're done */ 833 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 834 btrfs_inode_ref_index(eb, ref), 835 name, namelen)) { 836 goto out; 837 } 838 839 /* 840 * look for a conflicting back reference in the metadata. 841 * if we find one we have to unlink that name of the file 842 * before we add our new link. Later on, we overwrite any 843 * existing back reference, and we don't want to create 844 * dangling pointers in the directory. 
845 */ 846 conflict_again: 847 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 848 if (ret == 0) { 849 char *victim_name; 850 int victim_name_len; 851 struct btrfs_inode_ref *victim_ref; 852 unsigned long ptr; 853 unsigned long ptr_end; 854 struct extent_buffer *leaf = path->nodes[0]; 855 856 /* are we trying to overwrite a back ref for the root directory 857 * if so, just jump out, we're done 858 */ 859 if (key->objectid == key->offset) 860 goto out_nowrite; 861 862 /* check all the names in this back reference to see 863 * if they are in the log. if so, we allow them to stay 864 * otherwise they must be unlinked as a conflict 865 */ 866 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 867 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 868 while (ptr < ptr_end) { 869 victim_ref = (struct btrfs_inode_ref *)ptr; 870 victim_name_len = btrfs_inode_ref_name_len(leaf, 871 victim_ref); 872 victim_name = kmalloc(victim_name_len, GFP_NOFS); 873 BUG_ON(!victim_name); 874 875 read_extent_buffer(leaf, victim_name, 876 (unsigned long)(victim_ref + 1), 877 victim_name_len); 878 879 if (!backref_in_log(log, key, victim_name, 880 victim_name_len)) { 881 btrfs_inc_nlink(inode); 882 btrfs_release_path(root, path); 883 ret = btrfs_unlink_inode(trans, root, dir, 884 inode, victim_name, 885 victim_name_len); 886 kfree(victim_name); 887 btrfs_release_path(root, path); 888 goto conflict_again; 889 } 890 kfree(victim_name); 891 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 892 } 893 BUG_ON(ret); 894 } 895 btrfs_release_path(root, path); 896 897 /* look for a conflicting sequence number */ 898 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 899 btrfs_inode_ref_index(eb, ref), 900 name, namelen, 0); 901 if (di && !IS_ERR(di)) { 902 ret = drop_one_dir_item(trans, root, path, dir, di); 903 BUG_ON(ret); 904 } 905 btrfs_release_path(root, path); 906 907 908 /* look for a conflicting name */ 909 di = btrfs_lookup_dir_item(trans, root, path, 
dir->i_ino, 910 name, namelen, 0); 911 if (di && !IS_ERR(di)) { 912 ret = drop_one_dir_item(trans, root, path, dir, di); 913 BUG_ON(ret); 914 } 915 btrfs_release_path(root, path); 916 917 /* insert our name */ 918 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 919 btrfs_inode_ref_index(eb, ref)); 920 BUG_ON(ret); 921 922 btrfs_update_inode(trans, root, inode); 923 924 out: 925 ref_ptr = (unsigned long)(ref + 1) + namelen; 926 kfree(name); 927 if (ref_ptr < ref_end) 928 goto again; 929 930 /* finally write the back reference in the inode */ 931 ret = overwrite_item(trans, root, path, eb, slot, key); 932 BUG_ON(ret); 933 934 out_nowrite: 935 btrfs_release_path(root, path); 936 iput(dir); 937 iput(inode); 938 return 0; 939 } 940 941 /* 942 * There are a few corners where the link count of the file can't 943 * be properly maintained during replay. So, instead of adding 944 * lots of complexity to the log code, we just scan the backrefs 945 * for any file that has been through replay. 946 * 947 * The scan will update the link count on the inode to reflect the 948 * number of back refs found. If it goes down to zero, the iput 949 * will free the inode. 
950 */ 951 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 952 struct btrfs_root *root, 953 struct inode *inode) 954 { 955 struct btrfs_path *path; 956 int ret; 957 struct btrfs_key key; 958 u64 nlink = 0; 959 unsigned long ptr; 960 unsigned long ptr_end; 961 int name_len; 962 963 key.objectid = inode->i_ino; 964 key.type = BTRFS_INODE_REF_KEY; 965 key.offset = (u64)-1; 966 967 path = btrfs_alloc_path(); 968 969 while (1) { 970 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 971 if (ret < 0) 972 break; 973 if (ret > 0) { 974 if (path->slots[0] == 0) 975 break; 976 path->slots[0]--; 977 } 978 btrfs_item_key_to_cpu(path->nodes[0], &key, 979 path->slots[0]); 980 if (key.objectid != inode->i_ino || 981 key.type != BTRFS_INODE_REF_KEY) 982 break; 983 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 984 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 985 path->slots[0]); 986 while (ptr < ptr_end) { 987 struct btrfs_inode_ref *ref; 988 989 ref = (struct btrfs_inode_ref *)ptr; 990 name_len = btrfs_inode_ref_name_len(path->nodes[0], 991 ref); 992 ptr = (unsigned long)(ref + 1) + name_len; 993 nlink++; 994 } 995 996 if (key.offset == 0) 997 break; 998 key.offset--; 999 btrfs_release_path(root, path); 1000 } 1001 btrfs_free_path(path); 1002 if (nlink != inode->i_nlink) { 1003 inode->i_nlink = nlink; 1004 btrfs_update_inode(trans, root, inode); 1005 } 1006 BTRFS_I(inode)->index_cnt = (u64)-1; 1007 1008 return 0; 1009 } 1010 1011 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1012 struct btrfs_root *root, 1013 struct btrfs_path *path) 1014 { 1015 int ret; 1016 struct btrfs_key key; 1017 struct inode *inode; 1018 1019 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1020 key.type = BTRFS_ORPHAN_ITEM_KEY; 1021 key.offset = (u64)-1; 1022 while (1) { 1023 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1024 if (ret < 0) 1025 break; 1026 1027 if (ret == 1) { 1028 if (path->slots[0] == 0) 1029 
break; 1030 path->slots[0]--; 1031 } 1032 1033 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1034 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1035 key.type != BTRFS_ORPHAN_ITEM_KEY) 1036 break; 1037 1038 ret = btrfs_del_item(trans, root, path); 1039 BUG_ON(ret); 1040 1041 btrfs_release_path(root, path); 1042 inode = read_one_inode(root, key.offset); 1043 BUG_ON(!inode); 1044 1045 ret = fixup_inode_link_count(trans, root, inode); 1046 BUG_ON(ret); 1047 1048 iput(inode); 1049 1050 if (key.offset == 0) 1051 break; 1052 key.offset--; 1053 } 1054 btrfs_release_path(root, path); 1055 return 0; 1056 } 1057 1058 1059 /* 1060 * record a given inode in the fixup dir so we can check its link 1061 * count when replay is done. The link count is incremented here 1062 * so the inode won't go away until we check it 1063 */ 1064 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1065 struct btrfs_root *root, 1066 struct btrfs_path *path, 1067 u64 objectid) 1068 { 1069 struct btrfs_key key; 1070 int ret = 0; 1071 struct inode *inode; 1072 1073 inode = read_one_inode(root, objectid); 1074 BUG_ON(!inode); 1075 1076 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1077 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1078 key.offset = objectid; 1079 1080 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1081 1082 btrfs_release_path(root, path); 1083 if (ret == 0) { 1084 btrfs_inc_nlink(inode); 1085 btrfs_update_inode(trans, root, inode); 1086 } else if (ret == -EEXIST) { 1087 ret = 0; 1088 } else { 1089 BUG(); 1090 } 1091 iput(inode); 1092 1093 return ret; 1094 } 1095 1096 /* 1097 * when replaying the log for a directory, we only insert names 1098 * for inodes that actually exist. 
This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *target;
	struct inode *parent;
	int ret = -EIO;

	/* the name is only added when the inode it points to exists */
	target = read_one_inode(root, location->objectid);
	if (!target)
		return -ENOENT;

	/* a missing parent directory is reported as -EIO */
	parent = read_one_inode(root, dirid);
	if (parent)
		ret = btrfs_add_link(trans, parent, target, name, name_len,
				     1, index);

	/* FIXME, put inode into FIXUP list */

	iput(target);
	if (parent)
		iput(parent);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
1142 */ 1143 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1144 struct btrfs_root *root, 1145 struct btrfs_path *path, 1146 struct extent_buffer *eb, 1147 struct btrfs_dir_item *di, 1148 struct btrfs_key *key) 1149 { 1150 char *name; 1151 int name_len; 1152 struct btrfs_dir_item *dst_di; 1153 struct btrfs_key found_key; 1154 struct btrfs_key log_key; 1155 struct inode *dir; 1156 u8 log_type; 1157 int exists; 1158 int ret; 1159 1160 dir = read_one_inode(root, key->objectid); 1161 BUG_ON(!dir); 1162 1163 name_len = btrfs_dir_name_len(eb, di); 1164 name = kmalloc(name_len, GFP_NOFS); 1165 log_type = btrfs_dir_type(eb, di); 1166 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1167 name_len); 1168 1169 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1170 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1171 if (exists == 0) 1172 exists = 1; 1173 else 1174 exists = 0; 1175 btrfs_release_path(root, path); 1176 1177 if (key->type == BTRFS_DIR_ITEM_KEY) { 1178 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1179 name, name_len, 1); 1180 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1181 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1182 key->objectid, 1183 key->offset, name, 1184 name_len, 1); 1185 } else { 1186 BUG(); 1187 } 1188 if (!dst_di || IS_ERR(dst_di)) { 1189 /* we need a sequence number to insert, so we only 1190 * do inserts for the BTRFS_DIR_INDEX_KEY types 1191 */ 1192 if (key->type != BTRFS_DIR_INDEX_KEY) 1193 goto out; 1194 goto insert; 1195 } 1196 1197 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1198 /* the existing item matches the logged item */ 1199 if (found_key.objectid == log_key.objectid && 1200 found_key.type == log_key.type && 1201 found_key.offset == log_key.offset && 1202 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1203 goto out; 1204 } 1205 1206 /* 1207 * don't drop the conflicting directory entry if the inode 1208 * for the new entry doesn't exist 
1209 */ 1210 if (!exists) 1211 goto out; 1212 1213 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1214 BUG_ON(ret); 1215 1216 if (key->type == BTRFS_DIR_INDEX_KEY) 1217 goto insert; 1218 out: 1219 btrfs_release_path(root, path); 1220 kfree(name); 1221 iput(dir); 1222 return 0; 1223 1224 insert: 1225 btrfs_release_path(root, path); 1226 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1227 name, name_len, log_type, &log_key); 1228 1229 if (ret && ret != -ENOENT) 1230 BUG(); 1231 goto out; 1232 } 1233 1234 /* 1235 * find all the names in a directory item and reconcile them into 1236 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1237 * one name in a directory item, but the same code gets used for 1238 * both directory index types 1239 */ 1240 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1241 struct btrfs_root *root, 1242 struct btrfs_path *path, 1243 struct extent_buffer *eb, int slot, 1244 struct btrfs_key *key) 1245 { 1246 int ret; 1247 u32 item_size = btrfs_item_size_nr(eb, slot); 1248 struct btrfs_dir_item *di; 1249 int name_len; 1250 unsigned long ptr; 1251 unsigned long ptr_end; 1252 1253 ptr = btrfs_item_ptr_offset(eb, slot); 1254 ptr_end = ptr + item_size; 1255 while (ptr < ptr_end) { 1256 di = (struct btrfs_dir_item *)ptr; 1257 name_len = btrfs_dir_name_len(eb, di); 1258 ret = replay_one_name(trans, root, path, eb, di, key); 1259 BUG_ON(ret); 1260 ptr = (unsigned long)(di + 1); 1261 ptr += name_len; 1262 } 1263 return 0; 1264 } 1265 1266 /* 1267 * directory replay has two parts. There are the standard directory 1268 * items in the log copied from the subvolume, and range items 1269 * created in the log while the subvolume was logged. 1270 * 1271 * The range items tell us which parts of the key space the log 1272 * is authoritative for. 
During replay, if a key in the subvolume 1273 * directory is in a logged range item, but not actually in the log 1274 * that means it was deleted from the directory before the fsync 1275 * and should be removed. 1276 */ 1277 static noinline int find_dir_range(struct btrfs_root *root, 1278 struct btrfs_path *path, 1279 u64 dirid, int key_type, 1280 u64 *start_ret, u64 *end_ret) 1281 { 1282 struct btrfs_key key; 1283 u64 found_end; 1284 struct btrfs_dir_log_item *item; 1285 int ret; 1286 int nritems; 1287 1288 if (*start_ret == (u64)-1) 1289 return 1; 1290 1291 key.objectid = dirid; 1292 key.type = key_type; 1293 key.offset = *start_ret; 1294 1295 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1296 if (ret < 0) 1297 goto out; 1298 if (ret > 0) { 1299 if (path->slots[0] == 0) 1300 goto out; 1301 path->slots[0]--; 1302 } 1303 if (ret != 0) 1304 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1305 1306 if (key.type != key_type || key.objectid != dirid) { 1307 ret = 1; 1308 goto next; 1309 } 1310 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1311 struct btrfs_dir_log_item); 1312 found_end = btrfs_dir_log_end(path->nodes[0], item); 1313 1314 if (*start_ret >= key.offset && *start_ret <= found_end) { 1315 ret = 0; 1316 *start_ret = key.offset; 1317 *end_ret = found_end; 1318 goto out; 1319 } 1320 ret = 1; 1321 next: 1322 /* check the next slot in the tree to see if it is a valid item */ 1323 nritems = btrfs_header_nritems(path->nodes[0]); 1324 if (path->slots[0] >= nritems) { 1325 ret = btrfs_next_leaf(root, path); 1326 if (ret) 1327 goto out; 1328 } else { 1329 path->slots[0]++; 1330 } 1331 1332 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1333 1334 if (key.type != key_type || key.objectid != dirid) { 1335 ret = 1; 1336 goto out; 1337 } 1338 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1339 struct btrfs_dir_log_item); 1340 found_end = btrfs_dir_log_end(path->nodes[0], item); 1341 *start_ret = key.offset; 1342 *end_ret 
= found_end; 1343 ret = 0; 1344 out: 1345 btrfs_release_path(root, path); 1346 return ret; 1347 } 1348 1349 /* 1350 * this looks for a given directory item in the log. If the directory 1351 * item is not in the log, the item is removed and the inode it points 1352 * to is unlinked 1353 */ 1354 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1355 struct btrfs_root *root, 1356 struct btrfs_root *log, 1357 struct btrfs_path *path, 1358 struct btrfs_path *log_path, 1359 struct inode *dir, 1360 struct btrfs_key *dir_key) 1361 { 1362 int ret; 1363 struct extent_buffer *eb; 1364 int slot; 1365 u32 item_size; 1366 struct btrfs_dir_item *di; 1367 struct btrfs_dir_item *log_di; 1368 int name_len; 1369 unsigned long ptr; 1370 unsigned long ptr_end; 1371 char *name; 1372 struct inode *inode; 1373 struct btrfs_key location; 1374 1375 again: 1376 eb = path->nodes[0]; 1377 slot = path->slots[0]; 1378 item_size = btrfs_item_size_nr(eb, slot); 1379 ptr = btrfs_item_ptr_offset(eb, slot); 1380 ptr_end = ptr + item_size; 1381 while (ptr < ptr_end) { 1382 di = (struct btrfs_dir_item *)ptr; 1383 name_len = btrfs_dir_name_len(eb, di); 1384 name = kmalloc(name_len, GFP_NOFS); 1385 if (!name) { 1386 ret = -ENOMEM; 1387 goto out; 1388 } 1389 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1390 name_len); 1391 log_di = NULL; 1392 if (dir_key->type == BTRFS_DIR_ITEM_KEY) { 1393 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1394 dir_key->objectid, 1395 name, name_len, 0); 1396 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { 1397 log_di = btrfs_lookup_dir_index_item(trans, log, 1398 log_path, 1399 dir_key->objectid, 1400 dir_key->offset, 1401 name, name_len, 0); 1402 } 1403 if (!log_di || IS_ERR(log_di)) { 1404 btrfs_dir_item_key_to_cpu(eb, di, &location); 1405 btrfs_release_path(root, path); 1406 btrfs_release_path(log, log_path); 1407 inode = read_one_inode(root, location.objectid); 1408 BUG_ON(!inode); 1409 1410 ret = link_to_fixup_dir(trans, root, 
1411 path, location.objectid); 1412 BUG_ON(ret); 1413 btrfs_inc_nlink(inode); 1414 ret = btrfs_unlink_inode(trans, root, dir, inode, 1415 name, name_len); 1416 BUG_ON(ret); 1417 kfree(name); 1418 iput(inode); 1419 1420 /* there might still be more names under this key 1421 * check and repeat if required 1422 */ 1423 ret = btrfs_search_slot(NULL, root, dir_key, path, 1424 0, 0); 1425 if (ret == 0) 1426 goto again; 1427 ret = 0; 1428 goto out; 1429 } 1430 btrfs_release_path(log, log_path); 1431 kfree(name); 1432 1433 ptr = (unsigned long)(di + 1); 1434 ptr += name_len; 1435 } 1436 ret = 0; 1437 out: 1438 btrfs_release_path(root, path); 1439 btrfs_release_path(log, log_path); 1440 return ret; 1441 } 1442 1443 /* 1444 * deletion replay happens before we copy any new directory items 1445 * out of the log or out of backreferences from inodes. It 1446 * scans the log to find ranges of keys that log is authoritative for, 1447 * and then scans the directory to find items in those ranges that are 1448 * not present in the log. 1449 * 1450 * Anything we don't find in the log is unlinked and removed from the 1451 * directory. 
1452 */ 1453 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1454 struct btrfs_root *root, 1455 struct btrfs_root *log, 1456 struct btrfs_path *path, 1457 u64 dirid) 1458 { 1459 u64 range_start; 1460 u64 range_end; 1461 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1462 int ret = 0; 1463 struct btrfs_key dir_key; 1464 struct btrfs_key found_key; 1465 struct btrfs_path *log_path; 1466 struct inode *dir; 1467 1468 dir_key.objectid = dirid; 1469 dir_key.type = BTRFS_DIR_ITEM_KEY; 1470 log_path = btrfs_alloc_path(); 1471 if (!log_path) 1472 return -ENOMEM; 1473 1474 dir = read_one_inode(root, dirid); 1475 /* it isn't an error if the inode isn't there, that can happen 1476 * because we replay the deletes before we copy in the inode item 1477 * from the log 1478 */ 1479 if (!dir) { 1480 btrfs_free_path(log_path); 1481 return 0; 1482 } 1483 again: 1484 range_start = 0; 1485 range_end = 0; 1486 while (1) { 1487 ret = find_dir_range(log, path, dirid, key_type, 1488 &range_start, &range_end); 1489 if (ret != 0) 1490 break; 1491 1492 dir_key.offset = range_start; 1493 while (1) { 1494 int nritems; 1495 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1496 0, 0); 1497 if (ret < 0) 1498 goto out; 1499 1500 nritems = btrfs_header_nritems(path->nodes[0]); 1501 if (path->slots[0] >= nritems) { 1502 ret = btrfs_next_leaf(root, path); 1503 if (ret) 1504 break; 1505 } 1506 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1507 path->slots[0]); 1508 if (found_key.objectid != dirid || 1509 found_key.type != dir_key.type) 1510 goto next_type; 1511 1512 if (found_key.offset > range_end) 1513 break; 1514 1515 ret = check_item_in_log(trans, root, log, path, 1516 log_path, dir, &found_key); 1517 BUG_ON(ret); 1518 if (found_key.offset == (u64)-1) 1519 break; 1520 dir_key.offset = found_key.offset + 1; 1521 } 1522 btrfs_release_path(root, path); 1523 if (range_end == (u64)-1) 1524 break; 1525 range_start = range_end + 1; 1526 } 1527 1528 next_type: 1529 ret = 0; 1530 if 
(key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1531 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1532 dir_key.type = BTRFS_DIR_INDEX_KEY; 1533 btrfs_release_path(root, path); 1534 goto again; 1535 } 1536 out: 1537 btrfs_release_path(root, path); 1538 btrfs_free_path(log_path); 1539 iput(dir); 1540 return ret; 1541 } 1542 1543 /* 1544 * the process_func used to replay items from the log tree. This 1545 * gets called in two different stages. The first stage just looks 1546 * for inodes and makes sure they are all copied into the subvolume. 1547 * 1548 * The second stage copies all the other item types from the log into 1549 * the subvolume. The two stage approach is slower, but gets rid of 1550 * lots of complexity around inodes referencing other inodes that exist 1551 * only in the log (references come from either directory items or inode 1552 * back refs). 1553 */ 1554 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1555 struct walk_control *wc, u64 gen) 1556 { 1557 int nritems; 1558 struct btrfs_path *path; 1559 struct btrfs_root *root = wc->replay_dest; 1560 struct btrfs_key key; 1561 u32 item_size; 1562 int level; 1563 int i; 1564 int ret; 1565 1566 btrfs_read_buffer(eb, gen); 1567 1568 level = btrfs_header_level(eb); 1569 1570 if (level != 0) 1571 return 0; 1572 1573 path = btrfs_alloc_path(); 1574 BUG_ON(!path); 1575 1576 nritems = btrfs_header_nritems(eb); 1577 for (i = 0; i < nritems; i++) { 1578 btrfs_item_key_to_cpu(eb, &key, i); 1579 item_size = btrfs_item_size_nr(eb, i); 1580 1581 /* inode keys are done during the first stage */ 1582 if (key.type == BTRFS_INODE_ITEM_KEY && 1583 wc->stage == LOG_WALK_REPLAY_INODES) { 1584 struct inode *inode; 1585 struct btrfs_inode_item *inode_item; 1586 u32 mode; 1587 1588 inode_item = btrfs_item_ptr(eb, i, 1589 struct btrfs_inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item); 1591 if (S_ISDIR(mode)) { 1592 ret = replay_dir_deletes(wc->trans, 1593 root, log, path, key.objectid); 1594 BUG_ON(ret); 
1595 } 1596 ret = overwrite_item(wc->trans, root, path, 1597 eb, i, &key); 1598 BUG_ON(ret); 1599 1600 /* for regular files, truncate away 1601 * extents past the new EOF 1602 */ 1603 if (S_ISREG(mode)) { 1604 inode = read_one_inode(root, 1605 key.objectid); 1606 BUG_ON(!inode); 1607 1608 ret = btrfs_truncate_inode_items(wc->trans, 1609 root, inode, inode->i_size, 1610 BTRFS_EXTENT_DATA_KEY); 1611 BUG_ON(ret); 1612 iput(inode); 1613 } 1614 ret = link_to_fixup_dir(wc->trans, root, 1615 path, key.objectid); 1616 BUG_ON(ret); 1617 } 1618 if (wc->stage < LOG_WALK_REPLAY_ALL) 1619 continue; 1620 1621 /* these keys are simply copied */ 1622 if (key.type == BTRFS_XATTR_ITEM_KEY) { 1623 ret = overwrite_item(wc->trans, root, path, 1624 eb, i, &key); 1625 BUG_ON(ret); 1626 } else if (key.type == BTRFS_INODE_REF_KEY) { 1627 ret = add_inode_ref(wc->trans, root, log, path, 1628 eb, i, &key); 1629 BUG_ON(ret && ret != -ENOENT); 1630 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1631 ret = replay_one_extent(wc->trans, root, path, 1632 eb, i, &key); 1633 BUG_ON(ret); 1634 } else if (key.type == BTRFS_DIR_ITEM_KEY || 1635 key.type == BTRFS_DIR_INDEX_KEY) { 1636 ret = replay_one_dir_item(wc->trans, root, path, 1637 eb, i, &key); 1638 BUG_ON(ret); 1639 } 1640 } 1641 btrfs_free_path(path); 1642 return 0; 1643 } 1644 1645 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1646 struct btrfs_root *root, 1647 struct btrfs_path *path, int *level, 1648 struct walk_control *wc) 1649 { 1650 u64 root_owner; 1651 u64 root_gen; 1652 u64 bytenr; 1653 u64 ptr_gen; 1654 struct extent_buffer *next; 1655 struct extent_buffer *cur; 1656 struct extent_buffer *parent; 1657 u32 blocksize; 1658 int ret = 0; 1659 1660 WARN_ON(*level < 0); 1661 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1662 1663 while (*level > 0) { 1664 WARN_ON(*level < 0); 1665 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1666 cur = path->nodes[*level]; 1667 1668 if (btrfs_header_level(cur) != *level) 1669 WARN_ON(1); 1670 1671 
if (path->slots[*level] >= 1672 btrfs_header_nritems(cur)) 1673 break; 1674 1675 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1676 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 1677 blocksize = btrfs_level_size(root, *level - 1); 1678 1679 parent = path->nodes[*level]; 1680 root_owner = btrfs_header_owner(parent); 1681 root_gen = btrfs_header_generation(parent); 1682 1683 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1684 1685 wc->process_func(root, next, wc, ptr_gen); 1686 1687 if (*level == 1) { 1688 path->slots[*level]++; 1689 if (wc->free) { 1690 btrfs_read_buffer(next, ptr_gen); 1691 1692 btrfs_tree_lock(next); 1693 clean_tree_block(trans, root, next); 1694 btrfs_wait_tree_block_writeback(next); 1695 btrfs_tree_unlock(next); 1696 1697 ret = btrfs_drop_leaf_ref(trans, root, next); 1698 BUG_ON(ret); 1699 1700 WARN_ON(root_owner != 1701 BTRFS_TREE_LOG_OBJECTID); 1702 ret = btrfs_free_reserved_extent(root, 1703 bytenr, blocksize); 1704 BUG_ON(ret); 1705 } 1706 free_extent_buffer(next); 1707 continue; 1708 } 1709 btrfs_read_buffer(next, ptr_gen); 1710 1711 WARN_ON(*level <= 0); 1712 if (path->nodes[*level-1]) 1713 free_extent_buffer(path->nodes[*level-1]); 1714 path->nodes[*level-1] = next; 1715 *level = btrfs_header_level(next); 1716 path->slots[*level] = 0; 1717 cond_resched(); 1718 } 1719 WARN_ON(*level < 0); 1720 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1721 1722 if (path->nodes[*level] == root->node) 1723 parent = path->nodes[*level]; 1724 else 1725 parent = path->nodes[*level + 1]; 1726 1727 bytenr = path->nodes[*level]->start; 1728 1729 blocksize = btrfs_level_size(root, *level); 1730 root_owner = btrfs_header_owner(parent); 1731 root_gen = btrfs_header_generation(parent); 1732 1733 wc->process_func(root, path->nodes[*level], wc, 1734 btrfs_header_generation(path->nodes[*level])); 1735 1736 if (wc->free) { 1737 next = path->nodes[*level]; 1738 btrfs_tree_lock(next); 1739 clean_tree_block(trans, root, next); 1740 
btrfs_wait_tree_block_writeback(next); 1741 btrfs_tree_unlock(next); 1742 1743 if (*level == 0) { 1744 ret = btrfs_drop_leaf_ref(trans, root, next); 1745 BUG_ON(ret); 1746 } 1747 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1748 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1749 BUG_ON(ret); 1750 } 1751 free_extent_buffer(path->nodes[*level]); 1752 path->nodes[*level] = NULL; 1753 *level += 1; 1754 1755 cond_resched(); 1756 return 0; 1757 } 1758 1759 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1760 struct btrfs_root *root, 1761 struct btrfs_path *path, int *level, 1762 struct walk_control *wc) 1763 { 1764 u64 root_owner; 1765 u64 root_gen; 1766 int i; 1767 int slot; 1768 int ret; 1769 1770 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1771 slot = path->slots[i]; 1772 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1773 struct extent_buffer *node; 1774 node = path->nodes[i]; 1775 path->slots[i]++; 1776 *level = i; 1777 WARN_ON(*level == 0); 1778 return 0; 1779 } else { 1780 struct extent_buffer *parent; 1781 if (path->nodes[*level] == root->node) 1782 parent = path->nodes[*level]; 1783 else 1784 parent = path->nodes[*level + 1]; 1785 1786 root_owner = btrfs_header_owner(parent); 1787 root_gen = btrfs_header_generation(parent); 1788 wc->process_func(root, path->nodes[*level], wc, 1789 btrfs_header_generation(path->nodes[*level])); 1790 if (wc->free) { 1791 struct extent_buffer *next; 1792 1793 next = path->nodes[*level]; 1794 1795 btrfs_tree_lock(next); 1796 clean_tree_block(trans, root, next); 1797 btrfs_wait_tree_block_writeback(next); 1798 btrfs_tree_unlock(next); 1799 1800 if (*level == 0) { 1801 ret = btrfs_drop_leaf_ref(trans, root, 1802 next); 1803 BUG_ON(ret); 1804 } 1805 1806 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1807 ret = btrfs_free_reserved_extent(root, 1808 path->nodes[*level]->start, 1809 path->nodes[*level]->len); 1810 BUG_ON(ret); 1811 } 1812 
free_extent_buffer(path->nodes[*level]); 1813 path->nodes[*level] = NULL; 1814 *level = i + 1; 1815 } 1816 } 1817 return 1; 1818 } 1819 1820 /* 1821 * drop the reference count on the tree rooted at 'snap'. This traverses 1822 * the tree freeing any blocks that have a ref count of zero after being 1823 * decremented. 1824 */ 1825 static int walk_log_tree(struct btrfs_trans_handle *trans, 1826 struct btrfs_root *log, struct walk_control *wc) 1827 { 1828 int ret = 0; 1829 int wret; 1830 int level; 1831 struct btrfs_path *path; 1832 int i; 1833 int orig_level; 1834 1835 path = btrfs_alloc_path(); 1836 BUG_ON(!path); 1837 1838 level = btrfs_header_level(log->node); 1839 orig_level = level; 1840 path->nodes[level] = log->node; 1841 extent_buffer_get(log->node); 1842 path->slots[level] = 0; 1843 1844 while (1) { 1845 wret = walk_down_log_tree(trans, log, path, &level, wc); 1846 if (wret > 0) 1847 break; 1848 if (wret < 0) 1849 ret = wret; 1850 1851 wret = walk_up_log_tree(trans, log, path, &level, wc); 1852 if (wret > 0) 1853 break; 1854 if (wret < 0) 1855 ret = wret; 1856 } 1857 1858 /* was the root node processed? 
if not, catch it here */ 1859 if (path->nodes[orig_level]) { 1860 wc->process_func(log, path->nodes[orig_level], wc, 1861 btrfs_header_generation(path->nodes[orig_level])); 1862 if (wc->free) { 1863 struct extent_buffer *next; 1864 1865 next = path->nodes[orig_level]; 1866 1867 btrfs_tree_lock(next); 1868 clean_tree_block(trans, log, next); 1869 btrfs_wait_tree_block_writeback(next); 1870 btrfs_tree_unlock(next); 1871 1872 if (orig_level == 0) { 1873 ret = btrfs_drop_leaf_ref(trans, log, 1874 next); 1875 BUG_ON(ret); 1876 } 1877 WARN_ON(log->root_key.objectid != 1878 BTRFS_TREE_LOG_OBJECTID); 1879 ret = btrfs_free_reserved_extent(log, next->start, 1880 next->len); 1881 BUG_ON(ret); 1882 } 1883 } 1884 1885 for (i = 0; i <= orig_level; i++) { 1886 if (path->nodes[i]) { 1887 free_extent_buffer(path->nodes[i]); 1888 path->nodes[i] = NULL; 1889 } 1890 } 1891 btrfs_free_path(path); 1892 if (wc->free) 1893 free_extent_buffer(log->node); 1894 return ret; 1895 } 1896 1897 static int wait_log_commit(struct btrfs_root *log) 1898 { 1899 DEFINE_WAIT(wait); 1900 u64 transid = log->fs_info->tree_log_transid; 1901 1902 do { 1903 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1904 TASK_UNINTERRUPTIBLE); 1905 mutex_unlock(&log->fs_info->tree_log_mutex); 1906 if (atomic_read(&log->fs_info->tree_log_commit)) 1907 schedule(); 1908 finish_wait(&log->fs_info->tree_log_wait, &wait); 1909 mutex_lock(&log->fs_info->tree_log_mutex); 1910 } while (transid == log->fs_info->tree_log_transid && 1911 atomic_read(&log->fs_info->tree_log_commit)); 1912 return 0; 1913 } 1914 1915 /* 1916 * btrfs_sync_log does sends a given tree log down to the disk and 1917 * updates the super blocks to record it. 
When this call is done, 1918 * you know that any inodes previously logged are safely on disk 1919 */ 1920 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1921 struct btrfs_root *root) 1922 { 1923 int ret; 1924 unsigned long batch; 1925 struct btrfs_root *log = root->log_root; 1926 1927 mutex_lock(&log->fs_info->tree_log_mutex); 1928 if (atomic_read(&log->fs_info->tree_log_commit)) { 1929 wait_log_commit(log); 1930 goto out; 1931 } 1932 atomic_set(&log->fs_info->tree_log_commit, 1); 1933 1934 while (1) { 1935 batch = log->fs_info->tree_log_batch; 1936 mutex_unlock(&log->fs_info->tree_log_mutex); 1937 schedule_timeout_uninterruptible(1); 1938 mutex_lock(&log->fs_info->tree_log_mutex); 1939 1940 while (atomic_read(&log->fs_info->tree_log_writers)) { 1941 DEFINE_WAIT(wait); 1942 prepare_to_wait(&log->fs_info->tree_log_wait, &wait, 1943 TASK_UNINTERRUPTIBLE); 1944 mutex_unlock(&log->fs_info->tree_log_mutex); 1945 if (atomic_read(&log->fs_info->tree_log_writers)) 1946 schedule(); 1947 mutex_lock(&log->fs_info->tree_log_mutex); 1948 finish_wait(&log->fs_info->tree_log_wait, &wait); 1949 } 1950 if (batch == log->fs_info->tree_log_batch) 1951 break; 1952 } 1953 1954 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 1955 BUG_ON(ret); 1956 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, 1957 &root->fs_info->log_root_tree->dirty_log_pages); 1958 BUG_ON(ret); 1959 1960 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 1961 log->fs_info->log_root_tree->node->start); 1962 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 1963 btrfs_header_level(log->fs_info->log_root_tree->node)); 1964 1965 write_ctree_super(trans, log->fs_info->tree_root, 2); 1966 log->fs_info->tree_log_transid++; 1967 log->fs_info->tree_log_batch = 0; 1968 atomic_set(&log->fs_info->tree_log_commit, 0); 1969 smp_mb(); 1970 if (waitqueue_active(&log->fs_info->tree_log_wait)) 1971 wake_up(&log->fs_info->tree_log_wait); 1972 out: 1973 
mutex_unlock(&log->fs_info->tree_log_mutex); 1974 return 0; 1975 } 1976 1977 /* * free all the extents used by the tree log. This should be called 1978 * at commit time of the full transaction 1979 */ 1980 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 1981 { 1982 int ret; 1983 struct btrfs_root *log; 1984 struct key; 1985 u64 start; 1986 u64 end; 1987 struct walk_control wc = { 1988 .free = 1, 1989 .process_func = process_one_buffer 1990 }; 1991 1992 if (!root->log_root || root->fs_info->log_root_recovering) 1993 return 0; 1994 1995 log = root->log_root; 1996 ret = walk_log_tree(trans, log, &wc); 1997 BUG_ON(ret); 1998 1999 while (1) { 2000 ret = find_first_extent_bit(&log->dirty_log_pages, 2001 0, &start, &end, EXTENT_DIRTY); 2002 if (ret) 2003 break; 2004 2005 clear_extent_dirty(&log->dirty_log_pages, 2006 start, end, GFP_NOFS); 2007 } 2008 2009 log = root->log_root; 2010 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2011 &log->root_key); 2012 BUG_ON(ret); 2013 root->log_root = NULL; 2014 kfree(root->log_root); 2015 return 0; 2016 } 2017 2018 /* 2019 * helper function to update the item for a given subvolumes log root 2020 * in the tree of log roots 2021 */ 2022 static int update_log_root(struct btrfs_trans_handle *trans, 2023 struct btrfs_root *log) 2024 { 2025 u64 bytenr = btrfs_root_bytenr(&log->root_item); 2026 int ret; 2027 2028 if (log->node->start == bytenr) 2029 return 0; 2030 2031 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2032 btrfs_set_root_generation(&log->root_item, trans->transid); 2033 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); 2034 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2035 &log->root_key, &log->root_item); 2036 BUG_ON(ret); 2037 return ret; 2038 } 2039 2040 /* 2041 * If both a file and directory are logged, and unlinks or renames are 2042 * mixed in, we have a few interesting corners: 2043 * 2044 * create file X in dir Y 2045 * link file X 
to X.link in dir Y 2046 * fsync file X 2047 * unlink file X but leave X.link 2048 * fsync dir Y 2049 * 2050 * After a crash we would expect only X.link to exist. But file X 2051 * didn't get fsync'd again so the log has back refs for X and X.link. 2052 * 2053 * We solve this by removing directory entries and inode backrefs from the 2054 * log when a file that was logged in the current transaction is 2055 * unlinked. Any later fsync will include the updated log entries, and 2056 * we'll be able to reconstruct the proper directory items from backrefs. 2057 * 2058 * This optimizations allows us to avoid relogging the entire inode 2059 * or the entire directory. 2060 */ 2061 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2062 struct btrfs_root *root, 2063 const char *name, int name_len, 2064 struct inode *dir, u64 index) 2065 { 2066 struct btrfs_root *log; 2067 struct btrfs_dir_item *di; 2068 struct btrfs_path *path; 2069 int ret; 2070 int bytes_del = 0; 2071 2072 if (BTRFS_I(dir)->logged_trans < trans->transid) 2073 return 0; 2074 2075 ret = join_running_log_trans(root); 2076 if (ret) 2077 return 0; 2078 2079 mutex_lock(&BTRFS_I(dir)->log_mutex); 2080 2081 log = root->log_root; 2082 path = btrfs_alloc_path(); 2083 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2084 name, name_len, -1); 2085 if (di && !IS_ERR(di)) { 2086 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2087 bytes_del += name_len; 2088 BUG_ON(ret); 2089 } 2090 btrfs_release_path(log, path); 2091 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2092 index, name, name_len, -1); 2093 if (di && !IS_ERR(di)) { 2094 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2095 bytes_del += name_len; 2096 BUG_ON(ret); 2097 } 2098 2099 /* update the directory size in the log to reflect the names 2100 * we have removed 2101 */ 2102 if (bytes_del) { 2103 struct btrfs_key key; 2104 2105 key.objectid = dir->i_ino; 2106 key.offset = 0; 2107 key.type = 
BTRFS_INODE_ITEM_KEY; 2108 btrfs_release_path(log, path); 2109 2110 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2111 if (ret == 0) { 2112 struct btrfs_inode_item *item; 2113 u64 i_size; 2114 2115 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2116 struct btrfs_inode_item); 2117 i_size = btrfs_inode_size(path->nodes[0], item); 2118 if (i_size > bytes_del) 2119 i_size -= bytes_del; 2120 else 2121 i_size = 0; 2122 btrfs_set_inode_size(path->nodes[0], item, i_size); 2123 btrfs_mark_buffer_dirty(path->nodes[0]); 2124 } else 2125 ret = 0; 2126 btrfs_release_path(log, path); 2127 } 2128 2129 btrfs_free_path(path); 2130 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2131 end_log_trans(root); 2132 2133 return 0; 2134 } 2135 2136 /* see comments for btrfs_del_dir_entries_in_log */ 2137 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2138 struct btrfs_root *root, 2139 const char *name, int name_len, 2140 struct inode *inode, u64 dirid) 2141 { 2142 struct btrfs_root *log; 2143 u64 index; 2144 int ret; 2145 2146 if (BTRFS_I(inode)->logged_trans < trans->transid) 2147 return 0; 2148 2149 ret = join_running_log_trans(root); 2150 if (ret) 2151 return 0; 2152 log = root->log_root; 2153 mutex_lock(&BTRFS_I(inode)->log_mutex); 2154 2155 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2156 dirid, &index); 2157 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2158 end_log_trans(root); 2159 2160 return ret; 2161 } 2162 2163 /* 2164 * creates a range item in the log for 'dirid'. first_offset and 2165 * last_offset tell us which parts of the key space the log should 2166 * be considered authoritative for. 
2167 */ 2168 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2169 struct btrfs_root *log, 2170 struct btrfs_path *path, 2171 int key_type, u64 dirid, 2172 u64 first_offset, u64 last_offset) 2173 { 2174 int ret; 2175 struct btrfs_key key; 2176 struct btrfs_dir_log_item *item; 2177 2178 key.objectid = dirid; 2179 key.offset = first_offset; 2180 if (key_type == BTRFS_DIR_ITEM_KEY) 2181 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2182 else 2183 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2184 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2185 BUG_ON(ret); 2186 2187 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2188 struct btrfs_dir_log_item); 2189 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2190 btrfs_mark_buffer_dirty(path->nodes[0]); 2191 btrfs_release_path(log, path); 2192 return 0; 2193 } 2194 2195 /* 2196 * log all the items included in the current transaction for a given 2197 * directory. This also creates the range items in the log tree required 2198 * to replay anything deleted before the fsync 2199 */ 2200 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2201 struct btrfs_root *root, struct inode *inode, 2202 struct btrfs_path *path, 2203 struct btrfs_path *dst_path, int key_type, 2204 u64 min_offset, u64 *last_offset_ret) 2205 { 2206 struct btrfs_key min_key; 2207 struct btrfs_key max_key; 2208 struct btrfs_root *log = root->log_root; 2209 struct extent_buffer *src; 2210 int ret; 2211 int i; 2212 int nritems; 2213 u64 first_offset = min_offset; 2214 u64 last_offset = (u64)-1; 2215 2216 log = root->log_root; 2217 max_key.objectid = inode->i_ino; 2218 max_key.offset = (u64)-1; 2219 max_key.type = key_type; 2220 2221 min_key.objectid = inode->i_ino; 2222 min_key.type = key_type; 2223 min_key.offset = min_offset; 2224 2225 path->keep_locks = 1; 2226 2227 ret = btrfs_search_forward(root, &min_key, &max_key, 2228 path, 0, trans->transid); 2229 2230 /* 2231 * we didn't find anything from this 
transaction, see if there 2232 * is anything at all 2233 */ 2234 if (ret != 0 || min_key.objectid != inode->i_ino || 2235 min_key.type != key_type) { 2236 min_key.objectid = inode->i_ino; 2237 min_key.type = key_type; 2238 min_key.offset = (u64)-1; 2239 btrfs_release_path(root, path); 2240 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2241 if (ret < 0) { 2242 btrfs_release_path(root, path); 2243 return ret; 2244 } 2245 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2246 2247 /* if ret == 0 there are items for this type, 2248 * create a range to tell us the last key of this type. 2249 * otherwise, there are no items in this directory after 2250 * *min_offset, and we create a range to indicate that. 2251 */ 2252 if (ret == 0) { 2253 struct btrfs_key tmp; 2254 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2255 path->slots[0]); 2256 if (key_type == tmp.type) 2257 first_offset = max(min_offset, tmp.offset) + 1; 2258 } 2259 goto done; 2260 } 2261 2262 /* go backward to find any previous key */ 2263 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2264 if (ret == 0) { 2265 struct btrfs_key tmp; 2266 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2267 if (key_type == tmp.type) { 2268 first_offset = tmp.offset; 2269 ret = overwrite_item(trans, log, dst_path, 2270 path->nodes[0], path->slots[0], 2271 &tmp); 2272 } 2273 } 2274 btrfs_release_path(root, path); 2275 2276 /* find the first key from this transaction again */ 2277 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2278 if (ret != 0) { 2279 WARN_ON(1); 2280 goto done; 2281 } 2282 2283 /* 2284 * we have a block from this transaction, log every item in it 2285 * from our directory 2286 */ 2287 while (1) { 2288 struct btrfs_key tmp; 2289 src = path->nodes[0]; 2290 nritems = btrfs_header_nritems(src); 2291 for (i = path->slots[0]; i < nritems; i++) { 2292 btrfs_item_key_to_cpu(src, &min_key, i); 2293 2294 if (min_key.objectid != inode->i_ino || 2295 
min_key.type != key_type) 2296 goto done; 2297 ret = overwrite_item(trans, log, dst_path, src, i, 2298 &min_key); 2299 BUG_ON(ret); 2300 } 2301 path->slots[0] = nritems; 2302 2303 /* 2304 * look ahead to the next item and see if it is also 2305 * from this directory and from this transaction 2306 */ 2307 ret = btrfs_next_leaf(root, path); 2308 if (ret == 1) { 2309 last_offset = (u64)-1; 2310 goto done; 2311 } 2312 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2313 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2314 last_offset = (u64)-1; 2315 goto done; 2316 } 2317 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2318 ret = overwrite_item(trans, log, dst_path, 2319 path->nodes[0], path->slots[0], 2320 &tmp); 2321 2322 BUG_ON(ret); 2323 last_offset = tmp.offset; 2324 goto done; 2325 } 2326 } 2327 done: 2328 *last_offset_ret = last_offset; 2329 btrfs_release_path(root, path); 2330 btrfs_release_path(log, dst_path); 2331 2332 /* insert the log range keys to indicate where the log is valid */ 2333 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2334 first_offset, last_offset); 2335 BUG_ON(ret); 2336 return 0; 2337 } 2338 2339 /* 2340 * logging directories is very similar to logging inodes, We find all the items 2341 * from the current transaction and write them to the log. 2342 * 2343 * The recovery code scans the directory in the subvolume, and if it finds a 2344 * key in the range logged that is not present in the log tree, then it means 2345 * that dir entry was unlinked during the transaction. 2346 * 2347 * In order for that scan to work, we must include one key smaller than 2348 * the smallest logged by this transaction and one key larger than the largest 2349 * key logged by this transaction. 
2350 */ 2351 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2352 struct btrfs_root *root, struct inode *inode, 2353 struct btrfs_path *path, 2354 struct btrfs_path *dst_path) 2355 { 2356 u64 min_key; 2357 u64 max_key; 2358 int ret; 2359 int key_type = BTRFS_DIR_ITEM_KEY; 2360 2361 again: 2362 min_key = 0; 2363 max_key = 0; 2364 while (1) { 2365 ret = log_dir_items(trans, root, inode, path, 2366 dst_path, key_type, min_key, 2367 &max_key); 2368 BUG_ON(ret); 2369 if (max_key == (u64)-1) 2370 break; 2371 min_key = max_key + 1; 2372 } 2373 2374 if (key_type == BTRFS_DIR_ITEM_KEY) { 2375 key_type = BTRFS_DIR_INDEX_KEY; 2376 goto again; 2377 } 2378 return 0; 2379 } 2380 2381 /* 2382 * a helper function to drop items from the log before we relog an 2383 * inode. max_key_type indicates the highest item type to remove. 2384 * This cannot be run for file data extents because it does not 2385 * free the extents they point to. 2386 */ 2387 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2388 struct btrfs_root *log, 2389 struct btrfs_path *path, 2390 u64 objectid, int max_key_type) 2391 { 2392 int ret; 2393 struct btrfs_key key; 2394 struct btrfs_key found_key; 2395 2396 key.objectid = objectid; 2397 key.type = max_key_type; 2398 key.offset = (u64)-1; 2399 2400 while (1) { 2401 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2402 2403 if (ret != 1) 2404 break; 2405 2406 if (path->slots[0] == 0) 2407 break; 2408 2409 path->slots[0]--; 2410 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2411 path->slots[0]); 2412 2413 if (found_key.objectid != objectid) 2414 break; 2415 2416 ret = btrfs_del_item(trans, log, path); 2417 BUG_ON(ret); 2418 btrfs_release_path(log, path); 2419 } 2420 btrfs_release_path(log, path); 2421 return 0; 2422 } 2423 2424 static noinline int copy_items(struct btrfs_trans_handle *trans, 2425 struct btrfs_root *log, 2426 struct btrfs_path *dst_path, 2427 struct extent_buffer *src, 2428 int start_slot, int 
nr, int inode_only) 2429 { 2430 unsigned long src_offset; 2431 unsigned long dst_offset; 2432 struct btrfs_file_extent_item *extent; 2433 struct btrfs_inode_item *inode_item; 2434 int ret; 2435 struct btrfs_key *ins_keys; 2436 u32 *ins_sizes; 2437 char *ins_data; 2438 int i; 2439 struct list_head ordered_sums; 2440 2441 INIT_LIST_HEAD(&ordered_sums); 2442 2443 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2444 nr * sizeof(u32), GFP_NOFS); 2445 ins_sizes = (u32 *)ins_data; 2446 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2447 2448 for (i = 0; i < nr; i++) { 2449 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2450 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2451 } 2452 ret = btrfs_insert_empty_items(trans, log, dst_path, 2453 ins_keys, ins_sizes, nr); 2454 BUG_ON(ret); 2455 2456 for (i = 0; i < nr; i++) { 2457 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2458 dst_path->slots[0]); 2459 2460 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2461 2462 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2463 src_offset, ins_sizes[i]); 2464 2465 if (inode_only == LOG_INODE_EXISTS && 2466 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2467 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2468 dst_path->slots[0], 2469 struct btrfs_inode_item); 2470 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2471 2472 /* set the generation to zero so the recover code 2473 * can tell the difference between an logging 2474 * just to say 'this inode exists' and a logging 2475 * to say 'update this inode with these values' 2476 */ 2477 btrfs_set_inode_generation(dst_path->nodes[0], 2478 inode_item, 0); 2479 } 2480 /* take a reference on file data extents so that truncates 2481 * or deletes of this inode don't have to relog the inode 2482 * again 2483 */ 2484 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2485 int found_type; 2486 extent = btrfs_item_ptr(src, start_slot + i, 2487 struct 
btrfs_file_extent_item); 2488 2489 found_type = btrfs_file_extent_type(src, extent); 2490 if (found_type == BTRFS_FILE_EXTENT_REG || 2491 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2492 u64 ds = btrfs_file_extent_disk_bytenr(src, 2493 extent); 2494 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2495 extent); 2496 u64 cs = btrfs_file_extent_offset(src, extent); 2497 u64 cl = btrfs_file_extent_num_bytes(src, 2498 extent);; 2499 if (btrfs_file_extent_compression(src, 2500 extent)) { 2501 cs = 0; 2502 cl = dl; 2503 } 2504 /* ds == 0 is a hole */ 2505 if (ds != 0) { 2506 ret = btrfs_inc_extent_ref(trans, log, 2507 ds, dl, 2508 dst_path->nodes[0]->start, 2509 BTRFS_TREE_LOG_OBJECTID, 2510 trans->transid, 2511 ins_keys[i].objectid); 2512 BUG_ON(ret); 2513 ret = btrfs_lookup_csums_range( 2514 log->fs_info->csum_root, 2515 ds + cs, ds + cs + cl - 1, 2516 &ordered_sums); 2517 BUG_ON(ret); 2518 } 2519 } 2520 } 2521 dst_path->slots[0]++; 2522 } 2523 2524 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2525 btrfs_release_path(log, dst_path); 2526 kfree(ins_data); 2527 2528 /* 2529 * we have to do this after the loop above to avoid changing the 2530 * log tree while trying to change the log tree. 2531 */ 2532 while (!list_empty(&ordered_sums)) { 2533 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2534 struct btrfs_ordered_sum, 2535 list); 2536 ret = btrfs_csum_file_blocks(trans, log, sums); 2537 BUG_ON(ret); 2538 list_del(&sums->list); 2539 kfree(sums); 2540 } 2541 return 0; 2542 } 2543 2544 /* log a single inode in the tree log. 2545 * At least one parent directory for this inode must exist in the tree 2546 * or be logged already. 2547 * 2548 * Any items from this inode changed by the current transaction are copied 2549 * to the log tree. An extra reference is taken on any extents in this 2550 * file, allowing us to avoid a whole pile of corner cases around logging 2551 * blocks that have been removed from the tree. 
2552 * 2553 * See LOG_INODE_ALL and related defines for a description of what inode_only 2554 * does. 2555 * 2556 * This handles both files and directories. 2557 */ 2558 static int __btrfs_log_inode(struct btrfs_trans_handle *trans, 2559 struct btrfs_root *root, struct inode *inode, 2560 int inode_only) 2561 { 2562 struct btrfs_path *path; 2563 struct btrfs_path *dst_path; 2564 struct btrfs_key min_key; 2565 struct btrfs_key max_key; 2566 struct btrfs_root *log = root->log_root; 2567 struct extent_buffer *src = NULL; 2568 u32 size; 2569 int ret; 2570 int nritems; 2571 int ins_start_slot = 0; 2572 int ins_nr; 2573 2574 log = root->log_root; 2575 2576 path = btrfs_alloc_path(); 2577 dst_path = btrfs_alloc_path(); 2578 2579 min_key.objectid = inode->i_ino; 2580 min_key.type = BTRFS_INODE_ITEM_KEY; 2581 min_key.offset = 0; 2582 2583 max_key.objectid = inode->i_ino; 2584 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2585 max_key.type = BTRFS_XATTR_ITEM_KEY; 2586 else 2587 max_key.type = (u8)-1; 2588 max_key.offset = (u64)-1; 2589 2590 /* 2591 * if this inode has already been logged and we're in inode_only 2592 * mode, we don't want to delete the things that have already 2593 * been written to the log. 2594 * 2595 * But, if the inode has been through an inode_only log, 2596 * the logged_trans field is not set. This allows us to catch 2597 * any new names for this inode in the backrefs by logging it 2598 * again 2599 */ 2600 if (inode_only == LOG_INODE_EXISTS && 2601 BTRFS_I(inode)->logged_trans == trans->transid) { 2602 btrfs_free_path(path); 2603 btrfs_free_path(dst_path); 2604 goto out; 2605 } 2606 mutex_lock(&BTRFS_I(inode)->log_mutex); 2607 2608 /* 2609 * a brute force approach to making sure we get the most uptodate 2610 * copies of everything. 
2611 */ 2612 if (S_ISDIR(inode->i_mode)) { 2613 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2614 2615 if (inode_only == LOG_INODE_EXISTS) 2616 max_key_type = BTRFS_XATTR_ITEM_KEY; 2617 ret = drop_objectid_items(trans, log, path, 2618 inode->i_ino, max_key_type); 2619 } else { 2620 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2621 } 2622 BUG_ON(ret); 2623 path->keep_locks = 1; 2624 2625 while (1) { 2626 ins_nr = 0; 2627 ret = btrfs_search_forward(root, &min_key, &max_key, 2628 path, 0, trans->transid); 2629 if (ret != 0) 2630 break; 2631 again: 2632 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2633 if (min_key.objectid != inode->i_ino) 2634 break; 2635 if (min_key.type > max_key.type) 2636 break; 2637 2638 src = path->nodes[0]; 2639 size = btrfs_item_size_nr(src, path->slots[0]); 2640 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2641 ins_nr++; 2642 goto next_slot; 2643 } else if (!ins_nr) { 2644 ins_start_slot = path->slots[0]; 2645 ins_nr = 1; 2646 goto next_slot; 2647 } 2648 2649 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2650 ins_nr, inode_only); 2651 BUG_ON(ret); 2652 ins_nr = 1; 2653 ins_start_slot = path->slots[0]; 2654 next_slot: 2655 2656 nritems = btrfs_header_nritems(path->nodes[0]); 2657 path->slots[0]++; 2658 if (path->slots[0] < nritems) { 2659 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2660 path->slots[0]); 2661 goto again; 2662 } 2663 if (ins_nr) { 2664 ret = copy_items(trans, log, dst_path, src, 2665 ins_start_slot, 2666 ins_nr, inode_only); 2667 BUG_ON(ret); 2668 ins_nr = 0; 2669 } 2670 btrfs_release_path(root, path); 2671 2672 if (min_key.offset < (u64)-1) 2673 min_key.offset++; 2674 else if (min_key.type < (u8)-1) 2675 min_key.type++; 2676 else if (min_key.objectid < (u64)-1) 2677 min_key.objectid++; 2678 else 2679 break; 2680 } 2681 if (ins_nr) { 2682 ret = copy_items(trans, log, dst_path, src, 2683 ins_start_slot, 2684 ins_nr, inode_only); 2685 BUG_ON(ret); 2686 ins_nr = 0; 2687 
} 2688 WARN_ON(ins_nr); 2689 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2690 btrfs_release_path(root, path); 2691 btrfs_release_path(log, dst_path); 2692 BTRFS_I(inode)->log_dirty_trans = 0; 2693 ret = log_directory_changes(trans, root, inode, path, dst_path); 2694 BUG_ON(ret); 2695 } 2696 BTRFS_I(inode)->logged_trans = trans->transid; 2697 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2698 2699 btrfs_free_path(path); 2700 btrfs_free_path(dst_path); 2701 2702 mutex_lock(&root->fs_info->tree_log_mutex); 2703 ret = update_log_root(trans, log); 2704 BUG_ON(ret); 2705 mutex_unlock(&root->fs_info->tree_log_mutex); 2706 out: 2707 return 0; 2708 } 2709 2710 int btrfs_log_inode(struct btrfs_trans_handle *trans, 2711 struct btrfs_root *root, struct inode *inode, 2712 int inode_only) 2713 { 2714 int ret; 2715 2716 start_log_trans(trans, root); 2717 ret = __btrfs_log_inode(trans, root, inode, inode_only); 2718 end_log_trans(root); 2719 return ret; 2720 } 2721 2722 /* 2723 * helper function around btrfs_log_inode to make sure newly created 2724 * parent directories also end up in the log. 
A minimal inode and backref
 * only logging is done of any parent directories whose generation is
 * newer than the last committed transaction
 */
int btrfs_log_dentry(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct dentry *dentry)
{
	/* the dentry itself is logged in full, its parents only minimally */
	int log_mode = LOG_INODE_ALL;
	struct super_block *start_sb;
	int err;

	start_log_trans(trans, root);
	start_sb = dentry->d_inode->i_sb;

	do {
		err = __btrfs_log_inode(trans, root, dentry->d_inode,
					log_mode);
		BUG_ON(err);
		log_mode = LOG_INODE_EXISTS;

		dentry = dentry->d_parent;

		/*
		 * keep climbing while there is a parent with an inode on
		 * the same super block whose generation is newer than the
		 * last committed transaction
		 */
	} while (dentry && dentry->d_inode &&
		 start_sb == dentry->d_inode->i_sb &&
		 BTRFS_I(dentry->d_inode)->generation >
		 root->fs_info->last_trans_committed);

	end_log_trans(root);
	return 0;
}

/*
 * it is not safe to log dentry if the chunk root has added new
 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
2760 */ 2761 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2762 struct btrfs_root *root, struct dentry *dentry) 2763 { 2764 u64 gen; 2765 gen = root->fs_info->last_trans_new_blockgroup; 2766 if (gen > root->fs_info->last_trans_committed) 2767 return 1; 2768 else 2769 return btrfs_log_dentry(trans, root, dentry); 2770 } 2771 2772 /* 2773 * should be called during mount to recover any replay any log trees 2774 * from the FS 2775 */ 2776 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 2777 { 2778 int ret; 2779 struct btrfs_path *path; 2780 struct btrfs_trans_handle *trans; 2781 struct btrfs_key key; 2782 struct btrfs_key found_key; 2783 struct btrfs_key tmp_key; 2784 struct btrfs_root *log; 2785 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 2786 u64 highest_inode; 2787 struct walk_control wc = { 2788 .process_func = process_one_buffer, 2789 .stage = 0, 2790 }; 2791 2792 fs_info->log_root_recovering = 1; 2793 path = btrfs_alloc_path(); 2794 BUG_ON(!path); 2795 2796 trans = btrfs_start_transaction(fs_info->tree_root, 1); 2797 2798 wc.trans = trans; 2799 wc.pin = 1; 2800 2801 walk_log_tree(trans, log_root_tree, &wc); 2802 2803 again: 2804 key.objectid = BTRFS_TREE_LOG_OBJECTID; 2805 key.offset = (u64)-1; 2806 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 2807 2808 while (1) { 2809 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 2810 if (ret < 0) 2811 break; 2812 if (ret > 0) { 2813 if (path->slots[0] == 0) 2814 break; 2815 path->slots[0]--; 2816 } 2817 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2818 path->slots[0]); 2819 btrfs_release_path(log_root_tree, path); 2820 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 2821 break; 2822 2823 log = btrfs_read_fs_root_no_radix(log_root_tree, 2824 &found_key); 2825 BUG_ON(!log); 2826 2827 2828 tmp_key.objectid = found_key.offset; 2829 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 2830 tmp_key.offset = (u64)-1; 2831 2832 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, 
&tmp_key); 2833 BUG_ON(!wc.replay_dest); 2834 2835 wc.replay_dest->log_root = log; 2836 btrfs_record_root_in_trans(wc.replay_dest); 2837 ret = walk_log_tree(trans, log, &wc); 2838 BUG_ON(ret); 2839 2840 if (wc.stage == LOG_WALK_REPLAY_ALL) { 2841 ret = fixup_inode_link_counts(trans, wc.replay_dest, 2842 path); 2843 BUG_ON(ret); 2844 } 2845 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); 2846 if (ret == 0) { 2847 wc.replay_dest->highest_inode = highest_inode; 2848 wc.replay_dest->last_inode_alloc = highest_inode; 2849 } 2850 2851 key.offset = found_key.offset - 1; 2852 wc.replay_dest->log_root = NULL; 2853 free_extent_buffer(log->node); 2854 kfree(log); 2855 2856 if (found_key.offset == 0) 2857 break; 2858 } 2859 btrfs_release_path(log_root_tree, path); 2860 2861 /* step one is to pin it all, step two is to replay just inodes */ 2862 if (wc.pin) { 2863 wc.pin = 0; 2864 wc.process_func = replay_one_buffer; 2865 wc.stage = LOG_WALK_REPLAY_INODES; 2866 goto again; 2867 } 2868 /* step three is to replay everything */ 2869 if (wc.stage < LOG_WALK_REPLAY_ALL) { 2870 wc.stage++; 2871 goto again; 2872 } 2873 2874 btrfs_free_path(path); 2875 2876 free_extent_buffer(log_root_tree->node); 2877 log_root_tree->log_root = NULL; 2878 fs_info->log_root_recovering = 0; 2879 2880 /* step 4: commit the transaction, which also unpins the blocks */ 2881 btrfs_commit_transaction(trans, fs_info->tree_root); 2882 2883 kfree(log_root_tree); 2884 return 0; 2885 } 2886