1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include "ctree.h" 21 #include "transaction.h" 22 #include "disk-io.h" 23 #include "locking.h" 24 #include "print-tree.h" 25 #include "compat.h" 26 #include "tree-log.h" 27 28 /* magic values for the inode_only field in btrfs_log_inode: 29 * 30 * LOG_INODE_ALL means to log everything 31 * LOG_INODE_EXISTS means to log just enough to recreate the inode 32 * during log replay 33 */ 34 #define LOG_INODE_ALL 0 35 #define LOG_INODE_EXISTS 1 36 37 /* 38 * directory trouble cases 39 * 40 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 41 * log, we must force a full commit before doing an fsync of the directory 42 * where the unlink was done. 43 * ---> record transid of last unlink/rename per directory 44 * 45 * mkdir foo/some_dir 46 * normal commit 47 * rename foo/some_dir foo2/some_dir 48 * mkdir foo/some_dir 49 * fsync foo/some_dir/some_file 50 * 51 * The fsync above will unlink the original some_dir without recording 52 * it in its new location (foo2). After a crash, some_dir will be gone 53 * unless the fsync of some_file forces a full commit 54 * 55 * 2) we must log any new names for any file or dir that is in the fsync 56 * log. ---> check inode while renaming/linking. 57 * 58 * 2a) we must log any new names for any file or dir during rename 59 * when the directory they are being removed from was logged. 60 * ---> check inode and old parent dir during rename 61 * 62 * 2a is actually the more important variant. With the extra logging 63 * a crash might unlink the old name without recreating the new one 64 * 65 * 3) after a crash, we must go through any directories with a link count 66 * of zero and redo the rm -rf 67 * 68 * mkdir f1/foo 69 * normal commit 70 * rm -rf f1/foo 71 * fsync(f1) 72 * 73 * The directory f1 was fully removed from the FS, but fsync was never 74 * called on f1, only its parent dir. After a crash the rm -rf must 75 * be replayed. This must be able to recurse down the entire 76 * directory tree. The inode link count fixup code takes care of the 77 * ugly details. 78 */ 79 80 /* 81 * stages for the tree walking. The first 82 * stage (0) is to only pin down the blocks we find 83 * the second stage (1) is to make sure that all the inodes 84 * we find in the log are created in the subvolume. 85 * 86 * The last stage is to deal with directories and links and extents 87 * and all the other fun semantics 88 */ 89 #define LOG_WALK_PIN_ONLY 0 90 #define LOG_WALK_REPLAY_INODES 1 91 #define LOG_WALK_REPLAY_ALL 2 92 93 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 94 struct btrfs_root *root, struct inode *inode, 95 int inode_only); 96 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 97 struct btrfs_root *root, 98 struct btrfs_path *path, u64 objectid); 99 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 100 struct btrfs_root *root, 101 struct btrfs_root *log, 102 struct btrfs_path *path, 103 u64 dirid, int del_all); 104 105 /* 106 * tree logging is a special write ahead log used to make sure that 107 * fsyncs and O_SYNCs can happen without doing full tree commits. 108 * 109 * Full tree commits are expensive because they require commonly 110 * modified blocks to be recowed, creating many dirty pages in the 111 * extent tree an 4x-6x higher write load than ext3. 112 * 113 * Instead of doing a tree commit on every fsync, we use the 114 * key ranges and transaction ids to find items for a given file or directory 115 * that have changed in this transaction. Those items are copied into 116 * a special tree (one per subvolume root), that tree is written to disk 117 * and then the fsync is considered complete. 118 * 119 * After a crash, items are copied out of the log-tree back into the 120 * subvolume tree. Any file data extents found are recorded in the extent 121 * allocation tree, and the log-tree freed. 122 * 123 * The log tree is read three times, once to pin down all the extents it is 124 * using in ram and once, once to create all the inodes logged in the tree 125 * and once to do all the other items. 126 */ 127 128 /* 129 * start a sub transaction and setup the log tree 130 * this increments the log tree writer count to make the people 131 * syncing the tree wait for us to finish 132 */ 133 static int start_log_trans(struct btrfs_trans_handle *trans, 134 struct btrfs_root *root) 135 { 136 int ret; 137 138 mutex_lock(&root->log_mutex); 139 if (root->log_root) { 140 root->log_batch++; 141 atomic_inc(&root->log_writers); 142 mutex_unlock(&root->log_mutex); 143 return 0; 144 } 145 mutex_lock(&root->fs_info->tree_log_mutex); 146 if (!root->fs_info->log_root_tree) { 147 ret = btrfs_init_log_root_tree(trans, root->fs_info); 148 BUG_ON(ret); 149 } 150 if (!root->log_root) { 151 ret = btrfs_add_log_tree(trans, root); 152 BUG_ON(ret); 153 } 154 mutex_unlock(&root->fs_info->tree_log_mutex); 155 root->log_batch++; 156 atomic_inc(&root->log_writers); 157 mutex_unlock(&root->log_mutex); 158 return 0; 159 } 160 161 /* 162 * returns 0 if there was a log transaction running and we were able 163 * to join, or returns -ENOENT if there were not transactions 164 * in progress 165 */ 166 static int join_running_log_trans(struct btrfs_root *root) 167 { 168 int ret = -ENOENT; 169 170 smp_mb(); 171 if (!root->log_root) 172 return -ENOENT; 173 174 mutex_lock(&root->log_mutex); 175 if (root->log_root) { 176 ret = 0; 177 atomic_inc(&root->log_writers); 178 } 179 mutex_unlock(&root->log_mutex); 180 return ret; 181 } 182 183 /* 184 * This either makes the current running log transaction wait 185 * until you call btrfs_end_log_trans() or it makes any future 186 * log transactions wait until you call btrfs_end_log_trans() 187 */ 188 int btrfs_pin_log_trans(struct btrfs_root *root) 189 { 190 int ret = -ENOENT; 191 192 mutex_lock(&root->log_mutex); 193 atomic_inc(&root->log_writers); 194 mutex_unlock(&root->log_mutex); 195 return ret; 196 } 197 198 /* 199 * indicate we're done making changes to the log tree 200 * and wake up anyone waiting to do a sync 201 */ 202 int btrfs_end_log_trans(struct btrfs_root *root) 203 { 204 if (atomic_dec_and_test(&root->log_writers)) { 205 smp_mb(); 206 if (waitqueue_active(&root->log_writer_wait)) 207 wake_up(&root->log_writer_wait); 208 } 209 return 0; 210 } 211 212 213 /* 214 * the walk control struct is used to pass state down the chain when 215 * processing the log tree. The stage field tells us which part 216 * of the log tree processing we are currently doing. The others 217 * are state fields used for that specific part 218 */ 219 struct walk_control { 220 /* should we free the extent on disk when done? This is used 221 * at transaction commit time while freeing a log tree 222 */ 223 int free; 224 225 /* should we write out the extent buffer? This is used 226 * while flushing the log tree to disk during a sync 227 */ 228 int write; 229 230 /* should we wait for the extent buffer io to finish? Also used 231 * while flushing the log tree to disk for a sync 232 */ 233 int wait; 234 235 /* pin only walk, we record which extents on disk belong to the 236 * log trees 237 */ 238 int pin; 239 240 /* what stage of the replay code we're currently in */ 241 int stage; 242 243 /* the root we are currently replaying */ 244 struct btrfs_root *replay_dest; 245 246 /* the trans handle for the current replay */ 247 struct btrfs_trans_handle *trans; 248 249 /* the function that gets used to process blocks we find in the 250 * tree. Note the extent_buffer might not be up to date when it is 251 * passed in, and it must be checked or read if you need the data 252 * inside it 253 */ 254 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 255 struct walk_control *wc, u64 gen); 256 }; 257 258 /* 259 * process_func used to pin down extents, write them or wait on them 260 */ 261 static int process_one_buffer(struct btrfs_root *log, 262 struct extent_buffer *eb, 263 struct walk_control *wc, u64 gen) 264 { 265 if (wc->pin) 266 btrfs_update_pinned_extents(log->fs_info->extent_root, 267 eb->start, eb->len, 1); 268 269 if (btrfs_buffer_uptodate(eb, gen)) { 270 if (wc->write) 271 btrfs_write_tree_block(eb); 272 if (wc->wait) 273 btrfs_wait_tree_block_writeback(eb); 274 } 275 return 0; 276 } 277 278 /* 279 * Item overwrite used by replay and tree logging. eb, slot and key all refer 280 * to the src data we are copying out. 281 * 282 * root is the tree we are copying into, and path is a scratch 283 * path for use in this function (it should be released on entry and 284 * will be released on exit). 285 * 286 * If the key is already in the destination tree the existing item is 287 * overwritten. If the existing item isn't big enough, it is extended. 288 * If it is too large, it is truncated. 289 * 290 * If the key isn't in the destination yet, a new item is inserted. 291 */ 292 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 293 struct btrfs_root *root, 294 struct btrfs_path *path, 295 struct extent_buffer *eb, int slot, 296 struct btrfs_key *key) 297 { 298 int ret; 299 u32 item_size; 300 u64 saved_i_size = 0; 301 int save_old_i_size = 0; 302 unsigned long src_ptr; 303 unsigned long dst_ptr; 304 int overwrite_root = 0; 305 306 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 307 overwrite_root = 1; 308 309 item_size = btrfs_item_size_nr(eb, slot); 310 src_ptr = btrfs_item_ptr_offset(eb, slot); 311 312 /* look for the key in the destination tree */ 313 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 314 if (ret == 0) { 315 char *src_copy; 316 char *dst_copy; 317 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 318 path->slots[0]); 319 if (dst_size != item_size) 320 goto insert; 321 322 if (item_size == 0) { 323 btrfs_release_path(root, path); 324 return 0; 325 } 326 dst_copy = kmalloc(item_size, GFP_NOFS); 327 src_copy = kmalloc(item_size, GFP_NOFS); 328 329 read_extent_buffer(eb, src_copy, src_ptr, item_size); 330 331 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 332 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 333 item_size); 334 ret = memcmp(dst_copy, src_copy, item_size); 335 336 kfree(dst_copy); 337 kfree(src_copy); 338 /* 339 * they have the same contents, just return, this saves 340 * us from cowing blocks in the destination tree and doing 341 * extra writes that may not have been done by a previous 342 * sync 343 */ 344 if (ret == 0) { 345 btrfs_release_path(root, path); 346 return 0; 347 } 348 349 } 350 insert: 351 btrfs_release_path(root, path); 352 /* try to insert the key into the destination tree */ 353 ret = btrfs_insert_empty_item(trans, root, path, 354 key, item_size); 355 356 /* make sure any existing item is the correct size */ 357 if (ret == -EEXIST) { 358 u32 found_size; 359 found_size = btrfs_item_size_nr(path->nodes[0], 360 path->slots[0]); 361 if (found_size > item_size) { 362 btrfs_truncate_item(trans, root, path, item_size, 1); 363 } else if (found_size < item_size) { 364 ret = btrfs_extend_item(trans, root, path, 365 item_size - found_size); 366 BUG_ON(ret); 367 } 368 } else if (ret) { 369 BUG(); 370 } 371 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 372 path->slots[0]); 373 374 /* don't overwrite an existing inode if the generation number 375 * was logged as zero. This is done when the tree logging code 376 * is just logging an inode to make sure it exists after recovery. 377 * 378 * Also, don't overwrite i_size on directories during replay. 379 * log replay inserts and removes directory items based on the 380 * state of the tree found in the subvolume, and i_size is modified 381 * as it goes 382 */ 383 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 384 struct btrfs_inode_item *src_item; 385 struct btrfs_inode_item *dst_item; 386 387 src_item = (struct btrfs_inode_item *)src_ptr; 388 dst_item = (struct btrfs_inode_item *)dst_ptr; 389 390 if (btrfs_inode_generation(eb, src_item) == 0) 391 goto no_copy; 392 393 if (overwrite_root && 394 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 395 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 396 save_old_i_size = 1; 397 saved_i_size = btrfs_inode_size(path->nodes[0], 398 dst_item); 399 } 400 } 401 402 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 403 src_ptr, item_size); 404 405 if (save_old_i_size) { 406 struct btrfs_inode_item *dst_item; 407 dst_item = (struct btrfs_inode_item *)dst_ptr; 408 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 409 } 410 411 /* make sure the generation is filled in */ 412 if (key->type == BTRFS_INODE_ITEM_KEY) { 413 struct btrfs_inode_item *dst_item; 414 dst_item = (struct btrfs_inode_item *)dst_ptr; 415 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 416 btrfs_set_inode_generation(path->nodes[0], dst_item, 417 trans->transid); 418 } 419 } 420 no_copy: 421 btrfs_mark_buffer_dirty(path->nodes[0]); 422 btrfs_release_path(root, path); 423 return 0; 424 } 425 426 /* 427 * simple helper to read an inode off the disk from a given root 428 * This can only be called for subvolume roots and not for the log 429 */ 430 static noinline struct inode *read_one_inode(struct btrfs_root *root, 431 u64 objectid) 432 { 433 struct inode *inode; 434 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); 435 if (inode->i_state & I_NEW) { 436 BTRFS_I(inode)->root = root; 437 BTRFS_I(inode)->location.objectid = objectid; 438 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; 439 BTRFS_I(inode)->location.offset = 0; 440 btrfs_read_locked_inode(inode); 441 unlock_new_inode(inode); 442 443 } 444 if (is_bad_inode(inode)) { 445 iput(inode); 446 inode = NULL; 447 } 448 return inode; 449 } 450 451 /* replays a single extent in 'eb' at 'slot' with 'key' into the 452 * subvolume 'root'. path is released on entry and should be released 453 * on exit. 454 * 455 * extents in the log tree have not been allocated out of the extent 456 * tree yet. So, this completes the allocation, taking a reference 457 * as required if the extent already exists or creating a new extent 458 * if it isn't in the extent allocation tree yet. 459 * 460 * The extent is inserted into the file, dropping any existing extents 461 * from the file that overlap the new one. 462 */ 463 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 464 struct btrfs_root *root, 465 struct btrfs_path *path, 466 struct extent_buffer *eb, int slot, 467 struct btrfs_key *key) 468 { 469 int found_type; 470 u64 mask = root->sectorsize - 1; 471 u64 extent_end; 472 u64 alloc_hint; 473 u64 start = key->offset; 474 u64 saved_nbytes; 475 struct btrfs_file_extent_item *item; 476 struct inode *inode = NULL; 477 unsigned long size; 478 int ret = 0; 479 480 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 481 found_type = btrfs_file_extent_type(eb, item); 482 483 if (found_type == BTRFS_FILE_EXTENT_REG || 484 found_type == BTRFS_FILE_EXTENT_PREALLOC) 485 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 486 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 487 size = btrfs_file_extent_inline_len(eb, item); 488 extent_end = (start + size + mask) & ~mask; 489 } else { 490 ret = 0; 491 goto out; 492 } 493 494 inode = read_one_inode(root, key->objectid); 495 if (!inode) { 496 ret = -EIO; 497 goto out; 498 } 499 500 /* 501 * first check to see if we already have this extent in the 502 * file. This must be done before the btrfs_drop_extents run 503 * so we don't try to drop this extent. 504 */ 505 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 506 start, 0); 507 508 if (ret == 0 && 509 (found_type == BTRFS_FILE_EXTENT_REG || 510 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 511 struct btrfs_file_extent_item cmp1; 512 struct btrfs_file_extent_item cmp2; 513 struct btrfs_file_extent_item *existing; 514 struct extent_buffer *leaf; 515 516 leaf = path->nodes[0]; 517 existing = btrfs_item_ptr(leaf, path->slots[0], 518 struct btrfs_file_extent_item); 519 520 read_extent_buffer(eb, &cmp1, (unsigned long)item, 521 sizeof(cmp1)); 522 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 523 sizeof(cmp2)); 524 525 /* 526 * we already have a pointer to this exact extent, 527 * we don't have to do anything 528 */ 529 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 530 btrfs_release_path(root, path); 531 goto out; 532 } 533 } 534 btrfs_release_path(root, path); 535 536 saved_nbytes = inode_get_bytes(inode); 537 /* drop any overlapping extents */ 538 ret = btrfs_drop_extents(trans, root, inode, 539 start, extent_end, extent_end, start, &alloc_hint); 540 BUG_ON(ret); 541 542 if (found_type == BTRFS_FILE_EXTENT_REG || 543 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 544 unsigned long dest_offset; 545 struct btrfs_key ins; 546 547 ret = btrfs_insert_empty_item(trans, root, path, key, 548 sizeof(*item)); 549 BUG_ON(ret); 550 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 551 path->slots[0]); 552 copy_extent_buffer(path->nodes[0], eb, dest_offset, 553 (unsigned long)item, sizeof(*item)); 554 555 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 556 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 557 ins.type = BTRFS_EXTENT_ITEM_KEY; 558 559 if (ins.objectid > 0) { 560 u64 csum_start; 561 u64 csum_end; 562 LIST_HEAD(ordered_sums); 563 /* 564 * is this extent already allocated in the extent 565 * allocation tree? If so, just add a reference 566 */ 567 ret = btrfs_lookup_extent(root, ins.objectid, 568 ins.offset); 569 if (ret == 0) { 570 ret = btrfs_inc_extent_ref(trans, root, 571 ins.objectid, ins.offset, 572 path->nodes[0]->start, 573 root->root_key.objectid, 574 trans->transid, key->objectid); 575 } else { 576 /* 577 * insert the extent pointer in the extent 578 * allocation tree 579 */ 580 ret = btrfs_alloc_logged_extent(trans, root, 581 path->nodes[0]->start, 582 root->root_key.objectid, 583 trans->transid, key->objectid, 584 &ins); 585 BUG_ON(ret); 586 } 587 btrfs_release_path(root, path); 588 589 if (btrfs_file_extent_compression(eb, item)) { 590 csum_start = ins.objectid; 591 csum_end = csum_start + ins.offset; 592 } else { 593 csum_start = ins.objectid + 594 btrfs_file_extent_offset(eb, item); 595 csum_end = csum_start + 596 btrfs_file_extent_num_bytes(eb, item); 597 } 598 599 ret = btrfs_lookup_csums_range(root->log_root, 600 csum_start, csum_end - 1, 601 &ordered_sums); 602 BUG_ON(ret); 603 while (!list_empty(&ordered_sums)) { 604 struct btrfs_ordered_sum *sums; 605 sums = list_entry(ordered_sums.next, 606 struct btrfs_ordered_sum, 607 list); 608 ret = btrfs_csum_file_blocks(trans, 609 root->fs_info->csum_root, 610 sums); 611 BUG_ON(ret); 612 list_del(&sums->list); 613 kfree(sums); 614 } 615 } else { 616 btrfs_release_path(root, path); 617 } 618 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 619 /* inline extents are easy, we just overwrite them */ 620 ret = overwrite_item(trans, root, path, eb, slot, key); 621 BUG_ON(ret); 622 } 623 624 inode_set_bytes(inode, saved_nbytes); 625 btrfs_update_inode(trans, root, inode); 626 out: 627 if (inode) 628 iput(inode); 629 return ret; 630 } 631 632 /* 633 * when cleaning up conflicts between the directory names in the 634 * subvolume, directory names in the log and directory names in the 635 * inode back references, we may have to unlink inodes from directories. 636 * 637 * This is a helper function to do the unlink of a specific directory 638 * item 639 */ 640 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 641 struct btrfs_root *root, 642 struct btrfs_path *path, 643 struct inode *dir, 644 struct btrfs_dir_item *di) 645 { 646 struct inode *inode; 647 char *name; 648 int name_len; 649 struct extent_buffer *leaf; 650 struct btrfs_key location; 651 int ret; 652 653 leaf = path->nodes[0]; 654 655 btrfs_dir_item_key_to_cpu(leaf, di, &location); 656 name_len = btrfs_dir_name_len(leaf, di); 657 name = kmalloc(name_len, GFP_NOFS); 658 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 659 btrfs_release_path(root, path); 660 661 inode = read_one_inode(root, location.objectid); 662 BUG_ON(!inode); 663 664 ret = link_to_fixup_dir(trans, root, path, location.objectid); 665 BUG_ON(ret); 666 667 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 668 BUG_ON(ret); 669 kfree(name); 670 671 iput(inode); 672 return ret; 673 } 674 675 /* 676 * helper function to see if a given name and sequence number found 677 * in an inode back reference are already in a directory and correctly 678 * point to this inode 679 */ 680 static noinline int inode_in_dir(struct btrfs_root *root, 681 struct btrfs_path *path, 682 u64 dirid, u64 objectid, u64 index, 683 const char *name, int name_len) 684 { 685 struct btrfs_dir_item *di; 686 struct btrfs_key location; 687 int match = 0; 688 689 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 690 index, name, name_len, 0); 691 if (di && !IS_ERR(di)) { 692 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 693 if (location.objectid != objectid) 694 goto out; 695 } else 696 goto out; 697 btrfs_release_path(root, path); 698 699 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 700 if (di && !IS_ERR(di)) { 701 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 702 if (location.objectid != objectid) 703 goto out; 704 } else 705 goto out; 706 match = 1; 707 out: 708 btrfs_release_path(root, path); 709 return match; 710 } 711 712 /* 713 * helper function to check a log tree for a named back reference in 714 * an inode. This is used to decide if a back reference that is 715 * found in the subvolume conflicts with what we find in the log. 716 * 717 * inode backreferences may have multiple refs in a single item, 718 * during replay we process one reference at a time, and we don't 719 * want to delete valid links to a file from the subvolume if that 720 * link is also in the log. 721 */ 722 static noinline int backref_in_log(struct btrfs_root *log, 723 struct btrfs_key *key, 724 char *name, int namelen) 725 { 726 struct btrfs_path *path; 727 struct btrfs_inode_ref *ref; 728 unsigned long ptr; 729 unsigned long ptr_end; 730 unsigned long name_ptr; 731 int found_name_len; 732 int item_size; 733 int ret; 734 int match = 0; 735 736 path = btrfs_alloc_path(); 737 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 738 if (ret != 0) 739 goto out; 740 741 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 742 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 743 ptr_end = ptr + item_size; 744 while (ptr < ptr_end) { 745 ref = (struct btrfs_inode_ref *)ptr; 746 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 747 if (found_name_len == namelen) { 748 name_ptr = (unsigned long)(ref + 1); 749 ret = memcmp_extent_buffer(path->nodes[0], name, 750 name_ptr, namelen); 751 if (ret == 0) { 752 match = 1; 753 goto out; 754 } 755 } 756 ptr = (unsigned long)(ref + 1) + found_name_len; 757 } 758 out: 759 btrfs_free_path(path); 760 return match; 761 } 762 763 764 /* 765 * replay one inode back reference item found in the log tree. 766 * eb, slot and key refer to the buffer and key found in the log tree. 767 * root is the destination we are replaying into, and path is for temp 768 * use by this function. (it should be released on return). 769 */ 770 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 771 struct btrfs_root *root, 772 struct btrfs_root *log, 773 struct btrfs_path *path, 774 struct extent_buffer *eb, int slot, 775 struct btrfs_key *key) 776 { 777 struct inode *dir; 778 int ret; 779 struct btrfs_key location; 780 struct btrfs_inode_ref *ref; 781 struct btrfs_dir_item *di; 782 struct inode *inode; 783 char *name; 784 int namelen; 785 unsigned long ref_ptr; 786 unsigned long ref_end; 787 788 location.objectid = key->objectid; 789 location.type = BTRFS_INODE_ITEM_KEY; 790 location.offset = 0; 791 792 /* 793 * it is possible that we didn't log all the parent directories 794 * for a given inode. If we don't find the dir, just don't 795 * copy the back ref in. The link count fixup code will take 796 * care of the rest 797 */ 798 dir = read_one_inode(root, key->offset); 799 if (!dir) 800 return -ENOENT; 801 802 inode = read_one_inode(root, key->objectid); 803 BUG_ON(!dir); 804 805 ref_ptr = btrfs_item_ptr_offset(eb, slot); 806 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 807 808 again: 809 ref = (struct btrfs_inode_ref *)ref_ptr; 810 811 namelen = btrfs_inode_ref_name_len(eb, ref); 812 name = kmalloc(namelen, GFP_NOFS); 813 BUG_ON(!name); 814 815 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 816 817 /* if we already have a perfect match, we're done */ 818 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 819 btrfs_inode_ref_index(eb, ref), 820 name, namelen)) { 821 goto out; 822 } 823 824 /* 825 * look for a conflicting back reference in the metadata. 826 * if we find one we have to unlink that name of the file 827 * before we add our new link. Later on, we overwrite any 828 * existing back reference, and we don't want to create 829 * dangling pointers in the directory. 830 */ 831 conflict_again: 832 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 833 if (ret == 0) { 834 char *victim_name; 835 int victim_name_len; 836 struct btrfs_inode_ref *victim_ref; 837 unsigned long ptr; 838 unsigned long ptr_end; 839 struct extent_buffer *leaf = path->nodes[0]; 840 841 /* are we trying to overwrite a back ref for the root directory 842 * if so, just jump out, we're done 843 */ 844 if (key->objectid == key->offset) 845 goto out_nowrite; 846 847 /* check all the names in this back reference to see 848 * if they are in the log. if so, we allow them to stay 849 * otherwise they must be unlinked as a conflict 850 */ 851 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 852 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 853 while (ptr < ptr_end) { 854 victim_ref = (struct btrfs_inode_ref *)ptr; 855 victim_name_len = btrfs_inode_ref_name_len(leaf, 856 victim_ref); 857 victim_name = kmalloc(victim_name_len, GFP_NOFS); 858 BUG_ON(!victim_name); 859 860 read_extent_buffer(leaf, victim_name, 861 (unsigned long)(victim_ref + 1), 862 victim_name_len); 863 864 if (!backref_in_log(log, key, victim_name, 865 victim_name_len)) { 866 btrfs_inc_nlink(inode); 867 btrfs_release_path(root, path); 868 869 ret = btrfs_unlink_inode(trans, root, dir, 870 inode, victim_name, 871 victim_name_len); 872 kfree(victim_name); 873 btrfs_release_path(root, path); 874 goto conflict_again; 875 } 876 kfree(victim_name); 877 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 878 } 879 BUG_ON(ret); 880 } 881 btrfs_release_path(root, path); 882 883 /* look for a conflicting sequence number */ 884 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 885 btrfs_inode_ref_index(eb, ref), 886 name, namelen, 0); 887 if (di && !IS_ERR(di)) { 888 ret = drop_one_dir_item(trans, root, path, dir, di); 889 BUG_ON(ret); 890 } 891 btrfs_release_path(root, path); 892 893 894 /* look for a conflicting name */ 895 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 896 name, namelen, 0); 897 if (di && !IS_ERR(di)) { 898 ret = drop_one_dir_item(trans, root, path, dir, di); 899 BUG_ON(ret); 900 } 901 btrfs_release_path(root, path); 902 903 /* insert our name */ 904 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 905 btrfs_inode_ref_index(eb, ref)); 906 BUG_ON(ret); 907 908 btrfs_update_inode(trans, root, inode); 909 910 out: 911 ref_ptr = (unsigned long)(ref + 1) + namelen; 912 kfree(name); 913 if (ref_ptr < ref_end) 914 goto again; 915 916 /* finally write the back reference in the inode */ 917 ret = overwrite_item(trans, root, path, eb, slot, key); 918 BUG_ON(ret); 919 920 out_nowrite: 921 btrfs_release_path(root, path); 922 iput(dir); 923 iput(inode); 924 return 0; 925 } 926 927 /* 928 * There are a few corners where the link count of the file can't 929 * be properly maintained during replay. So, instead of adding 930 * lots of complexity to the log code, we just scan the backrefs 931 * for any file that has been through replay. 932 * 933 * The scan will update the link count on the inode to reflect the 934 * number of back refs found. If it goes down to zero, the iput 935 * will free the inode. 936 */ 937 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 938 struct btrfs_root *root, 939 struct inode *inode) 940 { 941 struct btrfs_path *path; 942 int ret; 943 struct btrfs_key key; 944 u64 nlink = 0; 945 unsigned long ptr; 946 unsigned long ptr_end; 947 int name_len; 948 949 key.objectid = inode->i_ino; 950 key.type = BTRFS_INODE_REF_KEY; 951 key.offset = (u64)-1; 952 953 path = btrfs_alloc_path(); 954 955 while (1) { 956 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 957 if (ret < 0) 958 break; 959 if (ret > 0) { 960 if (path->slots[0] == 0) 961 break; 962 path->slots[0]--; 963 } 964 btrfs_item_key_to_cpu(path->nodes[0], &key, 965 path->slots[0]); 966 if (key.objectid != inode->i_ino || 967 key.type != BTRFS_INODE_REF_KEY) 968 break; 969 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 970 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 971 path->slots[0]); 972 while (ptr < ptr_end) { 973 struct btrfs_inode_ref *ref; 974 975 ref = (struct btrfs_inode_ref *)ptr; 976 name_len = btrfs_inode_ref_name_len(path->nodes[0], 977 ref); 978 ptr = (unsigned long)(ref + 1) + name_len; 979 nlink++; 980 } 981 982 if (key.offset == 0) 983 break; 984 key.offset--; 985 btrfs_release_path(root, path); 986 } 987 btrfs_release_path(root, path); 988 if (nlink != inode->i_nlink) { 989 inode->i_nlink = nlink; 990 btrfs_update_inode(trans, root, inode); 991 } 992 BTRFS_I(inode)->index_cnt = (u64)-1; 993 994 if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { 995 ret = replay_dir_deletes(trans, root, NULL, path, 996 inode->i_ino, 1); 997 BUG_ON(ret); 998 } 999 btrfs_free_path(path); 1000 1001 return 0; 1002 } 1003 1004 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1005 struct btrfs_root *root, 1006 struct btrfs_path *path) 1007 { 1008 int ret; 1009 struct btrfs_key key; 1010 struct inode *inode; 1011 1012 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1013 key.type = BTRFS_ORPHAN_ITEM_KEY; 1014 key.offset = (u64)-1; 1015 while (1) { 1016 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1017 if (ret < 0) 1018 break; 1019 1020 if (ret == 1) { 1021 if (path->slots[0] == 0) 1022 break; 1023 path->slots[0]--; 1024 } 1025 1026 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1027 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1028 key.type != BTRFS_ORPHAN_ITEM_KEY) 1029 break; 1030 1031 ret = btrfs_del_item(trans, root, path); 1032 BUG_ON(ret); 1033 1034 btrfs_release_path(root, path); 1035 inode = read_one_inode(root, key.offset); 1036 BUG_ON(!inode); 1037 1038 ret = fixup_inode_link_count(trans, root, inode); 1039 BUG_ON(ret); 1040 1041 iput(inode); 1042 1043 /* 1044 * fixup on a directory may create new entries, 1045 * make sure we always look for the highset possible 1046 * offset 1047 */ 1048 key.offset = (u64)-1; 1049 } 1050 btrfs_release_path(root, path); 1051 return 0; 1052 } 1053 1054 1055 /* 1056 * record a given inode in the fixup dir so we can check its link 1057 * count when replay is done. The link count is incremented here 1058 * so the inode won't go away until we check it 1059 */ 1060 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1061 struct btrfs_root *root, 1062 struct btrfs_path *path, 1063 u64 objectid) 1064 { 1065 struct btrfs_key key; 1066 int ret = 0; 1067 struct inode *inode; 1068 1069 inode = read_one_inode(root, objectid); 1070 BUG_ON(!inode); 1071 1072 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1073 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1074 key.offset = objectid; 1075 1076 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1077 1078 btrfs_release_path(root, path); 1079 if (ret == 0) { 1080 btrfs_inc_nlink(inode); 1081 btrfs_update_inode(trans, root, inode); 1082 } else if (ret == -EEXIST) { 1083 ret = 0; 1084 } else { 1085 BUG(); 1086 } 1087 iput(inode); 1088 1089 return ret; 1090 } 1091 1092 /* 1093 * when replaying the log for a directory, we only insert names 1094 * for inodes that actually exist. This means an fsync on a directory 1095 * does not implicitly fsync all the new files in it 1096 */ 1097 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1098 struct btrfs_root *root, 1099 struct btrfs_path *path, 1100 u64 dirid, u64 index, 1101 char *name, int name_len, u8 type, 1102 struct btrfs_key *location) 1103 { 1104 struct inode *inode; 1105 struct inode *dir; 1106 int ret; 1107 1108 inode = read_one_inode(root, location->objectid); 1109 if (!inode) 1110 return -ENOENT; 1111 1112 dir = read_one_inode(root, dirid); 1113 if (!dir) { 1114 iput(inode); 1115 return -EIO; 1116 } 1117 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1118 1119 /* FIXME, put inode into FIXUP list */ 1120 1121 iput(inode); 1122 iput(dir); 1123 return ret; 1124 } 1125 1126 /* 1127 * take a single entry in a log directory item and replay it into 1128 * the subvolume. 1129 * 1130 * if a conflicting item exists in the subdirectory already, 1131 * the inode it points to is unlinked and put into the link count 1132 * fix up tree. 1133 * 1134 * If a name from the log points to a file or directory that does 1135 * not exist in the FS, it is skipped. fsyncs on directories 1136 * do not force down inodes inside that directory, just changes to the 1137 * names or unlinks in a directory. 1138 */ 1139 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1140 struct btrfs_root *root, 1141 struct btrfs_path *path, 1142 struct extent_buffer *eb, 1143 struct btrfs_dir_item *di, 1144 struct btrfs_key *key) 1145 { 1146 char *name; 1147 int name_len; 1148 struct btrfs_dir_item *dst_di; 1149 struct btrfs_key found_key; 1150 struct btrfs_key log_key; 1151 struct inode *dir; 1152 u8 log_type; 1153 int exists; 1154 int ret; 1155 1156 dir = read_one_inode(root, key->objectid); 1157 BUG_ON(!dir); 1158 1159 name_len = btrfs_dir_name_len(eb, di); 1160 name = kmalloc(name_len, GFP_NOFS); 1161 log_type = btrfs_dir_type(eb, di); 1162 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1163 name_len); 1164 1165 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1166 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1167 if (exists == 0) 1168 exists = 1; 1169 else 1170 exists = 0; 1171 btrfs_release_path(root, path); 1172 1173 if (key->type == BTRFS_DIR_ITEM_KEY) { 1174 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1175 name, name_len, 1); 1176 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1177 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1178 key->objectid, 1179 key->offset, name, 1180 name_len, 1); 1181 } else { 1182 BUG(); 1183 } 1184 if (!dst_di || IS_ERR(dst_di)) { 1185 /* we need a sequence number to insert, so we only 1186 * do inserts for the BTRFS_DIR_INDEX_KEY types 1187 */ 1188 if (key->type != BTRFS_DIR_INDEX_KEY) 1189 goto out; 1190 goto insert; 1191 } 1192 1193 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1194 /* the existing item matches the logged item */ 1195 if (found_key.objectid == log_key.objectid && 1196 found_key.type == log_key.type && 1197 found_key.offset == log_key.offset && 1198 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1199 goto out; 1200 } 1201 1202 /* 1203 * don't drop the conflicting directory entry if the inode 1204 * for the new entry doesn't exist 1205 */ 1206 if (!exists) 1207 goto out; 1208 1209 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1210 BUG_ON(ret); 1211 1212 if (key->type == BTRFS_DIR_INDEX_KEY) 1213 goto insert; 1214 out: 1215 btrfs_release_path(root, path); 1216 kfree(name); 1217 iput(dir); 1218 return 0; 1219 1220 insert: 1221 btrfs_release_path(root, path); 1222 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1223 name, name_len, log_type, &log_key); 1224 1225 BUG_ON(ret && ret != -ENOENT); 1226 goto out; 1227 } 1228 1229 /* 1230 * find all the names in a directory item and reconcile them into 1231 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1232 * one name in a directory item, but the same code gets used for 1233 * both directory index types 1234 */ 1235 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1236 struct btrfs_root *root, 1237 struct btrfs_path *path, 1238 struct extent_buffer *eb, int slot, 1239 struct btrfs_key *key) 1240 { 1241 int ret; 1242 u32 item_size = btrfs_item_size_nr(eb, slot); 1243 struct btrfs_dir_item *di; 1244 int name_len; 1245 unsigned long ptr; 1246 unsigned long ptr_end; 1247 1248 ptr = btrfs_item_ptr_offset(eb, slot); 1249 ptr_end = ptr + item_size; 1250 while (ptr < ptr_end) { 1251 di = (struct btrfs_dir_item *)ptr; 1252 name_len = btrfs_dir_name_len(eb, di); 1253 ret = replay_one_name(trans, root, path, eb, di, key); 1254 BUG_ON(ret); 1255 ptr = (unsigned long)(di + 1); 1256 ptr += name_len; 1257 } 1258 return 0; 1259 } 1260 1261 /* 1262 * directory replay has two parts. There are the standard directory 1263 * items in the log copied from the subvolume, and range items 1264 * created in the log while the subvolume was logged. 1265 * 1266 * The range items tell us which parts of the key space the log 1267 * is authoritative for. During replay, if a key in the subvolume 1268 * directory is in a logged range item, but not actually in the log 1269 * that means it was deleted from the directory before the fsync 1270 * and should be removed. 1271 */ 1272 static noinline int find_dir_range(struct btrfs_root *root, 1273 struct btrfs_path *path, 1274 u64 dirid, int key_type, 1275 u64 *start_ret, u64 *end_ret) 1276 { 1277 struct btrfs_key key; 1278 u64 found_end; 1279 struct btrfs_dir_log_item *item; 1280 int ret; 1281 int nritems; 1282 1283 if (*start_ret == (u64)-1) 1284 return 1; 1285 1286 key.objectid = dirid; 1287 key.type = key_type; 1288 key.offset = *start_ret; 1289 1290 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1291 if (ret < 0) 1292 goto out; 1293 if (ret > 0) { 1294 if (path->slots[0] == 0) 1295 goto out; 1296 path->slots[0]--; 1297 } 1298 if (ret != 0) 1299 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1300 1301 if (key.type != key_type || key.objectid != dirid) { 1302 ret = 1; 1303 goto next; 1304 } 1305 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1306 struct btrfs_dir_log_item); 1307 found_end = btrfs_dir_log_end(path->nodes[0], item); 1308 1309 if (*start_ret >= key.offset && *start_ret <= found_end) { 1310 ret = 0; 1311 *start_ret = key.offset; 1312 *end_ret = found_end; 1313 goto out; 1314 } 1315 ret = 1; 1316 next: 1317 /* check the next slot in the tree to see if it is a valid item */ 1318 nritems = btrfs_header_nritems(path->nodes[0]); 1319 if (path->slots[0] >= nritems) { 1320 ret = btrfs_next_leaf(root, path); 1321 if (ret) 1322 goto out; 1323 } else { 1324 path->slots[0]++; 1325 } 1326 1327 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1328 1329 if (key.type != key_type || key.objectid != dirid) { 1330 ret = 1; 1331 goto out; 1332 } 1333 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1334 struct btrfs_dir_log_item); 1335 found_end = btrfs_dir_log_end(path->nodes[0], item); 1336 *start_ret = key.offset; 1337 *end_ret = found_end; 1338 ret = 0; 1339 out: 1340 btrfs_release_path(root, path); 1341 return ret; 1342 } 1343 1344 /* 1345 * this looks for a given directory item in the log. If the directory 1346 * item is not in the log, the item is removed and the inode it points 1347 * to is unlinked 1348 */ 1349 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1350 struct btrfs_root *root, 1351 struct btrfs_root *log, 1352 struct btrfs_path *path, 1353 struct btrfs_path *log_path, 1354 struct inode *dir, 1355 struct btrfs_key *dir_key) 1356 { 1357 int ret; 1358 struct extent_buffer *eb; 1359 int slot; 1360 u32 item_size; 1361 struct btrfs_dir_item *di; 1362 struct btrfs_dir_item *log_di; 1363 int name_len; 1364 unsigned long ptr; 1365 unsigned long ptr_end; 1366 char *name; 1367 struct inode *inode; 1368 struct btrfs_key location; 1369 1370 again: 1371 eb = path->nodes[0]; 1372 slot = path->slots[0]; 1373 item_size = btrfs_item_size_nr(eb, slot); 1374 ptr = btrfs_item_ptr_offset(eb, slot); 1375 ptr_end = ptr + item_size; 1376 while (ptr < ptr_end) { 1377 di = (struct btrfs_dir_item *)ptr; 1378 name_len = btrfs_dir_name_len(eb, di); 1379 name = kmalloc(name_len, GFP_NOFS); 1380 if (!name) { 1381 ret = -ENOMEM; 1382 goto out; 1383 } 1384 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1385 name_len); 1386 log_di = NULL; 1387 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1388 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1389 dir_key->objectid, 1390 name, name_len, 0); 1391 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1392 log_di = btrfs_lookup_dir_index_item(trans, log, 1393 log_path, 1394 dir_key->objectid, 1395 dir_key->offset, 1396 name, name_len, 0); 1397 } 1398 if (!log_di || IS_ERR(log_di)) { 1399 btrfs_dir_item_key_to_cpu(eb, di, &location); 1400 btrfs_release_path(root, path); 1401 btrfs_release_path(log, log_path); 1402 inode = read_one_inode(root, location.objectid); 1403 BUG_ON(!inode); 1404 1405 ret = link_to_fixup_dir(trans, root, 1406 path, location.objectid); 1407 BUG_ON(ret); 1408 btrfs_inc_nlink(inode); 1409 ret = btrfs_unlink_inode(trans, root, dir, inode, 1410 name, name_len); 1411 BUG_ON(ret); 1412 kfree(name); 1413 iput(inode); 1414 1415 /* there might still be more names under this key 1416 * check and repeat if required 1417 */ 1418 ret = btrfs_search_slot(NULL, root, dir_key, path, 1419 0, 0); 1420 if (ret == 0) 1421 goto again; 1422 ret = 0; 1423 goto out; 1424 } 1425 btrfs_release_path(log, log_path); 1426 kfree(name); 1427 1428 ptr = (unsigned long)(di + 1); 1429 ptr += name_len; 1430 } 1431 ret = 0; 1432 out: 1433 btrfs_release_path(root, path); 1434 btrfs_release_path(log, log_path); 1435 return ret; 1436 } 1437 1438 /* 1439 * deletion replay happens before we copy any new directory items 1440 * out of the log or out of backreferences from inodes. It 1441 * scans the log to find ranges of keys that log is authoritative for, 1442 * and then scans the directory to find items in those ranges that are 1443 * not present in the log. 1444 * 1445 * Anything we don't find in the log is unlinked and removed from the 1446 * directory. 1447 */ 1448 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1449 struct btrfs_root *root, 1450 struct btrfs_root *log, 1451 struct btrfs_path *path, 1452 u64 dirid, int del_all) 1453 { 1454 u64 range_start; 1455 u64 range_end; 1456 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1457 int ret = 0; 1458 struct btrfs_key dir_key; 1459 struct btrfs_key found_key; 1460 struct btrfs_path *log_path; 1461 struct inode *dir; 1462 1463 dir_key.objectid = dirid; 1464 dir_key.type = BTRFS_DIR_ITEM_KEY; 1465 log_path = btrfs_alloc_path(); 1466 if (!log_path) 1467 return -ENOMEM; 1468 1469 dir = read_one_inode(root, dirid); 1470 /* it isn't an error if the inode isn't there, that can happen 1471 * because we replay the deletes before we copy in the inode item 1472 * from the log 1473 */ 1474 if (!dir) { 1475 btrfs_free_path(log_path); 1476 return 0; 1477 } 1478 again: 1479 range_start = 0; 1480 range_end = 0; 1481 while (1) { 1482 if (del_all) 1483 range_end = (u64)-1; 1484 else { 1485 ret = find_dir_range(log, path, dirid, key_type, 1486 &range_start, &range_end); 1487 if (ret != 0) 1488 break; 1489 } 1490 1491 dir_key.offset = range_start; 1492 while (1) { 1493 int nritems; 1494 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1495 0, 0); 1496 if (ret < 0) 1497 goto out; 1498 1499 nritems = btrfs_header_nritems(path->nodes[0]); 1500 if (path->slots[0] >= nritems) { 1501 ret = btrfs_next_leaf(root, path); 1502 if (ret) 1503 break; 1504 } 1505 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1506 path->slots[0]); 1507 if (found_key.objectid != dirid || 1508 found_key.type != dir_key.type) 1509 goto next_type; 1510 1511 if (found_key.offset > range_end) 1512 break; 1513 1514 ret = check_item_in_log(trans, root, log, path, 1515 log_path, dir, 1516 &found_key); 1517 BUG_ON(ret); 1518 if (found_key.offset == (u64)-1) 1519 break; 1520 dir_key.offset = found_key.offset + 1; 1521 } 1522 btrfs_release_path(root, path); 1523 if (range_end == (u64)-1) 1524 break; 1525 range_start = range_end + 1; 1526 } 1527 1528 next_type: 1529 ret = 0; 1530 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1531 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1532 dir_key.type = BTRFS_DIR_INDEX_KEY; 1533 btrfs_release_path(root, path); 1534 goto again; 1535 } 1536 out: 1537 btrfs_release_path(root, path); 1538 btrfs_free_path(log_path); 1539 iput(dir); 1540 return ret; 1541 } 1542 1543 /* 1544 * the process_func used to replay items from the log tree. This 1545 * gets called in two different stages. The first stage just looks 1546 * for inodes and makes sure they are all copied into the subvolume. 1547 * 1548 * The second stage copies all the other item types from the log into 1549 * the subvolume. The two stage approach is slower, but gets rid of 1550 * lots of complexity around inodes referencing other inodes that exist 1551 * only in the log (references come from either directory items or inode 1552 * back refs). 1553 */ 1554 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1555 struct walk_control *wc, u64 gen) 1556 { 1557 int nritems; 1558 struct btrfs_path *path; 1559 struct btrfs_root *root = wc->replay_dest; 1560 struct btrfs_key key; 1561 u32 item_size; 1562 int level; 1563 int i; 1564 int ret; 1565 1566 btrfs_read_buffer(eb, gen); 1567 1568 level = btrfs_header_level(eb); 1569 1570 if (level != 0) 1571 return 0; 1572 1573 path = btrfs_alloc_path(); 1574 BUG_ON(!path); 1575 1576 nritems = btrfs_header_nritems(eb); 1577 for (i = 0; i < nritems; i++) { 1578 btrfs_item_key_to_cpu(eb, &key, i); 1579 item_size = btrfs_item_size_nr(eb, i); 1580 1581 /* inode keys are done during the first stage */ 1582 if (key.type == BTRFS_INODE_ITEM_KEY && 1583 wc->stage == LOG_WALK_REPLAY_INODES) { 1584 struct inode *inode; 1585 struct btrfs_inode_item *inode_item; 1586 u32 mode; 1587 1588 inode_item = btrfs_item_ptr(eb, i, 1589 struct btrfs_inode_item); 1590 mode = btrfs_inode_mode(eb, inode_item); 1591 if (S_ISDIR(mode)) { 1592 ret = replay_dir_deletes(wc->trans, 1593 root, log, path, key.objectid, 0); 1594 BUG_ON(ret); 1595 } 1596 ret = overwrite_item(wc->trans, root, path, 1597 eb, i, &key); 1598 BUG_ON(ret); 1599 1600 /* for regular files, truncate away 1601 * extents past the new EOF 1602 */ 1603 if (S_ISREG(mode)) { 1604 inode = read_one_inode(root, 1605 key.objectid); 1606 BUG_ON(!inode); 1607 1608 ret = btrfs_truncate_inode_items(wc->trans, 1609 root, inode, inode->i_size, 1610 BTRFS_EXTENT_DATA_KEY); 1611 BUG_ON(ret); 1612 1613 /* if the nlink count is zero here, the iput 1614 * will free the inode. We bump it to make 1615 * sure it doesn't get freed until the link 1616 * count fixup is done 1617 */ 1618 if (inode->i_nlink == 0) { 1619 btrfs_inc_nlink(inode); 1620 btrfs_update_inode(wc->trans, 1621 root, inode); 1622 } 1623 iput(inode); 1624 } 1625 ret = link_to_fixup_dir(wc->trans, root, 1626 path, key.objectid); 1627 BUG_ON(ret); 1628 } 1629 if (wc->stage < LOG_WALK_REPLAY_ALL) 1630 continue; 1631 1632 /* these keys are simply copied */ 1633 if (key.type == BTRFS_XATTR_ITEM_KEY) { 1634 ret = overwrite_item(wc->trans, root, path, 1635 eb, i, &key); 1636 BUG_ON(ret); 1637 } else if (key.type == BTRFS_INODE_REF_KEY) { 1638 ret = add_inode_ref(wc->trans, root, log, path, 1639 eb, i, &key); 1640 BUG_ON(ret && ret != -ENOENT); 1641 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1642 ret = replay_one_extent(wc->trans, root, path, 1643 eb, i, &key); 1644 BUG_ON(ret); 1645 } else if (key.type == BTRFS_DIR_ITEM_KEY || 1646 key.type == BTRFS_DIR_INDEX_KEY) { 1647 ret = replay_one_dir_item(wc->trans, root, path, 1648 eb, i, &key); 1649 BUG_ON(ret); 1650 } 1651 } 1652 btrfs_free_path(path); 1653 return 0; 1654 } 1655 1656 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1657 struct btrfs_root *root, 1658 struct btrfs_path *path, int *level, 1659 struct walk_control *wc) 1660 { 1661 u64 root_owner; 1662 u64 root_gen; 1663 u64 bytenr; 1664 u64 ptr_gen; 1665 struct extent_buffer *next; 1666 struct extent_buffer *cur; 1667 struct extent_buffer *parent; 1668 u32 blocksize; 1669 int ret = 0; 1670 1671 WARN_ON(*level < 0); 1672 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1673 1674 while (*level > 0) { 1675 WARN_ON(*level < 0); 1676 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1677 cur = path->nodes[*level]; 1678 1679 if (btrfs_header_level(cur) != *level) 1680 WARN_ON(1); 1681 1682 if (path->slots[*level] >= 1683 btrfs_header_nritems(cur)) 1684 break; 1685 1686 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1687 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 1688 blocksize = btrfs_level_size(root, *level - 1); 1689 1690 parent = path->nodes[*level]; 1691 root_owner = btrfs_header_owner(parent); 1692 root_gen = btrfs_header_generation(parent); 1693 1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1695 1696 wc->process_func(root, next, wc, ptr_gen); 1697 1698 if (*level == 1) { 1699 path->slots[*level]++; 1700 if (wc->free) { 1701 btrfs_read_buffer(next, ptr_gen); 1702 1703 btrfs_tree_lock(next); 1704 clean_tree_block(trans, root, next); 1705 btrfs_set_lock_blocking(next); 1706 btrfs_wait_tree_block_writeback(next); 1707 btrfs_tree_unlock(next); 1708 1709 ret = btrfs_drop_leaf_ref(trans, root, next); 1710 BUG_ON(ret); 1711 1712 WARN_ON(root_owner != 1713 BTRFS_TREE_LOG_OBJECTID); 1714 ret = btrfs_free_reserved_extent(root, 1715 bytenr, blocksize); 1716 BUG_ON(ret); 1717 } 1718 free_extent_buffer(next); 1719 continue; 1720 } 1721 btrfs_read_buffer(next, ptr_gen); 1722 1723 WARN_ON(*level <= 0); 1724 if (path->nodes[*level-1]) 1725 free_extent_buffer(path->nodes[*level-1]); 1726 path->nodes[*level-1] = next; 1727 *level = btrfs_header_level(next); 1728 path->slots[*level] = 0; 1729 cond_resched(); 1730 } 1731 WARN_ON(*level < 0); 1732 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1733 1734 if (path->nodes[*level] == root->node) 1735 parent = path->nodes[*level]; 1736 else 1737 parent = path->nodes[*level + 1]; 1738 1739 bytenr = path->nodes[*level]->start; 1740 1741 blocksize = btrfs_level_size(root, *level); 1742 root_owner = btrfs_header_owner(parent); 1743 root_gen = btrfs_header_generation(parent); 1744 1745 wc->process_func(root, path->nodes[*level], wc, 1746 btrfs_header_generation(path->nodes[*level])); 1747 1748 if (wc->free) { 1749 next = path->nodes[*level]; 1750 btrfs_tree_lock(next); 1751 clean_tree_block(trans, root, next); 1752 btrfs_set_lock_blocking(next); 1753 btrfs_wait_tree_block_writeback(next); 1754 btrfs_tree_unlock(next); 1755 1756 if (*level == 0) { 1757 ret = btrfs_drop_leaf_ref(trans, root, next); 1758 BUG_ON(ret); 1759 } 1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize); 1762 BUG_ON(ret); 1763 } 1764 free_extent_buffer(path->nodes[*level]); 1765 path->nodes[*level] = NULL; 1766 *level += 1; 1767 1768 cond_resched(); 1769 return 0; 1770 } 1771 1772 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1773 struct btrfs_root *root, 1774 struct btrfs_path *path, int *level, 1775 struct walk_control *wc) 1776 { 1777 u64 root_owner; 1778 u64 root_gen; 1779 int i; 1780 int slot; 1781 int ret; 1782 1783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1784 slot = path->slots[i]; 1785 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1786 struct extent_buffer *node; 1787 node = path->nodes[i]; 1788 path->slots[i]++; 1789 *level = i; 1790 WARN_ON(*level == 0); 1791 return 0; 1792 } else { 1793 struct extent_buffer *parent; 1794 if (path->nodes[*level] == root->node) 1795 parent = path->nodes[*level]; 1796 else 1797 parent = path->nodes[*level + 1]; 1798 1799 root_owner = btrfs_header_owner(parent); 1800 root_gen = btrfs_header_generation(parent); 1801 wc->process_func(root, path->nodes[*level], wc, 1802 btrfs_header_generation(path->nodes[*level])); 1803 if (wc->free) { 1804 struct extent_buffer *next; 1805 1806 next = path->nodes[*level]; 1807 1808 btrfs_tree_lock(next); 1809 clean_tree_block(trans, root, next); 1810 btrfs_set_lock_blocking(next); 1811 btrfs_wait_tree_block_writeback(next); 1812 btrfs_tree_unlock(next); 1813 1814 if (*level == 0) { 1815 ret = btrfs_drop_leaf_ref(trans, root, 1816 next); 1817 BUG_ON(ret); 1818 } 1819 1820 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1821 ret = btrfs_free_reserved_extent(root, 1822 path->nodes[*level]->start, 1823 path->nodes[*level]->len); 1824 BUG_ON(ret); 1825 } 1826 free_extent_buffer(path->nodes[*level]); 1827 path->nodes[*level] = NULL; 1828 *level = i + 1; 1829 } 1830 } 1831 return 1; 1832 } 1833 1834 /* 1835 * drop the reference count on the tree rooted at 'snap'. This traverses 1836 * the tree freeing any blocks that have a ref count of zero after being 1837 * decremented. 1838 */ 1839 static int walk_log_tree(struct btrfs_trans_handle *trans, 1840 struct btrfs_root *log, struct walk_control *wc) 1841 { 1842 int ret = 0; 1843 int wret; 1844 int level; 1845 struct btrfs_path *path; 1846 int i; 1847 int orig_level; 1848 1849 path = btrfs_alloc_path(); 1850 BUG_ON(!path); 1851 1852 level = btrfs_header_level(log->node); 1853 orig_level = level; 1854 path->nodes[level] = log->node; 1855 extent_buffer_get(log->node); 1856 path->slots[level] = 0; 1857 1858 while (1) { 1859 wret = walk_down_log_tree(trans, log, path, &level, wc); 1860 if (wret > 0) 1861 break; 1862 if (wret < 0) 1863 ret = wret; 1864 1865 wret = walk_up_log_tree(trans, log, path, &level, wc); 1866 if (wret > 0) 1867 break; 1868 if (wret < 0) 1869 ret = wret; 1870 } 1871 1872 /* was the root node processed? if not, catch it here */ 1873 if (path->nodes[orig_level]) { 1874 wc->process_func(log, path->nodes[orig_level], wc, 1875 btrfs_header_generation(path->nodes[orig_level])); 1876 if (wc->free) { 1877 struct extent_buffer *next; 1878 1879 next = path->nodes[orig_level]; 1880 1881 btrfs_tree_lock(next); 1882 clean_tree_block(trans, log, next); 1883 btrfs_set_lock_blocking(next); 1884 btrfs_wait_tree_block_writeback(next); 1885 btrfs_tree_unlock(next); 1886 1887 if (orig_level == 0) { 1888 ret = btrfs_drop_leaf_ref(trans, log, 1889 next); 1890 BUG_ON(ret); 1891 } 1892 WARN_ON(log->root_key.objectid != 1893 BTRFS_TREE_LOG_OBJECTID); 1894 ret = btrfs_free_reserved_extent(log, next->start, 1895 next->len); 1896 BUG_ON(ret); 1897 } 1898 } 1899 1900 for (i = 0; i <= orig_level; i++) { 1901 if (path->nodes[i]) { 1902 free_extent_buffer(path->nodes[i]); 1903 path->nodes[i] = NULL; 1904 } 1905 } 1906 btrfs_free_path(path); 1907 return ret; 1908 } 1909 1910 /* 1911 * helper function to update the item for a given subvolumes log root 1912 * in the tree of log roots 1913 */ 1914 static int update_log_root(struct btrfs_trans_handle *trans, 1915 struct btrfs_root *log) 1916 { 1917 int ret; 1918 1919 if (log->log_transid == 1) { 1920 /* insert root item on the first sync */ 1921 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 1922 &log->root_key, &log->root_item); 1923 } else { 1924 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 1925 &log->root_key, &log->root_item); 1926 } 1927 return ret; 1928 } 1929 1930 static int wait_log_commit(struct btrfs_trans_handle *trans, 1931 struct btrfs_root *root, unsigned long transid) 1932 { 1933 DEFINE_WAIT(wait); 1934 int index = transid % 2; 1935 1936 /* 1937 * we only allow two pending log transactions at a time, 1938 * so we know that if ours is more than 2 older than the 1939 * current transaction, we're done 1940 */ 1941 do { 1942 prepare_to_wait(&root->log_commit_wait[index], 1943 &wait, TASK_UNINTERRUPTIBLE); 1944 mutex_unlock(&root->log_mutex); 1945 1946 if (root->fs_info->last_trans_log_full_commit != 1947 trans->transid && root->log_transid < transid + 2 && 1948 atomic_read(&root->log_commit[index])) 1949 schedule(); 1950 1951 finish_wait(&root->log_commit_wait[index], &wait); 1952 mutex_lock(&root->log_mutex); 1953 } while (root->log_transid < transid + 2 && 1954 atomic_read(&root->log_commit[index])); 1955 return 0; 1956 } 1957 1958 static int wait_for_writer(struct btrfs_trans_handle *trans, 1959 struct btrfs_root *root) 1960 { 1961 DEFINE_WAIT(wait); 1962 while (atomic_read(&root->log_writers)) { 1963 prepare_to_wait(&root->log_writer_wait, 1964 &wait, TASK_UNINTERRUPTIBLE); 1965 mutex_unlock(&root->log_mutex); 1966 if (root->fs_info->last_trans_log_full_commit != 1967 trans->transid && atomic_read(&root->log_writers)) 1968 schedule(); 1969 mutex_lock(&root->log_mutex); 1970 finish_wait(&root->log_writer_wait, &wait); 1971 } 1972 return 0; 1973 } 1974 1975 /* 1976 * btrfs_sync_log does sends a given tree log down to the disk and 1977 * updates the super blocks to record it. When this call is done, 1978 * you know that any inodes previously logged are safely on disk only 1979 * if it returns 0. 1980 * 1981 * Any other return value means you need to call btrfs_commit_transaction. 1982 * Some of the edge cases for fsyncing directories that have had unlinks 1983 * or renames done in the past mean that sometimes the only safe 1984 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 1985 * that has happened. 1986 */ 1987 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1988 struct btrfs_root *root) 1989 { 1990 int index1; 1991 int index2; 1992 int ret; 1993 struct btrfs_root *log = root->log_root; 1994 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1995 1996 mutex_lock(&root->log_mutex); 1997 index1 = root->log_transid % 2; 1998 if (atomic_read(&root->log_commit[index1])) { 1999 wait_log_commit(trans, root, root->log_transid); 2000 mutex_unlock(&root->log_mutex); 2001 return 0; 2002 } 2003 atomic_set(&root->log_commit[index1], 1); 2004 2005 /* wait for previous tree log sync to complete */ 2006 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2007 wait_log_commit(trans, root, root->log_transid - 1); 2008 2009 while (1) { 2010 unsigned long batch = root->log_batch; 2011 mutex_unlock(&root->log_mutex); 2012 schedule_timeout_uninterruptible(1); 2013 mutex_lock(&root->log_mutex); 2014 2015 wait_for_writer(trans, root); 2016 if (batch == root->log_batch) 2017 break; 2018 } 2019 2020 /* bail out if we need to do a full commit */ 2021 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2022 ret = -EAGAIN; 2023 mutex_unlock(&root->log_mutex); 2024 goto out; 2025 } 2026 2027 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2028 BUG_ON(ret); 2029 2030 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2031 btrfs_set_root_generation(&log->root_item, trans->transid); 2032 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); 2033 2034 root->log_batch = 0; 2035 root->log_transid++; 2036 log->log_transid = root->log_transid; 2037 smp_mb(); 2038 /* 2039 * log tree has been flushed to disk, new modifications of 2040 * the log will be written to new positions. so it's safe to 2041 * allow log writers to go in. 2042 */ 2043 mutex_unlock(&root->log_mutex); 2044 2045 mutex_lock(&log_root_tree->log_mutex); 2046 log_root_tree->log_batch++; 2047 atomic_inc(&log_root_tree->log_writers); 2048 mutex_unlock(&log_root_tree->log_mutex); 2049 2050 ret = update_log_root(trans, log); 2051 BUG_ON(ret); 2052 2053 mutex_lock(&log_root_tree->log_mutex); 2054 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2055 smp_mb(); 2056 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2057 wake_up(&log_root_tree->log_writer_wait); 2058 } 2059 2060 index2 = log_root_tree->log_transid % 2; 2061 if (atomic_read(&log_root_tree->log_commit[index2])) { 2062 wait_log_commit(trans, log_root_tree, 2063 log_root_tree->log_transid); 2064 mutex_unlock(&log_root_tree->log_mutex); 2065 goto out; 2066 } 2067 atomic_set(&log_root_tree->log_commit[index2], 1); 2068 2069 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2070 wait_log_commit(trans, log_root_tree, 2071 log_root_tree->log_transid - 1); 2072 } 2073 2074 wait_for_writer(trans, log_root_tree); 2075 2076 /* 2077 * now that we've moved on to the tree of log tree roots, 2078 * check the full commit flag again 2079 */ 2080 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2081 mutex_unlock(&log_root_tree->log_mutex); 2082 ret = -EAGAIN; 2083 goto out_wake_log_root; 2084 } 2085 2086 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2087 &log_root_tree->dirty_log_pages); 2088 BUG_ON(ret); 2089 2090 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2091 log_root_tree->node->start); 2092 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2093 btrfs_header_level(log_root_tree->node)); 2094 2095 log_root_tree->log_batch = 0; 2096 log_root_tree->log_transid++; 2097 smp_mb(); 2098 2099 mutex_unlock(&log_root_tree->log_mutex); 2100 2101 /* 2102 * nobody else is going to jump in and write the the ctree 2103 * super here because the log_commit atomic below is protecting 2104 * us. We must be called with a transaction handle pinning 2105 * the running transaction open, so a full commit can't hop 2106 * in and cause problems either. 2107 */ 2108 write_ctree_super(trans, root->fs_info->tree_root, 2); 2109 ret = 0; 2110 2111 out_wake_log_root: 2112 atomic_set(&log_root_tree->log_commit[index2], 0); 2113 smp_mb(); 2114 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2115 wake_up(&log_root_tree->log_commit_wait[index2]); 2116 out: 2117 atomic_set(&root->log_commit[index1], 0); 2118 smp_mb(); 2119 if (waitqueue_active(&root->log_commit_wait[index1])) 2120 wake_up(&root->log_commit_wait[index1]); 2121 return 0; 2122 } 2123 2124 /* 2125 * free all the extents used by the tree log. This should be called 2126 * at commit time of the full transaction 2127 */ 2128 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2129 { 2130 int ret; 2131 struct btrfs_root *log; 2132 struct key; 2133 u64 start; 2134 u64 end; 2135 struct walk_control wc = { 2136 .free = 1, 2137 .process_func = process_one_buffer 2138 }; 2139 2140 if (!root->log_root || root->fs_info->log_root_recovering) 2141 return 0; 2142 2143 log = root->log_root; 2144 ret = walk_log_tree(trans, log, &wc); 2145 BUG_ON(ret); 2146 2147 while (1) { 2148 ret = find_first_extent_bit(&log->dirty_log_pages, 2149 0, &start, &end, EXTENT_DIRTY); 2150 if (ret) 2151 break; 2152 2153 clear_extent_dirty(&log->dirty_log_pages, 2154 start, end, GFP_NOFS); 2155 } 2156 2157 if (log->log_transid > 0) { 2158 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2159 &log->root_key); 2160 BUG_ON(ret); 2161 } 2162 root->log_root = NULL; 2163 free_extent_buffer(log->node); 2164 kfree(log); 2165 return 0; 2166 } 2167 2168 /* 2169 * If both a file and directory are logged, and unlinks or renames are 2170 * mixed in, we have a few interesting corners: 2171 * 2172 * create file X in dir Y 2173 * link file X to X.link in dir Y 2174 * fsync file X 2175 * unlink file X but leave X.link 2176 * fsync dir Y 2177 * 2178 * After a crash we would expect only X.link to exist. But file X 2179 * didn't get fsync'd again so the log has back refs for X and X.link. 2180 * 2181 * We solve this by removing directory entries and inode backrefs from the 2182 * log when a file that was logged in the current transaction is 2183 * unlinked. Any later fsync will include the updated log entries, and 2184 * we'll be able to reconstruct the proper directory items from backrefs. 2185 * 2186 * This optimizations allows us to avoid relogging the entire inode 2187 * or the entire directory. 2188 */ 2189 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2190 struct btrfs_root *root, 2191 const char *name, int name_len, 2192 struct inode *dir, u64 index) 2193 { 2194 struct btrfs_root *log; 2195 struct btrfs_dir_item *di; 2196 struct btrfs_path *path; 2197 int ret; 2198 int bytes_del = 0; 2199 2200 if (BTRFS_I(dir)->logged_trans < trans->transid) 2201 return 0; 2202 2203 ret = join_running_log_trans(root); 2204 if (ret) 2205 return 0; 2206 2207 mutex_lock(&BTRFS_I(dir)->log_mutex); 2208 2209 log = root->log_root; 2210 path = btrfs_alloc_path(); 2211 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2212 name, name_len, -1); 2213 if (di && !IS_ERR(di)) { 2214 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2215 bytes_del += name_len; 2216 BUG_ON(ret); 2217 } 2218 btrfs_release_path(log, path); 2219 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2220 index, name, name_len, -1); 2221 if (di && !IS_ERR(di)) { 2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2223 bytes_del += name_len; 2224 BUG_ON(ret); 2225 } 2226 2227 /* update the directory size in the log to reflect the names 2228 * we have removed 2229 */ 2230 if (bytes_del) { 2231 struct btrfs_key key; 2232 2233 key.objectid = dir->i_ino; 2234 key.offset = 0; 2235 key.type = BTRFS_INODE_ITEM_KEY; 2236 btrfs_release_path(log, path); 2237 2238 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2239 if (ret == 0) { 2240 struct btrfs_inode_item *item; 2241 u64 i_size; 2242 2243 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2244 struct btrfs_inode_item); 2245 i_size = btrfs_inode_size(path->nodes[0], item); 2246 if (i_size > bytes_del) 2247 i_size -= bytes_del; 2248 else 2249 i_size = 0; 2250 btrfs_set_inode_size(path->nodes[0], item, i_size); 2251 btrfs_mark_buffer_dirty(path->nodes[0]); 2252 } else 2253 ret = 0; 2254 btrfs_release_path(log, path); 2255 } 2256 2257 btrfs_free_path(path); 2258 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2259 btrfs_end_log_trans(root); 2260 2261 return 0; 2262 } 2263 2264 /* see comments for btrfs_del_dir_entries_in_log */ 2265 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2266 struct btrfs_root *root, 2267 const char *name, int name_len, 2268 struct inode *inode, u64 dirid) 2269 { 2270 struct btrfs_root *log; 2271 u64 index; 2272 int ret; 2273 2274 if (BTRFS_I(inode)->logged_trans < trans->transid) 2275 return 0; 2276 2277 ret = join_running_log_trans(root); 2278 if (ret) 2279 return 0; 2280 log = root->log_root; 2281 mutex_lock(&BTRFS_I(inode)->log_mutex); 2282 2283 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2284 dirid, &index); 2285 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2286 btrfs_end_log_trans(root); 2287 2288 return ret; 2289 } 2290 2291 /* 2292 * creates a range item in the log for 'dirid'. first_offset and 2293 * last_offset tell us which parts of the key space the log should 2294 * be considered authoritative for. 2295 */ 2296 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2297 struct btrfs_root *log, 2298 struct btrfs_path *path, 2299 int key_type, u64 dirid, 2300 u64 first_offset, u64 last_offset) 2301 { 2302 int ret; 2303 struct btrfs_key key; 2304 struct btrfs_dir_log_item *item; 2305 2306 key.objectid = dirid; 2307 key.offset = first_offset; 2308 if (key_type == BTRFS_DIR_ITEM_KEY) 2309 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2310 else 2311 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2312 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2313 BUG_ON(ret); 2314 2315 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2316 struct btrfs_dir_log_item); 2317 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2318 btrfs_mark_buffer_dirty(path->nodes[0]); 2319 btrfs_release_path(log, path); 2320 return 0; 2321 } 2322 2323 /* 2324 * log all the items included in the current transaction for a given 2325 * directory. This also creates the range items in the log tree required 2326 * to replay anything deleted before the fsync 2327 */ 2328 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2329 struct btrfs_root *root, struct inode *inode, 2330 struct btrfs_path *path, 2331 struct btrfs_path *dst_path, int key_type, 2332 u64 min_offset, u64 *last_offset_ret) 2333 { 2334 struct btrfs_key min_key; 2335 struct btrfs_key max_key; 2336 struct btrfs_root *log = root->log_root; 2337 struct extent_buffer *src; 2338 int ret; 2339 int i; 2340 int nritems; 2341 u64 first_offset = min_offset; 2342 u64 last_offset = (u64)-1; 2343 2344 log = root->log_root; 2345 max_key.objectid = inode->i_ino; 2346 max_key.offset = (u64)-1; 2347 max_key.type = key_type; 2348 2349 min_key.objectid = inode->i_ino; 2350 min_key.type = key_type; 2351 min_key.offset = min_offset; 2352 2353 path->keep_locks = 1; 2354 2355 ret = btrfs_search_forward(root, &min_key, &max_key, 2356 path, 0, trans->transid); 2357 2358 /* 2359 * we didn't find anything from this transaction, see if there 2360 * is anything at all 2361 */ 2362 if (ret != 0 || min_key.objectid != inode->i_ino || 2363 min_key.type != key_type) { 2364 min_key.objectid = inode->i_ino; 2365 min_key.type = key_type; 2366 min_key.offset = (u64)-1; 2367 btrfs_release_path(root, path); 2368 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2369 if (ret < 0) { 2370 btrfs_release_path(root, path); 2371 return ret; 2372 } 2373 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2374 2375 /* if ret == 0 there are items for this type, 2376 * create a range to tell us the last key of this type. 2377 * otherwise, there are no items in this directory after 2378 * *min_offset, and we create a range to indicate that. 2379 */ 2380 if (ret == 0) { 2381 struct btrfs_key tmp; 2382 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2383 path->slots[0]); 2384 if (key_type == tmp.type) 2385 first_offset = max(min_offset, tmp.offset) + 1; 2386 } 2387 goto done; 2388 } 2389 2390 /* go backward to find any previous key */ 2391 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2392 if (ret == 0) { 2393 struct btrfs_key tmp; 2394 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2395 if (key_type == tmp.type) { 2396 first_offset = tmp.offset; 2397 ret = overwrite_item(trans, log, dst_path, 2398 path->nodes[0], path->slots[0], 2399 &tmp); 2400 } 2401 } 2402 btrfs_release_path(root, path); 2403 2404 /* find the first key from this transaction again */ 2405 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2406 if (ret != 0) { 2407 WARN_ON(1); 2408 goto done; 2409 } 2410 2411 /* 2412 * we have a block from this transaction, log every item in it 2413 * from our directory 2414 */ 2415 while (1) { 2416 struct btrfs_key tmp; 2417 src = path->nodes[0]; 2418 nritems = btrfs_header_nritems(src); 2419 for (i = path->slots[0]; i < nritems; i++) { 2420 btrfs_item_key_to_cpu(src, &min_key, i); 2421 2422 if (min_key.objectid != inode->i_ino || 2423 min_key.type != key_type) 2424 goto done; 2425 ret = overwrite_item(trans, log, dst_path, src, i, 2426 &min_key); 2427 BUG_ON(ret); 2428 } 2429 path->slots[0] = nritems; 2430 2431 /* 2432 * look ahead to the next item and see if it is also 2433 * from this directory and from this transaction 2434 */ 2435 ret = btrfs_next_leaf(root, path); 2436 if (ret == 1) { 2437 last_offset = (u64)-1; 2438 goto done; 2439 } 2440 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2441 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2442 last_offset = (u64)-1; 2443 goto done; 2444 } 2445 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2446 ret = overwrite_item(trans, log, dst_path, 2447 path->nodes[0], path->slots[0], 2448 &tmp); 2449 2450 BUG_ON(ret); 2451 last_offset = tmp.offset; 2452 goto done; 2453 } 2454 } 2455 done: 2456 *last_offset_ret = last_offset; 2457 btrfs_release_path(root, path); 2458 btrfs_release_path(log, dst_path); 2459 2460 /* insert the log range keys to indicate where the log is valid */ 2461 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2462 first_offset, last_offset); 2463 BUG_ON(ret); 2464 return 0; 2465 } 2466 2467 /* 2468 * logging directories is very similar to logging inodes, We find all the items 2469 * from the current transaction and write them to the log. 2470 * 2471 * The recovery code scans the directory in the subvolume, and if it finds a 2472 * key in the range logged that is not present in the log tree, then it means 2473 * that dir entry was unlinked during the transaction. 2474 * 2475 * In order for that scan to work, we must include one key smaller than 2476 * the smallest logged by this transaction and one key larger than the largest 2477 * key logged by this transaction. 2478 */ 2479 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2480 struct btrfs_root *root, struct inode *inode, 2481 struct btrfs_path *path, 2482 struct btrfs_path *dst_path) 2483 { 2484 u64 min_key; 2485 u64 max_key; 2486 int ret; 2487 int key_type = BTRFS_DIR_ITEM_KEY; 2488 2489 again: 2490 min_key = 0; 2491 max_key = 0; 2492 while (1) { 2493 ret = log_dir_items(trans, root, inode, path, 2494 dst_path, key_type, min_key, 2495 &max_key); 2496 BUG_ON(ret); 2497 if (max_key == (u64)-1) 2498 break; 2499 min_key = max_key + 1; 2500 } 2501 2502 if (key_type == BTRFS_DIR_ITEM_KEY) { 2503 key_type = BTRFS_DIR_INDEX_KEY; 2504 goto again; 2505 } 2506 return 0; 2507 } 2508 2509 /* 2510 * a helper function to drop items from the log before we relog an 2511 * inode. max_key_type indicates the highest item type to remove. 2512 * This cannot be run for file data extents because it does not 2513 * free the extents they point to. 2514 */ 2515 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2516 struct btrfs_root *log, 2517 struct btrfs_path *path, 2518 u64 objectid, int max_key_type) 2519 { 2520 int ret; 2521 struct btrfs_key key; 2522 struct btrfs_key found_key; 2523 2524 key.objectid = objectid; 2525 key.type = max_key_type; 2526 key.offset = (u64)-1; 2527 2528 while (1) { 2529 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2530 2531 if (ret != 1) 2532 break; 2533 2534 if (path->slots[0] == 0) 2535 break; 2536 2537 path->slots[0]--; 2538 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2539 path->slots[0]); 2540 2541 if (found_key.objectid != objectid) 2542 break; 2543 2544 ret = btrfs_del_item(trans, log, path); 2545 BUG_ON(ret); 2546 btrfs_release_path(log, path); 2547 } 2548 btrfs_release_path(log, path); 2549 return 0; 2550 } 2551 2552 static noinline int copy_items(struct btrfs_trans_handle *trans, 2553 struct btrfs_root *log, 2554 struct btrfs_path *dst_path, 2555 struct extent_buffer *src, 2556 int start_slot, int nr, int inode_only) 2557 { 2558 unsigned long src_offset; 2559 unsigned long dst_offset; 2560 struct btrfs_file_extent_item *extent; 2561 struct btrfs_inode_item *inode_item; 2562 int ret; 2563 struct btrfs_key *ins_keys; 2564 u32 *ins_sizes; 2565 char *ins_data; 2566 int i; 2567 struct list_head ordered_sums; 2568 2569 INIT_LIST_HEAD(&ordered_sums); 2570 2571 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2572 nr * sizeof(u32), GFP_NOFS); 2573 ins_sizes = (u32 *)ins_data; 2574 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2575 2576 for (i = 0; i < nr; i++) { 2577 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2578 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2579 } 2580 ret = btrfs_insert_empty_items(trans, log, dst_path, 2581 ins_keys, ins_sizes, nr); 2582 BUG_ON(ret); 2583 2584 for (i = 0; i < nr; i++) { 2585 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2586 dst_path->slots[0]); 2587 2588 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2589 2590 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2591 src_offset, ins_sizes[i]); 2592 2593 if (inode_only == LOG_INODE_EXISTS && 2594 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2595 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2596 dst_path->slots[0], 2597 struct btrfs_inode_item); 2598 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2599 2600 /* set the generation to zero so the recover code 2601 * can tell the difference between an logging 2602 * just to say 'this inode exists' and a logging 2603 * to say 'update this inode with these values' 2604 */ 2605 btrfs_set_inode_generation(dst_path->nodes[0], 2606 inode_item, 0); 2607 } 2608 /* take a reference on file data extents so that truncates 2609 * or deletes of this inode don't have to relog the inode 2610 * again 2611 */ 2612 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2613 int found_type; 2614 extent = btrfs_item_ptr(src, start_slot + i, 2615 struct btrfs_file_extent_item); 2616 2617 found_type = btrfs_file_extent_type(src, extent); 2618 if (found_type == BTRFS_FILE_EXTENT_REG || 2619 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2620 u64 ds = btrfs_file_extent_disk_bytenr(src, 2621 extent); 2622 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2623 extent); 2624 u64 cs = btrfs_file_extent_offset(src, extent); 2625 u64 cl = btrfs_file_extent_num_bytes(src, 2626 extent);; 2627 if (btrfs_file_extent_compression(src, 2628 extent)) { 2629 cs = 0; 2630 cl = dl; 2631 } 2632 /* ds == 0 is a hole */ 2633 if (ds != 0) { 2634 ret = btrfs_inc_extent_ref(trans, log, 2635 ds, dl, 2636 dst_path->nodes[0]->start, 2637 BTRFS_TREE_LOG_OBJECTID, 2638 trans->transid, 2639 ins_keys[i].objectid); 2640 BUG_ON(ret); 2641 ret = btrfs_lookup_csums_range( 2642 log->fs_info->csum_root, 2643 ds + cs, ds + cs + cl - 1, 2644 &ordered_sums); 2645 BUG_ON(ret); 2646 } 2647 } 2648 } 2649 dst_path->slots[0]++; 2650 } 2651 2652 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2653 btrfs_release_path(log, dst_path); 2654 kfree(ins_data); 2655 2656 /* 2657 * we have to do this after the loop above to avoid changing the 2658 * log tree while trying to change the log tree. 2659 */ 2660 while (!list_empty(&ordered_sums)) { 2661 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2662 struct btrfs_ordered_sum, 2663 list); 2664 ret = btrfs_csum_file_blocks(trans, log, sums); 2665 BUG_ON(ret); 2666 list_del(&sums->list); 2667 kfree(sums); 2668 } 2669 return 0; 2670 } 2671 2672 /* log a single inode in the tree log. 2673 * At least one parent directory for this inode must exist in the tree 2674 * or be logged already. 2675 * 2676 * Any items from this inode changed by the current transaction are copied 2677 * to the log tree. An extra reference is taken on any extents in this 2678 * file, allowing us to avoid a whole pile of corner cases around logging 2679 * blocks that have been removed from the tree. 2680 * 2681 * See LOG_INODE_ALL and related defines for a description of what inode_only 2682 * does. 2683 * 2684 * This handles both files and directories. 2685 */ 2686 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2687 struct btrfs_root *root, struct inode *inode, 2688 int inode_only) 2689 { 2690 struct btrfs_path *path; 2691 struct btrfs_path *dst_path; 2692 struct btrfs_key min_key; 2693 struct btrfs_key max_key; 2694 struct btrfs_root *log = root->log_root; 2695 struct extent_buffer *src = NULL; 2696 u32 size; 2697 int ret; 2698 int nritems; 2699 int ins_start_slot = 0; 2700 int ins_nr; 2701 2702 log = root->log_root; 2703 2704 path = btrfs_alloc_path(); 2705 dst_path = btrfs_alloc_path(); 2706 2707 min_key.objectid = inode->i_ino; 2708 min_key.type = BTRFS_INODE_ITEM_KEY; 2709 min_key.offset = 0; 2710 2711 max_key.objectid = inode->i_ino; 2712 2713 /* today the code can only do partial logging of directories */ 2714 if (!S_ISDIR(inode->i_mode)) 2715 inode_only = LOG_INODE_ALL; 2716 2717 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2718 max_key.type = BTRFS_XATTR_ITEM_KEY; 2719 else 2720 max_key.type = (u8)-1; 2721 max_key.offset = (u64)-1; 2722 2723 mutex_lock(&BTRFS_I(inode)->log_mutex); 2724 2725 /* 2726 * a brute force approach to making sure we get the most uptodate 2727 * copies of everything. 2728 */ 2729 if (S_ISDIR(inode->i_mode)) { 2730 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2731 2732 if (inode_only == LOG_INODE_EXISTS) 2733 max_key_type = BTRFS_XATTR_ITEM_KEY; 2734 ret = drop_objectid_items(trans, log, path, 2735 inode->i_ino, max_key_type); 2736 } else { 2737 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2738 } 2739 BUG_ON(ret); 2740 path->keep_locks = 1; 2741 2742 while (1) { 2743 ins_nr = 0; 2744 ret = btrfs_search_forward(root, &min_key, &max_key, 2745 path, 0, trans->transid); 2746 if (ret != 0) 2747 break; 2748 again: 2749 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2750 if (min_key.objectid != inode->i_ino) 2751 break; 2752 if (min_key.type > max_key.type) 2753 break; 2754 2755 src = path->nodes[0]; 2756 size = btrfs_item_size_nr(src, path->slots[0]); 2757 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2758 ins_nr++; 2759 goto next_slot; 2760 } else if (!ins_nr) { 2761 ins_start_slot = path->slots[0]; 2762 ins_nr = 1; 2763 goto next_slot; 2764 } 2765 2766 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2767 ins_nr, inode_only); 2768 BUG_ON(ret); 2769 ins_nr = 1; 2770 ins_start_slot = path->slots[0]; 2771 next_slot: 2772 2773 nritems = btrfs_header_nritems(path->nodes[0]); 2774 path->slots[0]++; 2775 if (path->slots[0] < nritems) { 2776 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2777 path->slots[0]); 2778 goto again; 2779 } 2780 if (ins_nr) { 2781 ret = copy_items(trans, log, dst_path, src, 2782 ins_start_slot, 2783 ins_nr, inode_only); 2784 BUG_ON(ret); 2785 ins_nr = 0; 2786 } 2787 btrfs_release_path(root, path); 2788 2789 if (min_key.offset < (u64)-1) 2790 min_key.offset++; 2791 else if (min_key.type < (u8)-1) 2792 min_key.type++; 2793 else if (min_key.objectid < (u64)-1) 2794 min_key.objectid++; 2795 else 2796 break; 2797 } 2798 if (ins_nr) { 2799 ret = copy_items(trans, log, dst_path, src, 2800 ins_start_slot, 2801 ins_nr, inode_only); 2802 BUG_ON(ret); 2803 ins_nr = 0; 2804 } 2805 WARN_ON(ins_nr); 2806 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2807 btrfs_release_path(root, path); 2808 btrfs_release_path(log, dst_path); 2809 ret = log_directory_changes(trans, root, inode, path, dst_path); 2810 BUG_ON(ret); 2811 } 2812 BTRFS_I(inode)->logged_trans = trans->transid; 2813 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2814 2815 btrfs_free_path(path); 2816 btrfs_free_path(dst_path); 2817 return 0; 2818 } 2819 2820 /* 2821 * follow the dentry parent pointers up the chain and see if any 2822 * of the directories in it require a full commit before they can 2823 * be logged. Returns zero if nothing special needs to be done or 1 if 2824 * a full commit is required. 2825 */ 2826 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 2827 struct inode *inode, 2828 struct dentry *parent, 2829 struct super_block *sb, 2830 u64 last_committed) 2831 { 2832 int ret = 0; 2833 struct btrfs_root *root; 2834 2835 /* 2836 * for regular files, if its inode is already on disk, we don't 2837 * have to worry about the parents at all. This is because 2838 * we can use the last_unlink_trans field to record renames 2839 * and other fun in this file. 2840 */ 2841 if (S_ISREG(inode->i_mode) && 2842 BTRFS_I(inode)->generation <= last_committed && 2843 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2844 goto out; 2845 2846 if (!S_ISDIR(inode->i_mode)) { 2847 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2848 goto out; 2849 inode = parent->d_inode; 2850 } 2851 2852 while (1) { 2853 BTRFS_I(inode)->logged_trans = trans->transid; 2854 smp_mb(); 2855 2856 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 2857 root = BTRFS_I(inode)->root; 2858 2859 /* 2860 * make sure any commits to the log are forced 2861 * to be full commits 2862 */ 2863 root->fs_info->last_trans_log_full_commit = 2864 trans->transid; 2865 ret = 1; 2866 break; 2867 } 2868 2869 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2870 break; 2871 2872 if (parent == sb->s_root) 2873 break; 2874 2875 parent = parent->d_parent; 2876 inode = parent->d_inode; 2877 2878 } 2879 out: 2880 return ret; 2881 } 2882 2883 /* 2884 * helper function around btrfs_log_inode to make sure newly created 2885 * parent directories also end up in the log. A minimal inode and backref 2886 * only logging is done of any parent directories that are older than 2887 * the last committed transaction 2888 */ 2889 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 2890 struct btrfs_root *root, struct inode *inode, 2891 struct dentry *parent, int exists_only) 2892 { 2893 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2894 struct super_block *sb; 2895 int ret = 0; 2896 u64 last_committed = root->fs_info->last_trans_committed; 2897 2898 sb = inode->i_sb; 2899 2900 if (btrfs_test_opt(root, NOTREELOG)) { 2901 ret = 1; 2902 goto end_no_trans; 2903 } 2904 2905 if (root->fs_info->last_trans_log_full_commit > 2906 root->fs_info->last_trans_committed) { 2907 ret = 1; 2908 goto end_no_trans; 2909 } 2910 2911 ret = check_parent_dirs_for_sync(trans, inode, parent, 2912 sb, last_committed); 2913 if (ret) 2914 goto end_no_trans; 2915 2916 start_log_trans(trans, root); 2917 2918 ret = btrfs_log_inode(trans, root, inode, inode_only); 2919 BUG_ON(ret); 2920 2921 /* 2922 * for regular files, if its inode is already on disk, we don't 2923 * have to worry about the parents at all. This is because 2924 * we can use the last_unlink_trans field to record renames 2925 * and other fun in this file. 2926 */ 2927 if (S_ISREG(inode->i_mode) && 2928 BTRFS_I(inode)->generation <= last_committed && 2929 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2930 goto no_parent; 2931 2932 inode_only = LOG_INODE_EXISTS; 2933 while (1) { 2934 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2935 break; 2936 2937 inode = parent->d_inode; 2938 if (BTRFS_I(inode)->generation > 2939 root->fs_info->last_trans_committed) { 2940 ret = btrfs_log_inode(trans, root, inode, inode_only); 2941 BUG_ON(ret); 2942 } 2943 if (parent == sb->s_root) 2944 break; 2945 2946 parent = parent->d_parent; 2947 } 2948 no_parent: 2949 ret = 0; 2950 btrfs_end_log_trans(root); 2951 end_no_trans: 2952 return ret; 2953 } 2954 2955 /* 2956 * it is not safe to log dentry if the chunk root has added new 2957 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 2958 * If this returns 1, you must commit the transaction to safely get your 2959 * data on disk. 2960 */ 2961 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2962 struct btrfs_root *root, struct dentry *dentry) 2963 { 2964 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 2965 dentry->d_parent, 0); 2966 } 2967 2968 /* 2969 * should be called during mount to recover any replay any log trees 2970 * from the FS 2971 */ 2972 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 2973 { 2974 int ret; 2975 struct btrfs_path *path; 2976 struct btrfs_trans_handle *trans; 2977 struct btrfs_key key; 2978 struct btrfs_key found_key; 2979 struct btrfs_key tmp_key; 2980 struct btrfs_root *log; 2981 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 2982 u64 highest_inode; 2983 struct walk_control wc = { 2984 .process_func = process_one_buffer, 2985 .stage = 0, 2986 }; 2987 2988 fs_info->log_root_recovering = 1; 2989 path = btrfs_alloc_path(); 2990 BUG_ON(!path); 2991 2992 trans = btrfs_start_transaction(fs_info->tree_root, 1); 2993 2994 wc.trans = trans; 2995 wc.pin = 1; 2996 2997 walk_log_tree(trans, log_root_tree, &wc); 2998 2999 again: 3000 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3001 key.offset = (u64)-1; 3002 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 3003 3004 while (1) { 3005 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 3006 if (ret < 0) 3007 break; 3008 if (ret > 0) { 3009 if (path->slots[0] == 0) 3010 break; 3011 path->slots[0]--; 3012 } 3013 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3014 path->slots[0]); 3015 btrfs_release_path(log_root_tree, path); 3016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3017 break; 3018 3019 log = btrfs_read_fs_root_no_radix(log_root_tree, 3020 &found_key); 3021 BUG_ON(!log); 3022 3023 3024 tmp_key.objectid = found_key.offset; 3025 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3026 tmp_key.offset = (u64)-1; 3027 3028 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3029 BUG_ON(!wc.replay_dest); 3030 3031 wc.replay_dest->log_root = log; 3032 mutex_lock(&fs_info->trans_mutex); 3033 btrfs_record_root_in_trans(wc.replay_dest); 3034 mutex_unlock(&fs_info->trans_mutex); 3035 ret = walk_log_tree(trans, log, &wc); 3036 BUG_ON(ret); 3037 3038 if (wc.stage == LOG_WALK_REPLAY_ALL) { 3039 ret = fixup_inode_link_counts(trans, wc.replay_dest, 3040 path); 3041 BUG_ON(ret); 3042 } 3043 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); 3044 if (ret == 0) { 3045 wc.replay_dest->highest_inode = highest_inode; 3046 wc.replay_dest->last_inode_alloc = highest_inode; 3047 } 3048 3049 key.offset = found_key.offset - 1; 3050 wc.replay_dest->log_root = NULL; 3051 free_extent_buffer(log->node); 3052 kfree(log); 3053 3054 if (found_key.offset == 0) 3055 break; 3056 } 3057 btrfs_release_path(log_root_tree, path); 3058 3059 /* step one is to pin it all, step two is to replay just inodes */ 3060 if (wc.pin) { 3061 wc.pin = 0; 3062 wc.process_func = replay_one_buffer; 3063 wc.stage = LOG_WALK_REPLAY_INODES; 3064 goto again; 3065 } 3066 /* step three is to replay everything */ 3067 if (wc.stage < LOG_WALK_REPLAY_ALL) { 3068 wc.stage++; 3069 goto again; 3070 } 3071 3072 btrfs_free_path(path); 3073 3074 free_extent_buffer(log_root_tree->node); 3075 log_root_tree->log_root = NULL; 3076 fs_info->log_root_recovering = 0; 3077 3078 /* step 4: commit the transaction, which also unpins the blocks */ 3079 btrfs_commit_transaction(trans, fs_info->tree_root); 3080 3081 kfree(log_root_tree); 3082 return 0; 3083 } 3084 3085 /* 3086 * there are some corner cases where we want to force a full 3087 * commit instead of allowing a directory to be logged. 3088 * 3089 * They revolve around files there were unlinked from the directory, and 3090 * this function updates the parent directory so that a full commit is 3091 * properly done if it is fsync'd later after the unlinks are done. 3092 */ 3093 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 3094 struct inode *dir, struct inode *inode, 3095 int for_rename) 3096 { 3097 /* 3098 * when we're logging a file, if it hasn't been renamed 3099 * or unlinked, and its inode is fully committed on disk, 3100 * we don't have to worry about walking up the directory chain 3101 * to log its parents. 3102 * 3103 * So, we use the last_unlink_trans field to put this transid 3104 * into the file. When the file is logged we check it and 3105 * don't log the parents if the file is fully on disk. 3106 */ 3107 if (S_ISREG(inode->i_mode)) 3108 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3109 3110 /* 3111 * if this directory was already logged any new 3112 * names for this file/dir will get recorded 3113 */ 3114 smp_mb(); 3115 if (BTRFS_I(dir)->logged_trans == trans->transid) 3116 return; 3117 3118 /* 3119 * if the inode we're about to unlink was logged, 3120 * the log will be properly updated for any new names 3121 */ 3122 if (BTRFS_I(inode)->logged_trans == trans->transid) 3123 return; 3124 3125 /* 3126 * when renaming files across directories, if the directory 3127 * there we're unlinking from gets fsync'd later on, there's 3128 * no way to find the destination directory later and fsync it 3129 * properly. So, we have to be conservative and force commits 3130 * so the new name gets discovered. 3131 */ 3132 if (for_rename) 3133 goto record; 3134 3135 /* we can safely do the unlink without any special recording */ 3136 return; 3137 3138 record: 3139 BTRFS_I(dir)->last_unlink_trans = trans->transid; 3140 } 3141 3142 /* 3143 * Call this after adding a new name for a file and it will properly 3144 * update the log to reflect the new name. 3145 * 3146 * It will return zero if all goes well, and it will return 1 if a 3147 * full transaction commit is required. 3148 */ 3149 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 3150 struct inode *inode, struct inode *old_dir, 3151 struct dentry *parent) 3152 { 3153 struct btrfs_root * root = BTRFS_I(inode)->root; 3154 3155 /* 3156 * this will force the logging code to walk the dentry chain 3157 * up for the file 3158 */ 3159 if (S_ISREG(inode->i_mode)) 3160 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3161 3162 /* 3163 * if this inode hasn't been logged and directory we're renaming it 3164 * from hasn't been logged, we don't need to log it 3165 */ 3166 if (BTRFS_I(inode)->logged_trans <= 3167 root->fs_info->last_trans_committed && 3168 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 3169 root->fs_info->last_trans_committed)) 3170 return 0; 3171 3172 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 3173 } 3174 3175