/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "compat.h"
#include "tree-log.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory foo was fully removed from the FS, but fsync was never
 * called on foo, only on its parent dir f1.  After a crash the rm -rf
 * must be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it
 * is using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
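 *
 * A rough sketch of a typical fsync through this machinery (hedged;
 * see the callers in file.c for the authoritative chain):
 * btrfs_sync_file() joins a transaction, calls btrfs_log_dentry_safe()
 * to copy the inode's items into the per-subvolume log root, and then
 * calls btrfs_sync_log().  If btrfs_sync_log() returns -EAGAIN, the
 * caller falls back to a full btrfs_commit_transaction().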
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root)
{
	int ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		root->log_batch++;
		atomic_inc(&root->log_writers);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree) {
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
		BUG_ON(ret);
	}
	if (!root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		BUG_ON(ret);
	}
	mutex_unlock(&root->fs_info->tree_log_mutex);
	root->log_batch++;
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return 0;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return 0;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
int btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
	return 0;
}


/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.
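	 *
	 * (In this file process_func is either process_one_buffer(),
	 * used for the pin/write/wait walks at commit time, or
	 * replay_one_buffer(), used while replaying a log after a
	 * crash.)
	 *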
	 * Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	if (wc->pin) {
		mutex_lock(&log->fs_info->pinned_mutex);
		btrfs_update_pinned_extents(log->fs_info->extent_root,
					    eb->start, eb->len, 1);
	}

	if (btrfs_buffer_uptodate(eb, gen)) {
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(root, path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			kfree(dst_copy);
			kfree(src_copy);
			btrfs_release_path(root, path);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(root, path);
			return 0;
		}

	}
insert:
	btrfs_release_path(root, path);
	/* try to insert the key into the destination tree */
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size) {
			btrfs_truncate_item(trans, root, path, item_size, 1);
		} else if (found_size < item_size) {
			ret = btrfs_extend_item(trans, root, path,
						item_size - found_size);
			BUG_ON(ret);
		}
	} else if (ret) {
		BUG();
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0)
			goto no_copy;

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(root, path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;
	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
	if (inode->i_state & I_NEW) {
		BTRFS_I(inode)->root = root;
		BTRFS_I(inode)->location.objectid = objectid;
		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
		BTRFS_I(inode)->location.offset = 0;
		btrfs_read_locked_inode(inode);
		unlock_new_inode(inode);
	}
	if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
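 *
 * For example (sketch of the logic below): replaying a REG extent
 * whose disk_bytenr already has an entry in the extent allocation
 * tree only adds a backref with btrfs_inc_extent_ref(); otherwise
 * btrfs_alloc_logged_extent() inserts a fresh extent item first.
 * Inline extents are simply copied in with overwrite_item().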
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 mask = root->sectorsize - 1;
	u64 extent_end;
	u64 alloc_hint;
	u64 start = key->offset;
	u64 saved_nbytes;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, item);
		extent_end = (start + size + mask) & ~mask;
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(root, path);
			goto out;
		}
	}
	btrfs_release_path(root, path);

	saved_nbytes = inode_get_bytes(inode);
	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode,
				 start, extent_end, start, &alloc_hint);
	BUG_ON(ret);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		BUG_ON(ret);
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?
			 * If so, just add a reference
			 */
			ret = btrfs_lookup_extent(root, ins.objectid,
						  ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						path->nodes[0]->start,
						root->root_key.objectid,
						trans->transid, key->objectid);
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_extent(trans, root,
						path->nodes[0]->start,
						root->root_key.objectid,
						trans->transid, key->objectid,
						&ins);
				BUG_ON(ret);
			}
			btrfs_release_path(root, path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums);
			BUG_ON(ret);
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				BUG_ON(ret);
				list_del(&sums->list);
				kfree(sums);
			}
		} else {
			btrfs_release_path(root, path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		BUG_ON(ret);
	}

	inode_set_bytes(inode, saved_nbytes);
	btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
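 * The victim inode is first linked into the fixup dir (see
 * link_to_fixup_dir() below) so that, if this unlink drops the last
 * name, the inode survives until the post-replay link count pass.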
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;
	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(root, path);

	inode = read_one_inode(root, location.objectid);
	BUG_ON(!inode);

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	BUG_ON(ret);

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	BUG_ON(ret);
	kfree(name);

	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(root, path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(root, path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
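 *
 * Layout note: the refs are packed back to back inside the item; each
 * btrfs_inode_ref header is immediately followed by its name bytes,
 * which is why the scan below advances by (ref + 1) + name_len.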
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}


/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir;
	int ret;
	struct btrfs_key location;
	struct btrfs_inode_ref *ref;
	struct btrfs_dir_item *di;
	struct inode *inode;
	char *name;
	int namelen;
	unsigned long ref_ptr;
	unsigned long ref_end;

	location.objectid = key->objectid;
	location.type = BTRFS_INODE_ITEM_KEY;
	location.offset = 0;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, key->offset);
	if (!dir)
		return -ENOENT;

	inode = read_one_inode(root, key->objectid);
	BUG_ON(!inode);

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

again:
	ref = (struct btrfs_inode_ref *)ref_ptr;

	namelen = btrfs_inode_ref_name_len(eb, ref);
	name = kmalloc(namelen, GFP_NOFS);
	BUG_ON(!name);

	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);

	/* if we already have a perfect match, we're done */
	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
			 btrfs_inode_ref_index(eb, ref),
			 name, namelen)) {
		goto out;
	}

	/*
	 * look for a conflicting back reference in the metadata.
	 * if we find one we have to unlink that name of the file
	 * before we add our new link.  Later on, we overwrite any
	 * existing back reference, and we don't want to create
	 * dangling pointers in the directory.
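	 *
	 * Illustrative example (made-up numbers): if the subvolume says
	 * dir 260 has "foo" pointing at inode 261, but the log being
	 * replayed says "foo" should point at inode 300, the old link
	 * to 261 is unlinked here before the new one is added.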
	 */
conflict_again:
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		char *victim_name;
		int victim_name_len;
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;
		struct extent_buffer *leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (key->objectid == key->offset)
			goto out_nowrite;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			BUG_ON(!victim_name);

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log, key, victim_name,
					    victim_name_len)) {
				btrfs_inc_nlink(inode);
				btrfs_release_path(root, path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				btrfs_release_path(root, path);
				goto conflict_again;
			}
			kfree(victim_name);
			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}
		BUG_ON(ret);
	}
	btrfs_release_path(root, path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
					 btrfs_inode_ref_index(eb, ref),
					 name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		BUG_ON(ret);
	}
	btrfs_release_path(root, path);


	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		BUG_ON(ret);
	}
	btrfs_release_path(root, path);

	/* insert our name */
	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
			     btrfs_inode_ref_index(eb, ref));
	BUG_ON(ret);

	btrfs_update_inode(trans, root, inode);

out:
	ref_ptr = (unsigned long)(ref + 1) + namelen;
	kfree(name);
	if (ref_ptr < ref_end)
		goto again;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
	BUG_ON(ret);

out_nowrite:
	btrfs_release_path(root, path);
	iput(dir);
	iput(inode);
	return 0;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
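 *
 * Bookkeeping detail: the pending inodes are parked under the
 * reserved BTRFS_TREE_LOG_FIXUP_OBJECTID as orphan items whose key
 * offset is the inode number; fixup_inode_link_counts() walks those
 * items once the rest of replay is done.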
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	u64 nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;

	key.objectid = inode->i_ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	path = btrfs_alloc_path();

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != inode->i_ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		key.offset--;
		btrfs_release_path(root, path);
	}
	btrfs_release_path(root, path);
	if (nlink != inode->i_nlink) {
		inode->i_nlink = nlink;
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
		ret = replay_dir_deletes(trans, root, NULL, path,
					 inode->i_ino, 1);
		BUG_ON(ret);
	}
	btrfs_free_path(path);

	return 0;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		BUG_ON(ret);

		btrfs_release_path(root, path);
		inode = read_one_inode(root, key.offset);
		BUG_ON(!inode);

		ret = fixup_inode_link_count(trans, root, inode);
		BUG_ON(ret);

		iput(inode);

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(root, path);
	return 0;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.
 * The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	BUG_ON(!inode);

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(root, path);
	if (ret == 0) {
		btrfs_inc_nlink(inode);
		btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG();
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}
	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
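 *
 * Sketch of the decision tree below: an exact match in the subvolume
 * means nothing to do; a conflicting entry whose new inode exists is
 * dropped with drop_one_dir_item() and, for DIR_INDEX keys, replaced
 * via insert_one_name(); a name whose inode is missing is skipped.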
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret;

	dir = read_one_inode(root, key->objectid);
	BUG_ON(!dir);

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(root, path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		BUG();
	}
	if (!dst_di || IS_ERR(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	BUG_ON(ret);

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(root, path);
	kfree(name);
	iput(dir);
	return 0;

insert:
	btrfs_release_path(root, path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);

	if (ret && ret != -ENOENT)
		BUG();
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.
 * Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		BUG_ON(ret);
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log,
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(root, path);
	return ret;
}

/*
 * this looks for a given directory item in the log.
 * If the directory item is not in the log, the item is removed
 * and the inode it points to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				   name_len);
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
						     log_path,
						     dir_key->objectid,
						     dir_key->offset,
						     name, name_len, 0);
		}
		if (!log_di || IS_ERR(log_di)) {
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(root, path);
			btrfs_release_path(log, log_path);
			inode = read_one_inode(root, location.objectid);
			BUG_ON(!inode);

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			BUG_ON(ret);
			btrfs_inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			BUG_ON(ret);
			kfree(name);
			iput(inode);

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		}
		btrfs_release_path(log, log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(root, path);
	btrfs_release_path(log, log_path);
	return ret;
}

/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that the log is authoritative
 * for, and then scans the directory to find items in those ranges
 * that are not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
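 *
 * Example (illustrative): if the log holds a dir_log item covering
 * index range [0, (u64)-1] for this directory, and the subvolume
 * still contains an index key in that range with no matching log
 * entry, that name was deleted before the fsync and is unlinked
 * here.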
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			if (ret != 0)
				break;
		}

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			BUG_ON(ret);
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(root, path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(root, path);
		goto again;
	}
out:
	btrfs_release_path(root, path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

/*
 * the process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
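 *
 * Concretely, replay_one_buffer() visits each leaf twice: with
 * wc->stage == LOG_WALK_REPLAY_INODES it only processes
 * BTRFS_INODE_ITEM_KEY items (creating the inodes), and with
 * LOG_WALK_REPLAY_ALL it copies the xattr, inode ref, extent and
 * directory items.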
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	u32 item_size;
	int level;
	int i;
	int ret;

	btrfs_read_buffer(eb, gen);

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		item_size = btrfs_item_size_nr(eb, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct inode *inode;
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				BUG_ON(ret);
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);

			/* for regular files, truncate away
			 * extents past the new EOF
			 */
			if (S_ISREG(mode)) {
				inode = read_one_inode(root,
						       key.objectid);
				BUG_ON(!inode);

				ret = btrfs_truncate_inode_items(wc->trans,
					root, inode, inode->i_size,
					BTRFS_EXTENT_DATA_KEY);
				BUG_ON(ret);

				/* if the nlink count is zero here, the iput
				 * will free the inode.  We bump it to make
				 * sure it doesn't get freed until the link
				 * count fixup is done
				 */
				if (inode->i_nlink == 0) {
					btrfs_inc_nlink(inode);
					btrfs_update_inode(wc->trans,
							   root, inode);
				}
				iput(inode);
			}
			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			BUG_ON(ret);
		}
		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_INODE_REF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			BUG_ON(ret && ret != -ENOENT);
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
			   key.type == BTRFS_DIR_INDEX_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			BUG_ON(ret);
		}
	}
	btrfs_free_path(path);
	return 0;
}

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_path *path, int *level,
				       struct walk_control *wc)
{
	u64 root_owner;
	u64 root_gen;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);
		root_gen = btrfs_header_generation(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);

		wc->process_func(root, next, wc, ptr_gen);

		if (*level == 1) {
			path->slots[*level]++;
			if (wc->free) {
				btrfs_read_buffer(next, ptr_gen);

				btrfs_tree_lock(next);
				clean_tree_block(trans, root, next);
				btrfs_set_lock_blocking(next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				ret = btrfs_drop_leaf_ref(trans, root, next);
				BUG_ON(ret);

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_reserved_extent(root,
							 bytenr, blocksize);
				BUG_ON(ret);
			}
			free_extent_buffer(next);
			continue;
		}
		btrfs_read_buffer(next, ptr_gen);

		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	if (path->nodes[*level] == root->node)
		parent = path->nodes[*level];
	else
		parent = path->nodes[*level + 1];

	bytenr = path->nodes[*level]->start;

	blocksize = btrfs_level_size(root, *level);
	root_owner = btrfs_header_owner(parent);
	root_gen = btrfs_header_generation(parent);

	wc->process_func(root, path->nodes[*level], wc,
			 btrfs_header_generation(path->nodes[*level]));

	if (wc->free) {
		next = path->nodes[*level];
		btrfs_tree_lock(next);
		clean_tree_block(trans, root, next);
		btrfs_set_lock_blocking(next);
		btrfs_wait_tree_block_writeback(next);
		btrfs_tree_unlock(next);

		if (*level == 0) {
			ret = btrfs_drop_leaf_ref(trans, root, next);
			BUG_ON(ret);
		}
		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
		BUG_ON(ret);
	}
	free_extent_buffer(path->nodes[*level]);
	path->nodes[*level] = NULL;
	*level += 1;

	cond_resched();
	return 0;
}

static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int *level,
				     struct walk_control *wc)
{
	u64 root_owner;
	u64 root_gen;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
			struct extent_buffer *node;
			node = path->nodes[i];
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			root_gen = btrfs_header_generation(parent);
			wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				btrfs_tree_lock(next);
				clean_tree_block(trans, root, next);
				btrfs_set_lock_blocking(next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				if (*level == 0) {
					ret = btrfs_drop_leaf_ref(trans, root,
								  next);
					BUG_ON(ret);
				}

				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				BUG_ON(ret);
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}

/*
 * drop the reference count on the tree rooted at 'snap'.  This traverses
 * the tree freeing any blocks that have a ref count of zero after being
 * decremented.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int i;
	int orig_level;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	extent_buffer_get(log->node);
	path->slots[level] = 0;

	while (1) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;

		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0)
			ret = wret;
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]));
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			btrfs_tree_lock(next);
			clean_tree_block(trans, log, next);
			btrfs_set_lock_blocking(next);
			btrfs_wait_tree_block_writeback(next);
			btrfs_tree_unlock(next);

			if (orig_level == 0) {
				ret = btrfs_drop_leaf_ref(trans, log,
							  next);
				BUG_ON(ret);
			}
			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_reserved_extent(log, next->start,
							 next->len);
			BUG_ON(ret);
		}
	}

	for (i = 0; i <= orig_level; i++) {
		if (path->nodes[i]) {
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolume's log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log)
{
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
					&log->root_key, &log->root_item);
	} else {
		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
					&log->root_key, &log->root_item);
	}
	return ret;
}

static int wait_log_commit(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, unsigned long transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	do {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);
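		/*
		 * drop log_mutex while we sleep so the committer can make
		 * progress; the exit conditions are rechecked under the
		 * lock after we wake up
		 */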
1947 mutex_unlock(&root->log_mutex); 1948 1949 if (root->fs_info->last_trans_log_full_commit != 1950 trans->transid && root->log_transid < transid + 2 && 1951 atomic_read(&root->log_commit[index])) 1952 schedule(); 1953 1954 finish_wait(&root->log_commit_wait[index], &wait); 1955 mutex_lock(&root->log_mutex); 1956 } while (root->log_transid < transid + 2 && 1957 atomic_read(&root->log_commit[index])); 1958 return 0; 1959 } 1960 1961 static int wait_for_writer(struct btrfs_trans_handle *trans, 1962 struct btrfs_root *root) 1963 { 1964 DEFINE_WAIT(wait); 1965 while (atomic_read(&root->log_writers)) { 1966 prepare_to_wait(&root->log_writer_wait, 1967 &wait, TASK_UNINTERRUPTIBLE); 1968 mutex_unlock(&root->log_mutex); 1969 if (root->fs_info->last_trans_log_full_commit != 1970 trans->transid && atomic_read(&root->log_writers)) 1971 schedule(); 1972 mutex_lock(&root->log_mutex); 1973 finish_wait(&root->log_writer_wait, &wait); 1974 } 1975 return 0; 1976 } 1977 1978 /* 1979 * btrfs_sync_log sends a given tree log down to the disk and 1980 * updates the super blocks to record it. When this call is done, 1981 * you know that any inodes previously logged are safely on disk only 1982 * if it returns 0. 1983 * 1984 * Any other return value means you need to call btrfs_commit_transaction. 1985 * Some of the edge cases for fsyncing directories that have had unlinks 1986 * or renames done in the past mean that sometimes the only safe 1987 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 1988 * that has happened. 1989 */ 1990 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1991 struct btrfs_root *root) 1992 { 1993 int index1; 1994 int index2; 1995 int ret; 1996 struct btrfs_root *log = root->log_root; 1997 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1998 1999 mutex_lock(&root->log_mutex); 2000 index1 = root->log_transid % 2; 2001 if (atomic_read(&root->log_commit[index1])) { 2002 wait_log_commit(trans, root, root->log_transid); 2003 mutex_unlock(&root->log_mutex); 2004 return 0; 2005 } 2006 atomic_set(&root->log_commit[index1], 1); 2007 2008 /* wait for previous tree log sync to complete */ 2009 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2010 wait_log_commit(trans, root, root->log_transid - 1); 2011 2012 while (1) { 2013 unsigned long batch = root->log_batch; 2014 mutex_unlock(&root->log_mutex); 2015 schedule_timeout_uninterruptible(1); 2016 mutex_lock(&root->log_mutex); 2017 2018 wait_for_writer(trans, root); 2019 if (batch == root->log_batch) 2020 break; 2021 } 2022 2023 /* bail out if we need to do a full commit */ 2024 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2025 ret = -EAGAIN; 2026 mutex_unlock(&root->log_mutex); 2027 goto out; 2028 } 2029 2030 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2031 BUG_ON(ret); 2032 2033 btrfs_set_root_bytenr(&log->root_item, log->node->start); 2034 btrfs_set_root_generation(&log->root_item, trans->transid); 2035 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); 2036 2037 root->log_batch = 0; 2038 root->log_transid++; 2039 log->log_transid = root->log_transid; 2040 smp_mb(); 2041 /* 2042 * log tree has been flushed to disk, new modifications of 2043 * the log will be written to new positions. so it's safe to 2044 * allow log writers to go in.
2045 */ 2046 mutex_unlock(&root->log_mutex); 2047 2048 mutex_lock(&log_root_tree->log_mutex); 2049 log_root_tree->log_batch++; 2050 atomic_inc(&log_root_tree->log_writers); 2051 mutex_unlock(&log_root_tree->log_mutex); 2052 2053 ret = update_log_root(trans, log); 2054 BUG_ON(ret); 2055 2056 mutex_lock(&log_root_tree->log_mutex); 2057 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2058 smp_mb(); 2059 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2060 wake_up(&log_root_tree->log_writer_wait); 2061 } 2062 2063 index2 = log_root_tree->log_transid % 2; 2064 if (atomic_read(&log_root_tree->log_commit[index2])) { 2065 wait_log_commit(trans, log_root_tree, 2066 log_root_tree->log_transid); 2067 mutex_unlock(&log_root_tree->log_mutex); 2068 goto out; 2069 } 2070 atomic_set(&log_root_tree->log_commit[index2], 1); 2071 2072 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2073 wait_log_commit(trans, log_root_tree, 2074 log_root_tree->log_transid - 1); 2075 } 2076 2077 wait_for_writer(trans, log_root_tree); 2078 2079 /* 2080 * now that we've moved on to the tree of log tree roots, 2081 * check the full commit flag again 2082 */ 2083 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2084 mutex_unlock(&log_root_tree->log_mutex); 2085 ret = -EAGAIN; 2086 goto out_wake_log_root; 2087 } 2088 2089 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2090 &log_root_tree->dirty_log_pages); 2091 BUG_ON(ret); 2092 2093 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2094 log_root_tree->node->start); 2095 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2096 btrfs_header_level(log_root_tree->node)); 2097 2098 log_root_tree->log_batch = 0; 2099 log_root_tree->log_transid++; 2100 smp_mb(); 2101 2102 mutex_unlock(&log_root_tree->log_mutex); 2103 2104 /* 2105 * nobody else is going to jump in and write the ctree 2106 * super here because the log_commit atomic below is protecting 2107 * us. We must be called with a transaction handle pinning 2108 * the running transaction open, so a full commit can't hop 2109 * in and cause problems either. 2110 */ 2111 write_ctree_super(trans, root->fs_info->tree_root, 2); 2112 ret = 0; 2113 2114 out_wake_log_root: 2115 atomic_set(&log_root_tree->log_commit[index2], 0); 2116 smp_mb(); 2117 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2118 wake_up(&log_root_tree->log_commit_wait[index2]); 2119 out: 2120 atomic_set(&root->log_commit[index1], 0); 2121 smp_mb(); 2122 if (waitqueue_active(&root->log_commit_wait[index1])) 2123 wake_up(&root->log_commit_wait[index1]); 2124 return ret; 2125 } 2126 2127 /* 2128 * free all the extents used by the tree log.
This should be called 2129 * at commit time of the full transaction 2130 */ 2131 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2132 { 2133 int ret; 2134 struct btrfs_root *log; 2136 u64 start; 2137 u64 end; 2138 struct walk_control wc = { 2139 .free = 1, 2140 .process_func = process_one_buffer 2141 }; 2142 2143 if (!root->log_root || root->fs_info->log_root_recovering) 2144 return 0; 2145 2146 log = root->log_root; 2147 ret = walk_log_tree(trans, log, &wc); 2148 BUG_ON(ret); 2149 2150 while (1) { 2151 ret = find_first_extent_bit(&log->dirty_log_pages, 2152 0, &start, &end, EXTENT_DIRTY); 2153 if (ret) 2154 break; 2155 2156 clear_extent_dirty(&log->dirty_log_pages, 2157 start, end, GFP_NOFS); 2158 } 2159 2160 if (log->log_transid > 0) { 2161 ret = btrfs_del_root(trans, root->fs_info->log_root_tree, 2162 &log->root_key); 2163 BUG_ON(ret); 2164 } 2165 root->log_root = NULL; 2166 free_extent_buffer(log->node); 2167 kfree(log); 2168 return 0; 2169 } 2170 2171 /* 2172 * If both a file and a directory are logged, and unlinks or renames are 2173 * mixed in, we have a few interesting corners: 2174 * 2175 * create file X in dir Y 2176 * link file X to X.link in dir Y 2177 * fsync file X 2178 * unlink file X but leave X.link 2179 * fsync dir Y 2180 * 2181 * After a crash we would expect only X.link to exist. But file X 2182 * didn't get fsync'd again so the log has back refs for X and X.link. 2183 * 2184 * We solve this by removing directory entries and inode backrefs from the 2185 * log when a file that was logged in the current transaction is 2186 * unlinked. Any later fsync will include the updated log entries, and 2187 * we'll be able to reconstruct the proper directory items from backrefs. 2188 * 2189 * This optimization allows us to avoid relogging the entire inode 2190 * or the entire directory.
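 *
 * As a rough usage sketch (hedged; the actual call sites live in the
 * unlink and rename paths outside this file), the unlink code is
 * expected to invoke the two helpers below, roughly:
 *
 *	btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 *				     dir, index);
 *	btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 *				   inode, dir->i_ino);
 *
 * so that a later fsync of either the directory or the inode replays
 * a log without the stale name.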
2191 */ 2192 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2193 struct btrfs_root *root, 2194 const char *name, int name_len, 2195 struct inode *dir, u64 index) 2196 { 2197 struct btrfs_root *log; 2198 struct btrfs_dir_item *di; 2199 struct btrfs_path *path; 2200 int ret; 2201 int bytes_del = 0; 2202 2203 if (BTRFS_I(dir)->logged_trans < trans->transid) 2204 return 0; 2205 2206 ret = join_running_log_trans(root); 2207 if (ret) 2208 return 0; 2209 2210 mutex_lock(&BTRFS_I(dir)->log_mutex); 2211 2212 log = root->log_root; 2213 path = btrfs_alloc_path(); 2214 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2215 name, name_len, -1); 2216 if (di && !IS_ERR(di)) { 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2218 bytes_del += name_len; 2219 BUG_ON(ret); 2220 } 2221 btrfs_release_path(log, path); 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2223 index, name, name_len, -1); 2224 if (di && !IS_ERR(di)) { 2225 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2226 bytes_del += name_len; 2227 BUG_ON(ret); 2228 } 2229 2230 /* update the directory size in the log to reflect the names 2231 * we have removed 2232 */ 2233 if (bytes_del) { 2234 struct btrfs_key key; 2235 2236 key.objectid = dir->i_ino; 2237 key.offset = 0; 2238 key.type = BTRFS_INODE_ITEM_KEY; 2239 btrfs_release_path(log, path); 2240 2241 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2242 if (ret == 0) { 2243 struct btrfs_inode_item *item; 2244 u64 i_size; 2245 2246 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2247 struct btrfs_inode_item); 2248 i_size = btrfs_inode_size(path->nodes[0], item); 2249 if (i_size > bytes_del) 2250 i_size -= bytes_del; 2251 else 2252 i_size = 0; 2253 btrfs_set_inode_size(path->nodes[0], item, i_size); 2254 btrfs_mark_buffer_dirty(path->nodes[0]); 2255 } else 2256 ret = 0; 2257 btrfs_release_path(log, path); 2258 } 2259 2260 btrfs_free_path(path); 2261 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2262 btrfs_end_log_trans(root); 2263 2264 return 0; 2265 } 2266 2267 /* see comments for btrfs_del_dir_entries_in_log */ 2268 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2269 struct btrfs_root *root, 2270 const char *name, int name_len, 2271 struct inode *inode, u64 dirid) 2272 { 2273 struct btrfs_root *log; 2274 u64 index; 2275 int ret; 2276 2277 if (BTRFS_I(inode)->logged_trans < trans->transid) 2278 return 0; 2279 2280 ret = join_running_log_trans(root); 2281 if (ret) 2282 return 0; 2283 log = root->log_root; 2284 mutex_lock(&BTRFS_I(inode)->log_mutex); 2285 2286 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2287 dirid, &index); 2288 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2289 btrfs_end_log_trans(root); 2290 2291 return ret; 2292 } 2293 2294 /* 2295 * creates a range item in the log for 'dirid'. first_offset and 2296 * last_offset tell us which parts of the key space the log should 2297 * be considered authoritative for. 
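 *
 * An illustrative example (numbers made up): a dir log item with
 * first_offset 8 and last_offset 20 says the log is authoritative for
 * offsets 8 through 20 of 'dirid'. If replay finds an index key at
 * offset 11 in the subvolume but not in the log, that entry must have
 * been unlinked during the transaction and is deleted; an entry at
 * offset 25 is outside the range and is left alone.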
2298 */ 2299 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2300 struct btrfs_root *log, 2301 struct btrfs_path *path, 2302 int key_type, u64 dirid, 2303 u64 first_offset, u64 last_offset) 2304 { 2305 int ret; 2306 struct btrfs_key key; 2307 struct btrfs_dir_log_item *item; 2308 2309 key.objectid = dirid; 2310 key.offset = first_offset; 2311 if (key_type == BTRFS_DIR_ITEM_KEY) 2312 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2313 else 2314 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2315 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2316 BUG_ON(ret); 2317 2318 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2319 struct btrfs_dir_log_item); 2320 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2321 btrfs_mark_buffer_dirty(path->nodes[0]); 2322 btrfs_release_path(log, path); 2323 return 0; 2324 } 2325 2326 /* 2327 * log all the items included in the current transaction for a given 2328 * directory. This also creates the range items in the log tree required 2329 * to replay anything deleted before the fsync 2330 */ 2331 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2332 struct btrfs_root *root, struct inode *inode, 2333 struct btrfs_path *path, 2334 struct btrfs_path *dst_path, int key_type, 2335 u64 min_offset, u64 *last_offset_ret) 2336 { 2337 struct btrfs_key min_key; 2338 struct btrfs_key max_key; 2339 struct btrfs_root *log = root->log_root; 2340 struct extent_buffer *src; 2341 int ret; 2342 int i; 2343 int nritems; 2344 u64 first_offset = min_offset; 2345 u64 last_offset = (u64)-1; 2346 2347 log = root->log_root; 2348 max_key.objectid = inode->i_ino; 2349 max_key.offset = (u64)-1; 2350 max_key.type = key_type; 2351 2352 min_key.objectid = inode->i_ino; 2353 min_key.type = key_type; 2354 min_key.offset = min_offset; 2355 2356 path->keep_locks = 1; 2357 2358 ret = btrfs_search_forward(root, &min_key, &max_key, 2359 path, 0, trans->transid); 2360 2361 /* 2362 * we didn't find anything from this transaction, see if there 2363 * is anything at all 2364 */ 2365 if (ret != 0 || min_key.objectid != inode->i_ino || 2366 min_key.type != key_type) { 2367 min_key.objectid = inode->i_ino; 2368 min_key.type = key_type; 2369 min_key.offset = (u64)-1; 2370 btrfs_release_path(root, path); 2371 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2372 if (ret < 0) { 2373 btrfs_release_path(root, path); 2374 return ret; 2375 } 2376 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2377 2378 /* if ret == 0 there are items for this type, 2379 * create a range to tell us the last key of this type. 2380 * otherwise, there are no items in this directory after 2381 * *min_offset, and we create a range to indicate that. 
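 *
 * a hedged worked example: if min_offset is 100 and the newest
 * existing key of this type sits at offset 42, first_offset becomes
 * max(100, 42) + 1 == 101, and the range [101, (u64)-1] inserted
 * below tells replay that no entries at or after offset 101
 * survived this transaction.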
2382 */ 2383 if (ret == 0) { 2384 struct btrfs_key tmp; 2385 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2386 path->slots[0]); 2387 if (key_type == tmp.type) 2388 first_offset = max(min_offset, tmp.offset) + 1; 2389 } 2390 goto done; 2391 } 2392 2393 /* go backward to find any previous key */ 2394 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2395 if (ret == 0) { 2396 struct btrfs_key tmp; 2397 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2398 if (key_type == tmp.type) { 2399 first_offset = tmp.offset; 2400 ret = overwrite_item(trans, log, dst_path, 2401 path->nodes[0], path->slots[0], 2402 &tmp); 2403 } 2404 } 2405 btrfs_release_path(root, path); 2406 2407 /* find the first key from this transaction again */ 2408 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2409 if (ret != 0) { 2410 WARN_ON(1); 2411 goto done; 2412 } 2413 2414 /* 2415 * we have a block from this transaction, log every item in it 2416 * from our directory 2417 */ 2418 while (1) { 2419 struct btrfs_key tmp; 2420 src = path->nodes[0]; 2421 nritems = btrfs_header_nritems(src); 2422 for (i = path->slots[0]; i < nritems; i++) { 2423 btrfs_item_key_to_cpu(src, &min_key, i); 2424 2425 if (min_key.objectid != inode->i_ino || 2426 min_key.type != key_type) 2427 goto done; 2428 ret = overwrite_item(trans, log, dst_path, src, i, 2429 &min_key); 2430 BUG_ON(ret); 2431 } 2432 path->slots[0] = nritems; 2433 2434 /* 2435 * look ahead to the next item and see if it is also 2436 * from this directory and from this transaction 2437 */ 2438 ret = btrfs_next_leaf(root, path); 2439 if (ret == 1) { 2440 last_offset = (u64)-1; 2441 goto done; 2442 } 2443 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2444 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2445 last_offset = (u64)-1; 2446 goto done; 2447 } 2448 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2449 ret = overwrite_item(trans, log, dst_path, 2450 path->nodes[0], path->slots[0], 2451 &tmp); 2452 2453 BUG_ON(ret); 2454 last_offset = tmp.offset; 2455 goto done; 2456 } 2457 } 2458 done: 2459 *last_offset_ret = last_offset; 2460 btrfs_release_path(root, path); 2461 btrfs_release_path(log, dst_path); 2462 2463 /* insert the log range keys to indicate where the log is valid */ 2464 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2465 first_offset, last_offset); 2466 BUG_ON(ret); 2467 return 0; 2468 } 2469 2470 /* 2471 * logging directories is very similar to logging inodes. We find all the items 2472 * from the current transaction and write them to the log. 2473 * 2474 * The recovery code scans the directory in the subvolume, and if it finds a 2475 * key in the range logged that is not present in the log tree, then it means 2476 * that dir entry was unlinked during the transaction. 2477 * 2478 * In order for that scan to work, we must include one key smaller than 2479 * the smallest key logged by this transaction and one key larger than the largest 2480 * key logged by this transaction.
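 *
 * As a hedged example: if this transaction created dir index keys
 * 10 through 20, log_dir_items() also copies the newest pre-existing
 * key below 10 (say, 7) and the first pre-existing key above 20
 * (say, 25), and records the range [7, 25] as authoritative. The
 * replay scan can then bound its deletions on both sides: keys 7 and
 * 25 are in the log and survive, while anything between them that is
 * missing from the log gets removed.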
2481 */ 2482 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2483 struct btrfs_root *root, struct inode *inode, 2484 struct btrfs_path *path, 2485 struct btrfs_path *dst_path) 2486 { 2487 u64 min_key; 2488 u64 max_key; 2489 int ret; 2490 int key_type = BTRFS_DIR_ITEM_KEY; 2491 2492 again: 2493 min_key = 0; 2494 max_key = 0; 2495 while (1) { 2496 ret = log_dir_items(trans, root, inode, path, 2497 dst_path, key_type, min_key, 2498 &max_key); 2499 BUG_ON(ret); 2500 if (max_key == (u64)-1) 2501 break; 2502 min_key = max_key + 1; 2503 } 2504 2505 if (key_type == BTRFS_DIR_ITEM_KEY) { 2506 key_type = BTRFS_DIR_INDEX_KEY; 2507 goto again; 2508 } 2509 return 0; 2510 } 2511 2512 /* 2513 * a helper function to drop items from the log before we relog an 2514 * inode. max_key_type indicates the highest item type to remove. 2515 * This cannot be run for file data extents because it does not 2516 * free the extents they point to. 2517 */ 2518 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2519 struct btrfs_root *log, 2520 struct btrfs_path *path, 2521 u64 objectid, int max_key_type) 2522 { 2523 int ret; 2524 struct btrfs_key key; 2525 struct btrfs_key found_key; 2526 2527 key.objectid = objectid; 2528 key.type = max_key_type; 2529 key.offset = (u64)-1; 2530 2531 while (1) { 2532 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2533 2534 if (ret != 1) 2535 break; 2536 2537 if (path->slots[0] == 0) 2538 break; 2539 2540 path->slots[0]--; 2541 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2542 path->slots[0]); 2543 2544 if (found_key.objectid != objectid) 2545 break; 2546 2547 ret = btrfs_del_item(trans, log, path); 2548 BUG_ON(ret); 2549 btrfs_release_path(log, path); 2550 } 2551 btrfs_release_path(log, path); 2552 return 0; 2553 } 2554 2555 static noinline int copy_items(struct btrfs_trans_handle *trans, 2556 struct btrfs_root *log, 2557 struct btrfs_path *dst_path, 2558 struct extent_buffer *src, 2559 int start_slot, int nr, int inode_only) 2560 { 2561 unsigned long src_offset; 2562 unsigned long dst_offset; 2563 struct btrfs_file_extent_item *extent; 2564 struct btrfs_inode_item *inode_item; 2565 int ret; 2566 struct btrfs_key *ins_keys; 2567 u32 *ins_sizes; 2568 char *ins_data; 2569 int i; 2570 struct list_head ordered_sums; 2571 2572 INIT_LIST_HEAD(&ordered_sums); 2573 2574 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2575 nr * sizeof(u32), GFP_NOFS); 2576 ins_sizes = (u32 *)ins_data; 2577 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2578 2579 for (i = 0; i < nr; i++) { 2580 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2581 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2582 } 2583 ret = btrfs_insert_empty_items(trans, log, dst_path, 2584 ins_keys, ins_sizes, nr); 2585 BUG_ON(ret); 2586 2587 for (i = 0; i < nr; i++) { 2588 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2589 dst_path->slots[0]); 2590 2591 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2592 2593 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2594 src_offset, ins_sizes[i]); 2595 2596 if (inode_only == LOG_INODE_EXISTS && 2597 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2598 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2599 dst_path->slots[0], 2600 struct btrfs_inode_item); 2601 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2602 2603 /* set the generation to zero so the recovery code 2604 * can tell the difference between logging 2605 * just to say 'this inode exists' and logging 2606 * to say
'update this inode with these values' 2607 */ 2608 btrfs_set_inode_generation(dst_path->nodes[0], 2609 inode_item, 0); 2610 } 2611 /* take a reference on file data extents so that truncates 2612 * or deletes of this inode don't have to relog the inode 2613 * again 2614 */ 2615 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { 2616 int found_type; 2617 extent = btrfs_item_ptr(src, start_slot + i, 2618 struct btrfs_file_extent_item); 2619 2620 found_type = btrfs_file_extent_type(src, extent); 2621 if (found_type == BTRFS_FILE_EXTENT_REG || 2622 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2623 u64 ds = btrfs_file_extent_disk_bytenr(src, 2624 extent); 2625 u64 dl = btrfs_file_extent_disk_num_bytes(src, 2626 extent); 2627 u64 cs = btrfs_file_extent_offset(src, extent); 2628 u64 cl = btrfs_file_extent_num_bytes(src, 2629 extent); 2630 if (btrfs_file_extent_compression(src, 2631 extent)) { 2632 cs = 0; 2633 cl = dl; 2634 } 2635 /* ds == 0 is a hole */ 2636 if (ds != 0) { 2637 ret = btrfs_inc_extent_ref(trans, log, 2638 ds, dl, 2639 dst_path->nodes[0]->start, 2640 BTRFS_TREE_LOG_OBJECTID, 2641 trans->transid, 2642 ins_keys[i].objectid); 2643 BUG_ON(ret); 2644 ret = btrfs_lookup_csums_range( 2645 log->fs_info->csum_root, 2646 ds + cs, ds + cs + cl - 1, 2647 &ordered_sums); 2648 BUG_ON(ret); 2649 } 2650 } 2651 } 2652 dst_path->slots[0]++; 2653 } 2654 2655 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2656 btrfs_release_path(log, dst_path); 2657 kfree(ins_data); 2658 2659 /* 2660 * we have to do this after the loop above to avoid modifying the 2661 * log tree while we are still copying items into it. 2662 */ 2663 while (!list_empty(&ordered_sums)) { 2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2665 struct btrfs_ordered_sum, 2666 list); 2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2668 BUG_ON(ret); 2669 list_del(&sums->list); 2670 kfree(sums); 2671 } 2672 return 0; 2673 } 2674 2675 /* log a single inode in the tree log. 2676 * At least one parent directory for this inode must exist in the tree 2677 * or be logged already. 2678 * 2679 * Any items from this inode changed by the current transaction are copied 2680 * to the log tree. An extra reference is taken on any extents in this 2681 * file, allowing us to avoid a whole pile of corner cases around logging 2682 * blocks that have been removed from the tree. 2683 * 2684 * See LOG_INODE_ALL and related defines for a description of what inode_only 2685 * does. 2686 * 2687 * This handles both files and directories.
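 *
 * In outline, the body below does (a sketch, not additional behavior):
 *
 *	drop the old items for this inode from the log (for regular
 *	files this is a truncate of the log copy)
 *	btrfs_search_forward() for items changed in this transaction
 *	batch up runs of adjacent slots and copy_items() them into
 *	the log tree
 *	for directories logged with LOG_INODE_ALL, run
 *	log_directory_changes()
 *	record trans->transid in BTRFS_I(inode)->logged_trans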
2688 */ 2689 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2690 struct btrfs_root *root, struct inode *inode, 2691 int inode_only) 2692 { 2693 struct btrfs_path *path; 2694 struct btrfs_path *dst_path; 2695 struct btrfs_key min_key; 2696 struct btrfs_key max_key; 2697 struct btrfs_root *log = root->log_root; 2698 struct extent_buffer *src = NULL; 2699 u32 size; 2700 int ret; 2701 int nritems; 2702 int ins_start_slot = 0; 2703 int ins_nr; 2704 2705 log = root->log_root; 2706 2707 path = btrfs_alloc_path(); 2708 dst_path = btrfs_alloc_path(); 2709 2710 min_key.objectid = inode->i_ino; 2711 min_key.type = BTRFS_INODE_ITEM_KEY; 2712 min_key.offset = 0; 2713 2714 max_key.objectid = inode->i_ino; 2715 2716 /* today the code can only do partial logging of directories */ 2717 if (!S_ISDIR(inode->i_mode)) 2718 inode_only = LOG_INODE_ALL; 2719 2720 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2721 max_key.type = BTRFS_XATTR_ITEM_KEY; 2722 else 2723 max_key.type = (u8)-1; 2724 max_key.offset = (u64)-1; 2725 2726 mutex_lock(&BTRFS_I(inode)->log_mutex); 2727 2728 /* 2729 * a brute force approach to making sure we get the most uptodate 2730 * copies of everything. 2731 */ 2732 if (S_ISDIR(inode->i_mode)) { 2733 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2734 2735 if (inode_only == LOG_INODE_EXISTS) 2736 max_key_type = BTRFS_XATTR_ITEM_KEY; 2737 ret = drop_objectid_items(trans, log, path, 2738 inode->i_ino, max_key_type); 2739 } else { 2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2741 } 2742 BUG_ON(ret); 2743 path->keep_locks = 1; 2744 2745 while (1) { 2746 ins_nr = 0; 2747 ret = btrfs_search_forward(root, &min_key, &max_key, 2748 path, 0, trans->transid); 2749 if (ret != 0) 2750 break; 2751 again: 2752 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2753 if (min_key.objectid != inode->i_ino) 2754 break; 2755 if (min_key.type > max_key.type) 2756 break; 2757 2758 src = path->nodes[0]; 2759 size = btrfs_item_size_nr(src, path->slots[0]); 2760 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2761 ins_nr++; 2762 goto next_slot; 2763 } else if (!ins_nr) { 2764 ins_start_slot = path->slots[0]; 2765 ins_nr = 1; 2766 goto next_slot; 2767 } 2768 2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2770 ins_nr, inode_only); 2771 BUG_ON(ret); 2772 ins_nr = 1; 2773 ins_start_slot = path->slots[0]; 2774 next_slot: 2775 2776 nritems = btrfs_header_nritems(path->nodes[0]); 2777 path->slots[0]++; 2778 if (path->slots[0] < nritems) { 2779 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2780 path->slots[0]); 2781 goto again; 2782 } 2783 if (ins_nr) { 2784 ret = copy_items(trans, log, dst_path, src, 2785 ins_start_slot, 2786 ins_nr, inode_only); 2787 BUG_ON(ret); 2788 ins_nr = 0; 2789 } 2790 btrfs_release_path(root, path); 2791 2792 if (min_key.offset < (u64)-1) 2793 min_key.offset++; 2794 else if (min_key.type < (u8)-1) 2795 min_key.type++; 2796 else if (min_key.objectid < (u64)-1) 2797 min_key.objectid++; 2798 else 2799 break; 2800 } 2801 if (ins_nr) { 2802 ret = copy_items(trans, log, dst_path, src, 2803 ins_start_slot, 2804 ins_nr, inode_only); 2805 BUG_ON(ret); 2806 ins_nr = 0; 2807 } 2808 WARN_ON(ins_nr); 2809 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2810 btrfs_release_path(root, path); 2811 btrfs_release_path(log, dst_path); 2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2813 BUG_ON(ret); 2814 } 2815 BTRFS_I(inode)->logged_trans = trans->transid; 2816 
mutex_unlock(&BTRFS_I(inode)->log_mutex); 2817 2818 btrfs_free_path(path); 2819 btrfs_free_path(dst_path); 2820 return 0; 2821 } 2822 2823 /* 2824 * follow the dentry parent pointers up the chain and see if any 2825 * of the directories in it require a full commit before they can 2826 * be logged. Returns zero if nothing special needs to be done or 1 if 2827 * a full commit is required. 2828 */ 2829 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 2830 struct inode *inode, 2831 struct dentry *parent, 2832 struct super_block *sb, 2833 u64 last_committed) 2834 { 2835 int ret = 0; 2836 struct btrfs_root *root; 2837 2838 /* 2839 * for a regular file, if its inode is already on disk, we don't 2840 * have to worry about the parents at all. This is because 2841 * we can use the last_unlink_trans field to record renames 2842 * and other fun in this file. 2843 */ 2844 if (S_ISREG(inode->i_mode) && 2845 BTRFS_I(inode)->generation <= last_committed && 2846 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2847 goto out; 2848 2849 if (!S_ISDIR(inode->i_mode)) { 2850 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2851 goto out; 2852 inode = parent->d_inode; 2853 } 2854 2855 while (1) { 2856 BTRFS_I(inode)->logged_trans = trans->transid; 2857 smp_mb(); 2858 2859 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 2860 root = BTRFS_I(inode)->root; 2861 2862 /* 2863 * make sure any commits to the log are forced 2864 * to be full commits 2865 */ 2866 root->fs_info->last_trans_log_full_commit = 2867 trans->transid; 2868 ret = 1; 2869 break; 2870 } 2871 2872 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2873 break; 2874 2875 if (parent == sb->s_root) 2876 break; 2877 2878 parent = parent->d_parent; 2879 inode = parent->d_inode; 2880 2881 } 2882 out: 2883 return ret; 2884 } 2885 2886 /* 2887 * helper function around btrfs_log_inode to make sure newly created 2888 * parent directories also end up in the log. A minimal, inode-and-backref 2889 * only log is written for any parent directory that is newer than 2890 * the last committed transaction 2891 */ 2892 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 2893 struct btrfs_root *root, struct inode *inode, 2894 struct dentry *parent, int exists_only) 2895 { 2896 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2897 struct super_block *sb; 2898 int ret = 0; 2899 u64 last_committed = root->fs_info->last_trans_committed; 2900 2901 sb = inode->i_sb; 2902 2903 if (root->fs_info->last_trans_log_full_commit > 2904 root->fs_info->last_trans_committed) { 2905 ret = 1; 2906 goto end_no_trans; 2907 } 2908 2909 ret = check_parent_dirs_for_sync(trans, inode, parent, 2910 sb, last_committed); 2911 if (ret) 2912 goto end_no_trans; 2913 2914 start_log_trans(trans, root); 2915 2916 ret = btrfs_log_inode(trans, root, inode, inode_only); 2917 BUG_ON(ret); 2918 2919 /* 2920 * for a regular file, if its inode is already on disk, we don't 2921 * have to worry about the parents at all. This is because 2922 * we can use the last_unlink_trans field to record renames 2923 * and other fun in this file.
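 *
 * e.g. (an illustrative case) a file created and committed in an
 * earlier transaction, then only written to in this one, keeps both
 * generation and last_unlink_trans at or below last_committed, so
 * the parent walk below is skipped entirely.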
2924 */ 2925 if (S_ISREG(inode->i_mode) && 2926 BTRFS_I(inode)->generation <= last_committed && 2927 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2928 goto no_parent; 2929 2930 inode_only = LOG_INODE_EXISTS; 2931 while (1) { 2932 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2933 break; 2934 2935 inode = parent->d_inode; 2936 if (BTRFS_I(inode)->generation > 2937 root->fs_info->last_trans_committed) { 2938 ret = btrfs_log_inode(trans, root, inode, inode_only); 2939 BUG_ON(ret); 2940 } 2941 if (parent == sb->s_root) 2942 break; 2943 2944 parent = parent->d_parent; 2945 } 2946 no_parent: 2947 ret = 0; 2948 btrfs_end_log_trans(root); 2949 end_no_trans: 2950 return ret; 2951 } 2952 2953 /* 2954 * it is not safe to log a dentry if the chunk root has added new 2955 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 2956 * If this returns 1, you must commit the transaction to safely get your 2957 * data on disk. 2958 */ 2959 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 2960 struct btrfs_root *root, struct dentry *dentry) 2961 { 2962 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 2963 dentry->d_parent, 0); 2964 } 2965 2966 /* 2967 * should be called during mount to recover and replay any log trees 2968 * from the FS 2969 */ 2970 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 2971 { 2972 int ret; 2973 struct btrfs_path *path; 2974 struct btrfs_trans_handle *trans; 2975 struct btrfs_key key; 2976 struct btrfs_key found_key; 2977 struct btrfs_key tmp_key; 2978 struct btrfs_root *log; 2979 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 2980 u64 highest_inode; 2981 struct walk_control wc = { 2982 .process_func = process_one_buffer, 2983 .stage = 0, 2984 }; 2985 2986 fs_info->log_root_recovering = 1; 2987 path = btrfs_alloc_path(); 2988 BUG_ON(!path); 2989 2990 trans = btrfs_start_transaction(fs_info->tree_root, 1); 2991 2992 wc.trans = trans; 2993 wc.pin = 1; 2994 2995 walk_log_tree(trans, log_root_tree, &wc); 2996 2997 again: 2998 key.objectid = BTRFS_TREE_LOG_OBJECTID; 2999 key.offset = (u64)-1; 3000 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 3001 3002 while (1) { 3003 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 3004 if (ret < 0) 3005 break; 3006 if (ret > 0) { 3007 if (path->slots[0] == 0) 3008 break; 3009 path->slots[0]--; 3010 } 3011 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3012 path->slots[0]); 3013 btrfs_release_path(log_root_tree, path); 3014 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3015 break; 3016 3017 log = btrfs_read_fs_root_no_radix(log_root_tree, 3018 &found_key); 3019 BUG_ON(!log); 3020 3021 3022 tmp_key.objectid = found_key.offset; 3023 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3024 tmp_key.offset = (u64)-1; 3025 3026 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3027 BUG_ON(!wc.replay_dest); 3028 3029 wc.replay_dest->log_root = log; 3030 mutex_lock(&fs_info->trans_mutex); 3031 btrfs_record_root_in_trans(wc.replay_dest); 3032 mutex_unlock(&fs_info->trans_mutex); 3033 ret = walk_log_tree(trans, log, &wc); 3034 BUG_ON(ret); 3035 3036 if (wc.stage == LOG_WALK_REPLAY_ALL) { 3037 ret = fixup_inode_link_counts(trans, wc.replay_dest, 3038 path); 3039 BUG_ON(ret); 3040 } 3041 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); 3042 if (ret == 0) { 3043 wc.replay_dest->highest_inode = highest_inode; 3044 wc.replay_dest->last_inode_alloc = highest_inode; 3045 } 3046 3047 key.offset = found_key.offset - 1; 3048 wc.replay_dest->log_root =
NULL; 3049 free_extent_buffer(log->node); 3050 kfree(log); 3051 3052 if (found_key.offset == 0) 3053 break; 3054 } 3055 btrfs_release_path(log_root_tree, path); 3056 3057 /* step one is to pin it all, step two is to replay just inodes */ 3058 if (wc.pin) { 3059 wc.pin = 0; 3060 wc.process_func = replay_one_buffer; 3061 wc.stage = LOG_WALK_REPLAY_INODES; 3062 goto again; 3063 } 3064 /* step three is to replay everything */ 3065 if (wc.stage < LOG_WALK_REPLAY_ALL) { 3066 wc.stage++; 3067 goto again; 3068 } 3069 3070 btrfs_free_path(path); 3071 3072 free_extent_buffer(log_root_tree->node); 3073 log_root_tree->log_root = NULL; 3074 fs_info->log_root_recovering = 0; 3075 3076 /* step 4: commit the transaction, which also unpins the blocks */ 3077 btrfs_commit_transaction(trans, fs_info->tree_root); 3078 3079 kfree(log_root_tree); 3080 return 0; 3081 } 3082 3083 /* 3084 * there are some corner cases where we want to force a full 3085 * commit instead of allowing a directory to be logged. 3086 * 3087 * They revolve around files that were unlinked from the directory, and 3088 * this function updates the parent directory so that a full commit is 3089 * properly done if it is fsync'd later after the unlinks are done. 3090 */ 3091 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 3092 struct inode *dir, struct inode *inode, 3093 int for_rename) 3094 { 3095 /* 3096 * when we're logging a file, if it hasn't been renamed 3097 * or unlinked, and its inode is fully committed on disk, 3098 * we don't have to worry about walking up the directory chain 3099 * to log its parents. 3100 * 3101 * So, we use the last_unlink_trans field to put this transid 3102 * into the file. When the file is logged we check it and 3103 * don't log the parents if the file is fully on disk. 3104 */ 3105 if (S_ISREG(inode->i_mode)) 3106 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3107 3108 /* 3109 * if this directory was already logged, any new 3110 * names for this file/dir will get recorded 3111 */ 3112 smp_mb(); 3113 if (BTRFS_I(dir)->logged_trans == trans->transid) 3114 return; 3115 3116 /* 3117 * if the inode we're about to unlink was logged, 3118 * the log will be properly updated for any new names 3119 */ 3120 if (BTRFS_I(inode)->logged_trans == trans->transid) 3121 return; 3122 3123 /* 3124 * when renaming files across directories, if the directory 3125 * we're unlinking from gets fsync'd later on, there's 3126 * no way to find the destination directory later and fsync it 3127 * properly. So, we have to be conservative and force commits 3128 * so the new name gets discovered. 3129 */ 3130 if (for_rename) 3131 goto record; 3132 3133 /* we can safely do the unlink without any special recording */ 3134 return; 3135 3136 record: 3137 BTRFS_I(dir)->last_unlink_trans = trans->transid; 3138 } 3139 3140 /* 3141 * Call this after adding a new name for a file and it will properly 3142 * update the log to reflect the new name. 3143 * 3144 * It will return zero if all goes well, and it will return 1 if a 3145 * full transaction commit is required.
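 *
 * A hedged usage sketch (the exact call site lives in the rename path
 * outside this file; old_inode and new_dentry are hypothetical
 * caller-side names): once the new name is in place,
 *
 *	ret = btrfs_log_new_name(trans, old_inode, old_dir,
 *				 new_dentry->d_parent);
 *	if (ret == 1)
 *		(the caller falls back to a full transaction commit)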
3146 */ 3147 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 3148 struct inode *inode, struct inode *old_dir, 3149 struct dentry *parent) 3150 { 3151 struct btrfs_root *root = BTRFS_I(inode)->root; 3152 3153 /* 3154 * this will force the logging code to walk the dentry chain 3155 * up for the file 3156 */ 3157 if (S_ISREG(inode->i_mode)) 3158 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3159 3160 /* 3161 * if this inode hasn't been logged and the directory we're renaming 3162 * it from hasn't been logged either, we don't need to log it 3163 */ 3164 if (BTRFS_I(inode)->logged_trans <= 3165 root->fs_info->last_trans_committed && 3166 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 3167 root->fs_info->last_trans_committed)) 3168 return 0; 3169 3170 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 3171 } 3172 3173