1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 02111-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include <linux/slab.h> 21 #include "ctree.h" 22 #include "transaction.h" 23 #include "disk-io.h" 24 #include "locking.h" 25 #include "print-tree.h" 26 #include "compat.h" 27 #include "tree-log.h" 28 29 /* magic values for the inode_only field in btrfs_log_inode: 30 * 31 * LOG_INODE_ALL means to log everything 32 * LOG_INODE_EXISTS means to log just enough to recreate the inode 33 * during log replay 34 */ 35 #define LOG_INODE_ALL 0 36 #define LOG_INODE_EXISTS 1 37 38 /* 39 * directory trouble cases 40 * 41 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 42 * log, we must force a full commit before doing an fsync of the directory 43 * where the unlink was done. 44 * ---> record transid of last unlink/rename per directory 45 * 46 * mkdir foo/some_dir 47 * normal commit 48 * rename foo/some_dir foo2/some_dir 49 * mkdir foo/some_dir 50 * fsync foo/some_dir/some_file 51 * 52 * The fsync above will unlink the original some_dir without recording 53 * it in its new location (foo2). After a crash, some_dir will be gone 54 * unless the fsync of some_file forces a full commit 55 * 56 * 2) we must log any new names for any file or dir that is in the fsync 57 * log. ---> check inode while renaming/linking. 58 * 59 * 2a) we must log any new names for any file or dir during rename 60 * when the directory they are being removed from was logged. 61 * ---> check inode and old parent dir during rename 62 * 63 * 2a is actually the more important variant. Without the extra logging 64 * a crash might unlink the old name without recreating the new one 65 * 66 * 3) after a crash, we must go through any directories with a link count 67 * of zero and redo the rm -rf 68 * 69 * mkdir f1/foo 70 * normal commit 71 * rm -rf f1/foo 72 * fsync(f1) 73 * 74 * The directory f1 was fully removed from the FS, but fsync was never 75 * called on f1, only its parent dir. After a crash the rm -rf must 76 * be replayed. This must be able to recurse down the entire 77 * directory tree. The inode link count fixup code takes care of the 78 * ugly details. 79 */ 80 81 /* 82 * stages for the tree walking. The first 83 * stage (0) is to only pin down the blocks we find. 84 * The second stage (1) is to make sure that all the inodes 85 * we find in the log are created in the subvolume.
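* Creating every inode before any names or extents are replayed means the later stages never have to deal with references to inodes that do not exist yet.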
86 * 87 * The last stage is to deal with directories and links and extents 88 * and all the other fun semantics 89 */ 90 #define LOG_WALK_PIN_ONLY 0 91 #define LOG_WALK_REPLAY_INODES 1 92 #define LOG_WALK_REPLAY_ALL 2 93 94 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 95 struct btrfs_root *root, struct inode *inode, 96 int inode_only); 97 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, 99 struct btrfs_path *path, u64 objectid); 100 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 101 struct btrfs_root *root, 102 struct btrfs_root *log, 103 struct btrfs_path *path, 104 u64 dirid, int del_all); 105 106 /* 107 * tree logging is a special write ahead log used to make sure that 108 * fsyncs and O_SYNCs can happen without doing full tree commits. 109 * 110 * Full tree commits are expensive because they require commonly 111 * modified blocks to be recowed, creating many dirty pages in the 112 * extent tree and a 4x-6x higher write load than ext3. 113 * 114 * Instead of doing a tree commit on every fsync, we use the 115 * key ranges and transaction ids to find items for a given file or directory 116 * that have changed in this transaction. Those items are copied into 117 * a special tree (one per subvolume root), that tree is written to disk 118 * and then the fsync is considered complete. 119 * 120 * After a crash, items are copied out of the log-tree back into the 121 * subvolume tree. Any file data extents found are recorded in the extent 122 * allocation tree, and the log-tree freed. 123 * 124 * The log tree is read three times: once to pin down all the extents it is 125 * using in ram, once to create all the inodes logged in the tree 126 * and once to do all the other items.
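* These three passes correspond to LOG_WALK_PIN_ONLY, LOG_WALK_REPLAY_INODES and LOG_WALK_REPLAY_ALL above.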
127 */ 128 129 /* 130 * start a sub transaction and setup the log tree 131 * this increments the log tree writer count to make the people 132 * syncing the tree wait for us to finish 133 */ 134 static int start_log_trans(struct btrfs_trans_handle *trans, 135 struct btrfs_root *root) 136 { 137 int ret; 138 int err = 0; 139 140 mutex_lock(&root->log_mutex); 141 if (root->log_root) { 142 if (!root->log_start_pid) { 143 root->log_start_pid = current->pid; 144 root->log_multiple_pids = false; 145 } else if (root->log_start_pid != current->pid) { 146 root->log_multiple_pids = true; 147 } 148 149 root->log_batch++; 150 atomic_inc(&root->log_writers); 151 mutex_unlock(&root->log_mutex); 152 return 0; 153 } 154 root->log_multiple_pids = false; 155 root->log_start_pid = current->pid; 156 mutex_lock(&root->fs_info->tree_log_mutex); 157 if (!root->fs_info->log_root_tree) { 158 ret = btrfs_init_log_root_tree(trans, root->fs_info); 159 if (ret) 160 err = ret; 161 } 162 if (err == 0 && !root->log_root) { 163 ret = btrfs_add_log_tree(trans, root); 164 if (ret) 165 err = ret; 166 } 167 mutex_unlock(&root->fs_info->tree_log_mutex); 168 root->log_batch++; 169 atomic_inc(&root->log_writers); 170 mutex_unlock(&root->log_mutex); 171 return err; 172 } 173 174 /* 175 * returns 0 if there was a log transaction running and we were able 176 * to join, or returns -ENOENT if there was no transaction 177 * in progress 178 */ 179 static int join_running_log_trans(struct btrfs_root *root) 180 { 181 int ret = -ENOENT; 182 183 smp_mb(); 184 if (!root->log_root) 185 return -ENOENT; 186 187 mutex_lock(&root->log_mutex); 188 if (root->log_root) { 189 ret = 0; 190 atomic_inc(&root->log_writers); 191 } 192 mutex_unlock(&root->log_mutex); 193 return ret; 194 } 195 196 /* 197 * This either makes the current running log transaction wait 198 * until you call btrfs_end_log_trans() or it makes any future 199 * log transactions wait until you call btrfs_end_log_trans() 200 */ 201 int btrfs_pin_log_trans(struct btrfs_root *root) 202 { 203 int ret = -ENOENT; 204 205 mutex_lock(&root->log_mutex); 206 atomic_inc(&root->log_writers); 207 mutex_unlock(&root->log_mutex); 208 return ret; 209 } 210 211 /* 212 * indicate we're done making changes to the log tree 213 * and wake up anyone waiting to do a sync 214 */ 215 int btrfs_end_log_trans(struct btrfs_root *root) 216 { 217 if (atomic_dec_and_test(&root->log_writers)) { 218 smp_mb(); 219 if (waitqueue_active(&root->log_writer_wait)) 220 wake_up(&root->log_writer_wait); 221 } 222 return 0; 223 } 224 225 226 /* 227 * the walk control struct is used to pass state down the chain when 228 * processing the log tree. The stage field tells us which part 229 * of the log tree processing we are currently doing. The others 230 * are state fields used for that specific part 231 */ 232 struct walk_control { 233 /* should we free the extent on disk when done? This is used 234 * at transaction commit time while freeing a log tree 235 */ 236 int free; 237 238 /* should we write out the extent buffer? This is used 239 * while flushing the log tree to disk during a sync 240 */ 241 int write; 242 243 /* should we wait for the extent buffer io to finish?
Also used 244 * while flushing the log tree to disk for a sync 245 */ 246 int wait; 247 248 /* pin only walk, we record which extents on disk belong to the 249 * log trees 250 */ 251 int pin; 252 253 /* what stage of the replay code we're currently in */ 254 int stage; 255 256 /* the root we are currently replaying */ 257 struct btrfs_root *replay_dest; 258 259 /* the trans handle for the current replay */ 260 struct btrfs_trans_handle *trans; 261 262 /* the function that gets used to process blocks we find in the 263 * tree. Note the extent_buffer might not be up to date when it is 264 * passed in, and it must be checked or read if you need the data 265 * inside it 266 */ 267 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 268 struct walk_control *wc, u64 gen); 269 }; 270 271 /* 272 * process_func used to pin down extents, write them or wait on them 273 */ 274 static int process_one_buffer(struct btrfs_root *log, 275 struct extent_buffer *eb, 276 struct walk_control *wc, u64 gen) 277 { 278 if (wc->pin) 279 btrfs_pin_extent(log->fs_info->extent_root, 280 eb->start, eb->len, 0); 281 282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (wc->write) 284 btrfs_write_tree_block(eb); 285 if (wc->wait) 286 btrfs_wait_tree_block_writeback(eb); 287 } 288 return 0; 289 } 290 291 /* 292 * Item overwrite used by replay and tree logging. eb, slot and key all refer 293 * to the src data we are copying out. 294 * 295 * root is the tree we are copying into, and path is a scratch 296 * path for use in this function (it should be released on entry and 297 * will be released on exit). 298 * 299 * If the key is already in the destination tree the existing item is 300 * overwritten. If the existing item isn't big enough, it is extended. 301 * If it is too large, it is truncated. 302 * 303 * If the key isn't in the destination yet, a new item is inserted. 
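* For example, when the log holds an inode item whose key already exists in the subvolume with a different item size, the destination item is first resized to match and the raw item bytes are then copied over it.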
304 */ 305 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 306 struct btrfs_root *root, 307 struct btrfs_path *path, 308 struct extent_buffer *eb, int slot, 309 struct btrfs_key *key) 310 { 311 int ret; 312 u32 item_size; 313 u64 saved_i_size = 0; 314 int save_old_i_size = 0; 315 unsigned long src_ptr; 316 unsigned long dst_ptr; 317 int overwrite_root = 0; 318 319 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 320 overwrite_root = 1; 321 322 item_size = btrfs_item_size_nr(eb, slot); 323 src_ptr = btrfs_item_ptr_offset(eb, slot); 324 325 /* look for the key in the destination tree */ 326 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 327 if (ret == 0) { 328 char *src_copy; 329 char *dst_copy; 330 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 331 path->slots[0]); 332 if (dst_size != item_size) 333 goto insert; 334 335 if (item_size == 0) { 336 btrfs_release_path(root, path); 337 return 0; 338 } 339 dst_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS); 341 if (!dst_copy || !src_copy) { 342 btrfs_release_path(root, path); 343 kfree(dst_copy); 344 kfree(src_copy); 345 return -ENOMEM; 346 } 347 348 read_extent_buffer(eb, src_copy, src_ptr, item_size); 349 350 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 351 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 352 item_size); 353 ret = memcmp(dst_copy, src_copy, item_size); 354 355 kfree(dst_copy); 356 kfree(src_copy); 357 /* 358 * they have the same contents, just return, this saves 359 * us from cowing blocks in the destination tree and doing 360 * extra writes that may not have been done by a previous 361 * sync 362 */ 363 if (ret == 0) { 364 btrfs_release_path(root, path); 365 return 0; 366 } 367 368 } 369 insert: 370 btrfs_release_path(root, path); 371 /* try to insert the key into the destination tree */ 372 ret = btrfs_insert_empty_item(trans, root, path, 373 key, item_size); 374 375 /* make sure any existing item is the correct size */ 376 if (ret == -EEXIST) { 377 u32 found_size; 378 found_size = btrfs_item_size_nr(path->nodes[0], 379 path->slots[0]); 380 if (found_size > item_size) { 381 btrfs_truncate_item(trans, root, path, item_size, 1); 382 } else if (found_size < item_size) { 383 ret = btrfs_extend_item(trans, root, path, 384 item_size - found_size); 385 BUG_ON(ret); 386 } 387 } else if (ret) { 388 return ret; 389 } 390 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 391 path->slots[0]); 392 393 /* don't overwrite an existing inode if the generation number 394 * was logged as zero. This is done when the tree logging code 395 * is just logging an inode to make sure it exists after recovery. 396 * 397 * Also, don't overwrite i_size on directories during replay. 
398 * log replay inserts and removes directory items based on the 399 * state of the tree found in the subvolume, and i_size is modified 400 * as it goes 401 */ 402 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 403 struct btrfs_inode_item *src_item; 404 struct btrfs_inode_item *dst_item; 405 406 src_item = (struct btrfs_inode_item *)src_ptr; 407 dst_item = (struct btrfs_inode_item *)dst_ptr; 408 409 if (btrfs_inode_generation(eb, src_item) == 0) 410 goto no_copy; 411 412 if (overwrite_root && 413 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 414 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 415 save_old_i_size = 1; 416 saved_i_size = btrfs_inode_size(path->nodes[0], 417 dst_item); 418 } 419 } 420 421 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 422 src_ptr, item_size); 423 424 if (save_old_i_size) { 425 struct btrfs_inode_item *dst_item; 426 dst_item = (struct btrfs_inode_item *)dst_ptr; 427 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 428 } 429 430 /* make sure the generation is filled in */ 431 if (key->type == BTRFS_INODE_ITEM_KEY) { 432 struct btrfs_inode_item *dst_item; 433 dst_item = (struct btrfs_inode_item *)dst_ptr; 434 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 435 btrfs_set_inode_generation(path->nodes[0], dst_item, 436 trans->transid); 437 } 438 } 439 no_copy: 440 btrfs_mark_buffer_dirty(path->nodes[0]); 441 btrfs_release_path(root, path); 442 return 0; 443 } 444 445 /* 446 * simple helper to read an inode off the disk from a given root 447 * This can only be called for subvolume roots and not for the log 448 */ 449 static noinline struct inode *read_one_inode(struct btrfs_root *root, 450 u64 objectid) 451 { 452 struct btrfs_key key; 453 struct inode *inode; 454 455 key.objectid = objectid; 456 key.type = BTRFS_INODE_ITEM_KEY; 457 key.offset = 0; 458 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 459 if (IS_ERR(inode)) { 460 inode = NULL; 461 } else if (is_bad_inode(inode)) { 462 iput(inode); 463 inode = NULL; 464 } 465 return inode; 466 } 467 468 /* replays a single extent in 'eb' at 'slot' with 'key' into the 469 * subvolume 'root'. path is released on entry and should be released 470 * on exit. 471 * 472 * extents in the log tree have not been allocated out of the extent 473 * tree yet. So, this completes the allocation, taking a reference 474 * as required if the extent already exists or creating a new extent 475 * if it isn't in the extent allocation tree yet. 476 * 477 * The extent is inserted into the file, dropping any existing extents 478 * from the file that overlap the new one. 
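* For example, replaying a logged extent covering bytes 0-64k of a file first drops whatever the subvolume currently maps in that byte range and then inserts the logged file extent item in its place.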
479 */ 480 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 481 struct btrfs_root *root, 482 struct btrfs_path *path, 483 struct extent_buffer *eb, int slot, 484 struct btrfs_key *key) 485 { 486 int found_type; 487 u64 mask = root->sectorsize - 1; 488 u64 extent_end; 489 u64 alloc_hint; 490 u64 start = key->offset; 491 u64 saved_nbytes; 492 struct btrfs_file_extent_item *item; 493 struct inode *inode = NULL; 494 unsigned long size; 495 int ret = 0; 496 497 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 498 found_type = btrfs_file_extent_type(eb, item); 499 500 if (found_type == BTRFS_FILE_EXTENT_REG || 501 found_type == BTRFS_FILE_EXTENT_PREALLOC) 502 extent_end = start + btrfs_file_extent_num_bytes(eb, item); 503 else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 504 size = btrfs_file_extent_inline_len(eb, item); 505 extent_end = (start + size + mask) & ~mask; 506 } else { 507 ret = 0; 508 goto out; 509 } 510 511 inode = read_one_inode(root, key->objectid); 512 if (!inode) { 513 ret = -EIO; 514 goto out; 515 } 516 517 /* 518 * first check to see if we already have this extent in the 519 * file. This must be done before the btrfs_drop_extents run 520 * so we don't try to drop this extent. 521 */ 522 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 523 start, 0); 524 525 if (ret == 0 && 526 (found_type == BTRFS_FILE_EXTENT_REG || 527 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 528 struct btrfs_file_extent_item cmp1; 529 struct btrfs_file_extent_item cmp2; 530 struct btrfs_file_extent_item *existing; 531 struct extent_buffer *leaf; 532 533 leaf = path->nodes[0]; 534 existing = btrfs_item_ptr(leaf, path->slots[0], 535 struct btrfs_file_extent_item); 536 537 read_extent_buffer(eb, &cmp1, (unsigned long)item, 538 sizeof(cmp1)); 539 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 540 sizeof(cmp2)); 541 542 /* 543 * we already have a pointer to this exact extent, 544 * we don't have to do anything 545 */ 546 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 547 btrfs_release_path(root, path); 548 goto out; 549 } 550 } 551 btrfs_release_path(root, path); 552 553 saved_nbytes = inode_get_bytes(inode); 554 /* drop any overlapping extents */ 555 ret = btrfs_drop_extents(trans, inode, start, extent_end, 556 &alloc_hint, 1); 557 BUG_ON(ret); 558 559 if (found_type == BTRFS_FILE_EXTENT_REG || 560 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 561 u64 offset; 562 unsigned long dest_offset; 563 struct btrfs_key ins; 564 565 ret = btrfs_insert_empty_item(trans, root, path, key, 566 sizeof(*item)); 567 BUG_ON(ret); 568 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 569 path->slots[0]); 570 copy_extent_buffer(path->nodes[0], eb, dest_offset, 571 (unsigned long)item, sizeof(*item)); 572 573 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 574 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 575 ins.type = BTRFS_EXTENT_ITEM_KEY; 576 offset = key->offset - btrfs_file_extent_offset(eb, item); 577 578 if (ins.objectid > 0) { 579 u64 csum_start; 580 u64 csum_end; 581 LIST_HEAD(ordered_sums); 582 /* 583 * is this extent already allocated in the extent 584 * allocation tree? 
If so, just add a reference 585 */ 586 ret = btrfs_lookup_extent(root, ins.objectid, 587 ins.offset); 588 if (ret == 0) { 589 ret = btrfs_inc_extent_ref(trans, root, 590 ins.objectid, ins.offset, 591 0, root->root_key.objectid, 592 key->objectid, offset); 593 } else { 594 /* 595 * insert the extent pointer in the extent 596 * allocation tree 597 */ 598 ret = btrfs_alloc_logged_file_extent(trans, 599 root, root->root_key.objectid, 600 key->objectid, offset, &ins); 601 BUG_ON(ret); 602 } 603 btrfs_release_path(root, path); 604 605 if (btrfs_file_extent_compression(eb, item)) { 606 csum_start = ins.objectid; 607 csum_end = csum_start + ins.offset; 608 } else { 609 csum_start = ins.objectid + 610 btrfs_file_extent_offset(eb, item); 611 csum_end = csum_start + 612 btrfs_file_extent_num_bytes(eb, item); 613 } 614 615 ret = btrfs_lookup_csums_range(root->log_root, 616 csum_start, csum_end - 1, 617 &ordered_sums); 618 BUG_ON(ret); 619 while (!list_empty(&ordered_sums)) { 620 struct btrfs_ordered_sum *sums; 621 sums = list_entry(ordered_sums.next, 622 struct btrfs_ordered_sum, 623 list); 624 ret = btrfs_csum_file_blocks(trans, 625 root->fs_info->csum_root, 626 sums); 627 BUG_ON(ret); 628 list_del(&sums->list); 629 kfree(sums); 630 } 631 } else { 632 btrfs_release_path(root, path); 633 } 634 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 635 /* inline extents are easy, we just overwrite them */ 636 ret = overwrite_item(trans, root, path, eb, slot, key); 637 BUG_ON(ret); 638 } 639 640 inode_set_bytes(inode, saved_nbytes); 641 btrfs_update_inode(trans, root, inode); 642 out: 643 if (inode) 644 iput(inode); 645 return ret; 646 } 647 648 /* 649 * when cleaning up conflicts between the directory names in the 650 * subvolume, directory names in the log and directory names in the 651 * inode back references, we may have to unlink inodes from directories. 
652 * 653 * This is a helper function to do the unlink of a specific directory 654 * item 655 */ 656 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 657 struct btrfs_root *root, 658 struct btrfs_path *path, 659 struct inode *dir, 660 struct btrfs_dir_item *di) 661 { 662 struct inode *inode; 663 char *name; 664 int name_len; 665 struct extent_buffer *leaf; 666 struct btrfs_key location; 667 int ret; 668 669 leaf = path->nodes[0]; 670 671 btrfs_dir_item_key_to_cpu(leaf, di, &location); 672 name_len = btrfs_dir_name_len(leaf, di); 673 name = kmalloc(name_len, GFP_NOFS); 674 if (!name) 675 return -ENOMEM; 676 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 678 btrfs_release_path(root, path); 679 680 inode = read_one_inode(root, location.objectid); 681 BUG_ON(!inode); 682 683 ret = link_to_fixup_dir(trans, root, path, location.objectid); 684 BUG_ON(ret); 685 686 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 687 BUG_ON(ret); 688 kfree(name); 689 690 iput(inode); 691 return ret; 692 } 693 694 /* 695 * helper function to see if a given name and sequence number found 696 * in an inode back reference are already in a directory and correctly 697 * point to this inode 698 */ 699 static noinline int inode_in_dir(struct btrfs_root *root, 700 struct btrfs_path *path, 701 u64 dirid, u64 objectid, u64 index, 702 const char *name, int name_len) 703 { 704 struct btrfs_dir_item *di; 705 struct btrfs_key location; 706 int match = 0; 707 708 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 709 index, name, name_len, 0); 710 if (di && !IS_ERR(di)) { 711 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 712 if (location.objectid != objectid) 713 goto out; 714 } else 715 goto out; 716 btrfs_release_path(root, path); 717 718 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 719 if (di && !IS_ERR(di)) { 720 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 721 if (location.objectid != objectid) 722 goto out; 723 } else 724 goto out; 725 match = 1; 726 out: 727 btrfs_release_path(root, path); 728 return match; 729 } 730 731 /* 732 * helper function to check a log tree for a named back reference in 733 * an inode. This is used to decide if a back reference that is 734 * found in the subvolume conflicts with what we find in the log. 735 * 736 * inode backreferences may have multiple refs in a single item, 737 * during replay we process one reference at a time, and we don't 738 * want to delete valid links to a file from the subvolume if that 739 * link is also in the log. 
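* A single back reference item keyed (inode, BTRFS_INODE_REF_KEY, parent dir) packs one (name, index) pair per link from that directory, so the search below must walk every name in the item before concluding there is no match.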
740 */ 741 static noinline int backref_in_log(struct btrfs_root *log, 742 struct btrfs_key *key, 743 char *name, int namelen) 744 { 745 struct btrfs_path *path; 746 struct btrfs_inode_ref *ref; 747 unsigned long ptr; 748 unsigned long ptr_end; 749 unsigned long name_ptr; 750 int found_name_len; 751 int item_size; 752 int ret; 753 int match = 0; 754 755 path = btrfs_alloc_path(); 756 if (!path) 757 return -ENOMEM; 758 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 760 if (ret != 0) 761 goto out; 762 763 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 764 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 765 ptr_end = ptr + item_size; 766 while (ptr < ptr_end) { 767 ref = (struct btrfs_inode_ref *)ptr; 768 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 769 if (found_name_len == namelen) { 770 name_ptr = (unsigned long)(ref + 1); 771 ret = memcmp_extent_buffer(path->nodes[0], name, 772 name_ptr, namelen); 773 if (ret == 0) { 774 match = 1; 775 goto out; 776 } 777 } 778 ptr = (unsigned long)(ref + 1) + found_name_len; 779 } 780 out: 781 btrfs_free_path(path); 782 return match; 783 } 784 785 786 /* 787 * replay one inode back reference item found in the log tree. 788 * eb, slot and key refer to the buffer and key found in the log tree. 789 * root is the destination we are replaying into, and path is for temp 790 * use by this function. (it should be released on return). 791 */ 792 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 793 struct btrfs_root *root, 794 struct btrfs_root *log, 795 struct btrfs_path *path, 796 struct extent_buffer *eb, int slot, 797 struct btrfs_key *key) 798 { 799 struct inode *dir; 800 int ret; 801 struct btrfs_inode_ref *ref; 802 struct btrfs_dir_item *di; 803 struct inode *inode; 804 char *name; 805 int namelen; 806 unsigned long ref_ptr; 807 unsigned long ref_end; 808 809 /* 810 * it is possible that we didn't log all the parent directories 811 * for a given inode. If we don't find the dir, just don't 812 * copy the back ref in. The link count fixup code will take 813 * care of the rest 814 */ 815 dir = read_one_inode(root, key->offset); 816 if (!dir) 817 return -ENOENT; 818 819 inode = read_one_inode(root, key->objectid); 820 BUG_ON(!inode); 821 822 ref_ptr = btrfs_item_ptr_offset(eb, slot); 823 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 824 825 again: 826 ref = (struct btrfs_inode_ref *)ref_ptr; 827 828 namelen = btrfs_inode_ref_name_len(eb, ref); 829 name = kmalloc(namelen, GFP_NOFS); 830 BUG_ON(!name); 831 832 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 833 834 /* if we already have a perfect match, we're done */ 835 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 836 btrfs_inode_ref_index(eb, ref), 837 name, namelen)) { 838 goto out; 839 } 840 841 /* 842 * look for a conflicting back reference in the metadata. 843 * if we find one we have to unlink that name of the file 844 * before we add our new link. Later on, we overwrite any 845 * existing back reference, and we don't want to create 846 * dangling pointers in the directory. 
847 */ 848 conflict_again: 849 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 850 if (ret == 0) { 851 char *victim_name; 852 int victim_name_len; 853 struct btrfs_inode_ref *victim_ref; 854 unsigned long ptr; 855 unsigned long ptr_end; 856 struct extent_buffer *leaf = path->nodes[0]; 857 858 /* are we trying to overwrite a back ref for the root directory 859 * if so, just jump out, we're done 860 */ 861 if (key->objectid == key->offset) 862 goto out_nowrite; 863 864 /* check all the names in this back reference to see 865 * if they are in the log. if so, we allow them to stay 866 * otherwise they must be unlinked as a conflict 867 */ 868 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 869 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 870 while (ptr < ptr_end) { 871 victim_ref = (struct btrfs_inode_ref *)ptr; 872 victim_name_len = btrfs_inode_ref_name_len(leaf, 873 victim_ref); 874 victim_name = kmalloc(victim_name_len, GFP_NOFS); 875 BUG_ON(!victim_name); 876 877 read_extent_buffer(leaf, victim_name, 878 (unsigned long)(victim_ref + 1), 879 victim_name_len); 880 881 if (!backref_in_log(log, key, victim_name, 882 victim_name_len)) { 883 btrfs_inc_nlink(inode); 884 btrfs_release_path(root, path); 885 886 ret = btrfs_unlink_inode(trans, root, dir, 887 inode, victim_name, 888 victim_name_len); 889 kfree(victim_name); 890 btrfs_release_path(root, path); 891 goto conflict_again; 892 } 893 kfree(victim_name); 894 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 895 } 896 BUG_ON(ret); 897 } 898 btrfs_release_path(root, path); 899 900 /* look for a conflicting sequence number */ 901 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 902 btrfs_inode_ref_index(eb, ref), 903 name, namelen, 0); 904 if (di && !IS_ERR(di)) { 905 ret = drop_one_dir_item(trans, root, path, dir, di); 906 BUG_ON(ret); 907 } 908 btrfs_release_path(root, path); 909 910 911 /* look for a conflicting name */ 912 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 913 name, namelen, 0); 914 if (di && !IS_ERR(di)) { 915 ret = drop_one_dir_item(trans, root, path, dir, di); 916 BUG_ON(ret); 917 } 918 btrfs_release_path(root, path); 919 920 /* insert our name */ 921 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 922 btrfs_inode_ref_index(eb, ref)); 923 BUG_ON(ret); 924 925 btrfs_update_inode(trans, root, inode); 926 927 out: 928 ref_ptr = (unsigned long)(ref + 1) + namelen; 929 kfree(name); 930 if (ref_ptr < ref_end) 931 goto again; 932 933 /* finally write the back reference in the inode */ 934 ret = overwrite_item(trans, root, path, eb, slot, key); 935 BUG_ON(ret); 936 937 out_nowrite: 938 btrfs_release_path(root, path); 939 iput(dir); 940 iput(inode); 941 return 0; 942 } 943 944 static int insert_orphan_item(struct btrfs_trans_handle *trans, 945 struct btrfs_root *root, u64 offset) 946 { 947 int ret; 948 ret = btrfs_find_orphan_item(root, offset); 949 if (ret > 0) 950 ret = btrfs_insert_orphan_item(trans, root, offset); 951 return ret; 952 } 953 954 955 /* 956 * There are a few corners where the link count of the file can't 957 * be properly maintained during replay. So, instead of adding 958 * lots of complexity to the log code, we just scan the backrefs 959 * for any file that has been through replay. 960 * 961 * The scan will update the link count on the inode to reflect the 962 * number of back refs found. If it goes down to zero, the iput 963 * will free the inode. 
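* Directories that drop to zero links also have their remaining entries removed (replay_dir_deletes with del_all set) and an orphan item inserted, so normal orphan cleanup finishes the removal.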
964 */ 965 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 966 struct btrfs_root *root, 967 struct inode *inode) 968 { 969 struct btrfs_path *path; 970 int ret; 971 struct btrfs_key key; 972 u64 nlink = 0; 973 unsigned long ptr; 974 unsigned long ptr_end; 975 int name_len; 976 977 key.objectid = inode->i_ino; 978 key.type = BTRFS_INODE_REF_KEY; 979 key.offset = (u64)-1; 980 981 path = btrfs_alloc_path(); 982 if (!path) 983 return -ENOMEM; 984 985 while (1) { 986 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 987 if (ret < 0) 988 break; 989 if (ret > 0) { 990 if (path->slots[0] == 0) 991 break; 992 path->slots[0]--; 993 } 994 btrfs_item_key_to_cpu(path->nodes[0], &key, 995 path->slots[0]); 996 if (key.objectid != inode->i_ino || 997 key.type != BTRFS_INODE_REF_KEY) 998 break; 999 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1000 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1001 path->slots[0]); 1002 while (ptr < ptr_end) { 1003 struct btrfs_inode_ref *ref; 1004 1005 ref = (struct btrfs_inode_ref *)ptr; 1006 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1007 ref); 1008 ptr = (unsigned long)(ref + 1) + name_len; 1009 nlink++; 1010 } 1011 1012 if (key.offset == 0) 1013 break; 1014 key.offset--; 1015 btrfs_release_path(root, path); 1016 } 1017 btrfs_release_path(root, path); 1018 if (nlink != inode->i_nlink) { 1019 inode->i_nlink = nlink; 1020 btrfs_update_inode(trans, root, inode); 1021 } 1022 BTRFS_I(inode)->index_cnt = (u64)-1; 1023 1024 if (inode->i_nlink == 0) { 1025 if (S_ISDIR(inode->i_mode)) { 1026 ret = replay_dir_deletes(trans, root, NULL, path, 1027 inode->i_ino, 1); 1028 BUG_ON(ret); 1029 } 1030 ret = insert_orphan_item(trans, root, inode->i_ino); 1031 BUG_ON(ret); 1032 } 1033 btrfs_free_path(path); 1034 1035 return 0; 1036 } 1037 1038 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1039 struct btrfs_root *root, 1040 struct btrfs_path *path) 1041 { 1042 int ret; 1043 struct btrfs_key key; 1044 struct inode *inode; 1045 1046 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1047 key.type = BTRFS_ORPHAN_ITEM_KEY; 1048 key.offset = (u64)-1; 1049 while (1) { 1050 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1051 if (ret < 0) 1052 break; 1053 1054 if (ret == 1) { 1055 if (path->slots[0] == 0) 1056 break; 1057 path->slots[0]--; 1058 } 1059 1060 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1061 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1062 key.type != BTRFS_ORPHAN_ITEM_KEY) 1063 break; 1064 1065 ret = btrfs_del_item(trans, root, path); 1066 BUG_ON(ret); 1067 1068 btrfs_release_path(root, path); 1069 inode = read_one_inode(root, key.offset); 1070 BUG_ON(!inode); 1071 1072 ret = fixup_inode_link_count(trans, root, inode); 1073 BUG_ON(ret); 1074 1075 iput(inode); 1076 1077 /* 1078 * fixup on a directory may create new entries, 1079 * make sure we always look for the highest possible 1080 * offset 1081 */ 1082 key.offset = (u64)-1; 1083 } 1084 btrfs_release_path(root, path); 1085 return 0; 1086 } 1087 1088 1089 /* 1090 * record a given inode in the fixup dir so we can check its link 1091 * count when replay is done.
The link count is incremented here 1092 * so the inode won't go away until we check it 1093 */ 1094 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1095 struct btrfs_root *root, 1096 struct btrfs_path *path, 1097 u64 objectid) 1098 { 1099 struct btrfs_key key; 1100 int ret = 0; 1101 struct inode *inode; 1102 1103 inode = read_one_inode(root, objectid); 1104 BUG_ON(!inode); 1105 1106 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1107 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1108 key.offset = objectid; 1109 1110 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1111 1112 btrfs_release_path(root, path); 1113 if (ret == 0) { 1114 btrfs_inc_nlink(inode); 1115 btrfs_update_inode(trans, root, inode); 1116 } else if (ret == -EEXIST) { 1117 ret = 0; 1118 } else { 1119 BUG(); 1120 } 1121 iput(inode); 1122 1123 return ret; 1124 } 1125 1126 /* 1127 * when replaying the log for a directory, we only insert names 1128 * for inodes that actually exist. This means an fsync on a directory 1129 * does not implicitly fsync all the new files in it 1130 */ 1131 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1132 struct btrfs_root *root, 1133 struct btrfs_path *path, 1134 u64 dirid, u64 index, 1135 char *name, int name_len, u8 type, 1136 struct btrfs_key *location) 1137 { 1138 struct inode *inode; 1139 struct inode *dir; 1140 int ret; 1141 1142 inode = read_one_inode(root, location->objectid); 1143 if (!inode) 1144 return -ENOENT; 1145 1146 dir = read_one_inode(root, dirid); 1147 if (!dir) { 1148 iput(inode); 1149 return -EIO; 1150 } 1151 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1152 1153 /* FIXME, put inode into FIXUP list */ 1154 1155 iput(inode); 1156 iput(dir); 1157 return ret; 1158 } 1159 1160 /* 1161 * take a single entry in a log directory item and replay it into 1162 * the subvolume. 1163 * 1164 * if a conflicting item exists in the subdirectory already, 1165 * the inode it points to is unlinked and put into the link count 1166 * fix up tree. 1167 * 1168 * If a name from the log points to a file or directory that does 1169 * not exist in the FS, it is skipped. fsyncs on directories 1170 * do not force down inodes inside that directory, just changes to the 1171 * names or unlinks in a directory. 
1172 */ 1173 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1174 struct btrfs_root *root, 1175 struct btrfs_path *path, 1176 struct extent_buffer *eb, 1177 struct btrfs_dir_item *di, 1178 struct btrfs_key *key) 1179 { 1180 char *name; 1181 int name_len; 1182 struct btrfs_dir_item *dst_di; 1183 struct btrfs_key found_key; 1184 struct btrfs_key log_key; 1185 struct inode *dir; 1186 u8 log_type; 1187 int exists; 1188 int ret; 1189 1190 dir = read_one_inode(root, key->objectid); 1191 BUG_ON(!dir); 1192 1193 name_len = btrfs_dir_name_len(eb, di); 1194 name = kmalloc(name_len, GFP_NOFS); 1195 if (!name) { /* don't leak the dir inode reference on failure */ iput(dir); 1196 return -ENOMEM; } 1197 1198 log_type = btrfs_dir_type(eb, di); 1199 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1200 name_len); 1201 1202 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1203 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1204 if (exists == 0) 1205 exists = 1; 1206 else 1207 exists = 0; 1208 btrfs_release_path(root, path); 1209 1210 if (key->type == BTRFS_DIR_ITEM_KEY) { 1211 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1212 name, name_len, 1); 1213 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1214 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1215 key->objectid, 1216 key->offset, name, 1217 name_len, 1); 1218 } else { 1219 BUG(); 1220 } 1221 if (!dst_di || IS_ERR(dst_di)) { 1222 /* we need a sequence number to insert, so we only 1223 * do inserts for the BTRFS_DIR_INDEX_KEY types 1224 */ 1225 if (key->type != BTRFS_DIR_INDEX_KEY) 1226 goto out; 1227 goto insert; 1228 } 1229 1230 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1231 /* the existing item matches the logged item */ 1232 if (found_key.objectid == log_key.objectid && 1233 found_key.type == log_key.type && 1234 found_key.offset == log_key.offset && 1235 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1236 goto out; 1237 } 1238 1239 /* 1240 * don't drop the conflicting directory entry if the inode 1241 * for the new entry doesn't exist 1242 */ 1243 if (!exists) 1244 goto out; 1245 1246 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1247 BUG_ON(ret); 1248 1249 if (key->type == BTRFS_DIR_INDEX_KEY) 1250 goto insert; 1251 out: 1252 btrfs_release_path(root, path); 1253 kfree(name); 1254 iput(dir); 1255 return 0; 1256 1257 insert: 1258 btrfs_release_path(root, path); 1259 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1260 name, name_len, log_type, &log_key); 1261 1262 BUG_ON(ret && ret != -ENOENT); 1263 goto out; 1264 } 1265 1266 /* 1267 * find all the names in a directory item and reconcile them into 1268 * the subvolume.
Only BTRFS_DIR_ITEM_KEY types will have more than 1269 * one name in a directory item, but the same code gets used for 1270 * both directory index types 1271 */ 1272 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1273 struct btrfs_root *root, 1274 struct btrfs_path *path, 1275 struct extent_buffer *eb, int slot, 1276 struct btrfs_key *key) 1277 { 1278 int ret; 1279 u32 item_size = btrfs_item_size_nr(eb, slot); 1280 struct btrfs_dir_item *di; 1281 int name_len; 1282 unsigned long ptr; 1283 unsigned long ptr_end; 1284 1285 ptr = btrfs_item_ptr_offset(eb, slot); 1286 ptr_end = ptr + item_size; 1287 while (ptr < ptr_end) { 1288 di = (struct btrfs_dir_item *)ptr; 1289 name_len = btrfs_dir_name_len(eb, di); 1290 ret = replay_one_name(trans, root, path, eb, di, key); 1291 BUG_ON(ret); 1292 ptr = (unsigned long)(di + 1); 1293 ptr += name_len; 1294 } 1295 return 0; 1296 } 1297 1298 /* 1299 * directory replay has two parts. There are the standard directory 1300 * items in the log copied from the subvolume, and range items 1301 * created in the log while the subvolume was logged. 1302 * 1303 * The range items tell us which parts of the key space the log 1304 * is authoritative for. During replay, if a key in the subvolume 1305 * directory is in a logged range item, but not actually in the log, 1306 * that means it was deleted from the directory before the fsync 1307 * and should be removed. 1308 */ 1309 static noinline int find_dir_range(struct btrfs_root *root, 1310 struct btrfs_path *path, 1311 u64 dirid, int key_type, 1312 u64 *start_ret, u64 *end_ret) 1313 { 1314 struct btrfs_key key; 1315 u64 found_end; 1316 struct btrfs_dir_log_item *item; 1317 int ret; 1318 int nritems; 1319 1320 if (*start_ret == (u64)-1) 1321 return 1; 1322 1323 key.objectid = dirid; 1324 key.type = key_type; 1325 key.offset = *start_ret; 1326 1327 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1328 if (ret < 0) 1329 goto out; 1330 if (ret > 0) { 1331 if (path->slots[0] == 0) 1332 goto out; 1333 path->slots[0]--; 1334 } 1335 if (ret != 0) 1336 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1337 1338 if (key.type != key_type || key.objectid != dirid) { 1339 ret = 1; 1340 goto next; 1341 } 1342 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1343 struct btrfs_dir_log_item); 1344 found_end = btrfs_dir_log_end(path->nodes[0], item); 1345 1346 if (*start_ret >= key.offset && *start_ret <= found_end) { 1347 ret = 0; 1348 *start_ret = key.offset; 1349 *end_ret = found_end; 1350 goto out; 1351 } 1352 ret = 1; 1353 next: 1354 /* check the next slot in the tree to see if it is a valid item */ 1355 nritems = btrfs_header_nritems(path->nodes[0]); 1356 if (path->slots[0] >= nritems) { 1357 ret = btrfs_next_leaf(root, path); 1358 if (ret) 1359 goto out; 1360 } else { 1361 path->slots[0]++; 1362 } 1363 1364 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1365 1366 if (key.type != key_type || key.objectid != dirid) { 1367 ret = 1; 1368 goto out; 1369 } 1370 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1371 struct btrfs_dir_log_item); 1372 found_end = btrfs_dir_log_end(path->nodes[0], item); 1373 *start_ret = key.offset; 1374 *end_ret = found_end; 1375 ret = 0; 1376 out: 1377 btrfs_release_path(root, path); 1378 return ret; 1379 } 1380 1381 /* 1382 * this looks for a given directory item in the log.
If the directory 1383 * item is not in the log, the item is removed and the inode it points 1384 * to is unlinked 1385 */ 1386 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1387 struct btrfs_root *root, 1388 struct btrfs_root *log, 1389 struct btrfs_path *path, 1390 struct btrfs_path *log_path, 1391 struct inode *dir, 1392 struct btrfs_key *dir_key) 1393 { 1394 int ret; 1395 struct extent_buffer *eb; 1396 int slot; 1397 u32 item_size; 1398 struct btrfs_dir_item *di; 1399 struct btrfs_dir_item *log_di; 1400 int name_len; 1401 unsigned long ptr; 1402 unsigned long ptr_end; 1403 char *name; 1404 struct inode *inode; 1405 struct btrfs_key location; 1406 1407 again: 1408 eb = path->nodes[0]; 1409 slot = path->slots[0]; 1410 item_size = btrfs_item_size_nr(eb, slot); 1411 ptr = btrfs_item_ptr_offset(eb, slot); 1412 ptr_end = ptr + item_size; 1413 while (ptr < ptr_end) { 1414 di = (struct btrfs_dir_item *)ptr; 1415 name_len = btrfs_dir_name_len(eb, di); 1416 name = kmalloc(name_len, GFP_NOFS); 1417 if (!name) { 1418 ret = -ENOMEM; 1419 goto out; 1420 } 1421 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1422 name_len); 1423 log_di = NULL; 1424 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1425 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1426 dir_key->objectid, 1427 name, name_len, 0); 1428 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1429 log_di = btrfs_lookup_dir_index_item(trans, log, 1430 log_path, 1431 dir_key->objectid, 1432 dir_key->offset, 1433 name, name_len, 0); 1434 } 1435 if (!log_di || IS_ERR(log_di)) { 1436 btrfs_dir_item_key_to_cpu(eb, di, &location); 1437 btrfs_release_path(root, path); 1438 btrfs_release_path(log, log_path); 1439 inode = read_one_inode(root, location.objectid); 1440 BUG_ON(!inode); 1441 1442 ret = link_to_fixup_dir(trans, root, 1443 path, location.objectid); 1444 BUG_ON(ret); 1445 btrfs_inc_nlink(inode); 1446 ret = btrfs_unlink_inode(trans, root, dir, inode, 1447 name, name_len); 1448 BUG_ON(ret); 1449 kfree(name); 1450 iput(inode); 1451 1452 /* there might still be more names under this key 1453 * check and repeat if required 1454 */ 1455 ret = btrfs_search_slot(NULL, root, dir_key, path, 1456 0, 0); 1457 if (ret == 0) 1458 goto again; 1459 ret = 0; 1460 goto out; 1461 } 1462 btrfs_release_path(log, log_path); 1463 kfree(name); 1464 1465 ptr = (unsigned long)(di + 1); 1466 ptr += name_len; 1467 } 1468 ret = 0; 1469 out: 1470 btrfs_release_path(root, path); 1471 btrfs_release_path(log, log_path); 1472 return ret; 1473 } 1474 1475 /* 1476 * deletion replay happens before we copy any new directory items 1477 * out of the log or out of backreferences from inodes. It 1478 * scans the log to find ranges of keys that the log is authoritative for, 1479 * and then scans the directory to find items in those ranges that are 1480 * not present in the log. 1481 * 1482 * Anything we don't find in the log is unlinked and removed from the 1483 * directory.
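* For example, if a range item says the log is authoritative for directory indexes [0, 100] and the subvolume holds an entry at index 7 with no matching name in the log, that entry was unlinked before the fsync and is removed here.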
1484 */ 1485 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1486 struct btrfs_root *root, 1487 struct btrfs_root *log, 1488 struct btrfs_path *path, 1489 u64 dirid, int del_all) 1490 { 1491 u64 range_start; 1492 u64 range_end; 1493 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1494 int ret = 0; 1495 struct btrfs_key dir_key; 1496 struct btrfs_key found_key; 1497 struct btrfs_path *log_path; 1498 struct inode *dir; 1499 1500 dir_key.objectid = dirid; 1501 dir_key.type = BTRFS_DIR_ITEM_KEY; 1502 log_path = btrfs_alloc_path(); 1503 if (!log_path) 1504 return -ENOMEM; 1505 1506 dir = read_one_inode(root, dirid); 1507 /* it isn't an error if the inode isn't there, that can happen 1508 * because we replay the deletes before we copy in the inode item 1509 * from the log 1510 */ 1511 if (!dir) { 1512 btrfs_free_path(log_path); 1513 return 0; 1514 } 1515 again: 1516 range_start = 0; 1517 range_end = 0; 1518 while (1) { 1519 if (del_all) 1520 range_end = (u64)-1; 1521 else { 1522 ret = find_dir_range(log, path, dirid, key_type, 1523 &range_start, &range_end); 1524 if (ret != 0) 1525 break; 1526 } 1527 1528 dir_key.offset = range_start; 1529 while (1) { 1530 int nritems; 1531 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1532 0, 0); 1533 if (ret < 0) 1534 goto out; 1535 1536 nritems = btrfs_header_nritems(path->nodes[0]); 1537 if (path->slots[0] >= nritems) { 1538 ret = btrfs_next_leaf(root, path); 1539 if (ret) 1540 break; 1541 } 1542 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1543 path->slots[0]); 1544 if (found_key.objectid != dirid || 1545 found_key.type != dir_key.type) 1546 goto next_type; 1547 1548 if (found_key.offset > range_end) 1549 break; 1550 1551 ret = check_item_in_log(trans, root, log, path, 1552 log_path, dir, 1553 &found_key); 1554 BUG_ON(ret); 1555 if (found_key.offset == (u64)-1) 1556 break; 1557 dir_key.offset = found_key.offset + 1; 1558 } 1559 btrfs_release_path(root, path); 1560 if (range_end == (u64)-1) 1561 break; 1562 range_start = range_end + 1; 1563 } 1564 1565 next_type: 1566 ret = 0; 1567 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1568 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1569 dir_key.type = BTRFS_DIR_INDEX_KEY; 1570 btrfs_release_path(root, path); 1571 goto again; 1572 } 1573 out: 1574 btrfs_release_path(root, path); 1575 btrfs_free_path(log_path); 1576 iput(dir); 1577 return ret; 1578 } 1579 1580 /* 1581 * the process_func used to replay items from the log tree. This 1582 * gets called in two different stages. The first stage just looks 1583 * for inodes and makes sure they are all copied into the subvolume. 1584 * 1585 * The second stage copies all the other item types from the log into 1586 * the subvolume. The two stage approach is slower, but gets rid of 1587 * lots of complexity around inodes referencing other inodes that exist 1588 * only in the log (references come from either directory items or inode 1589 * back refs). 
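* The first stage also links each inode it creates into the fixup tree, so the final link counts can be verified once all the names have been replayed.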
1590 */ 1591 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1592 struct walk_control *wc, u64 gen) 1593 { 1594 int nritems; 1595 struct btrfs_path *path; 1596 struct btrfs_root *root = wc->replay_dest; 1597 struct btrfs_key key; 1598 int level; 1599 int i; 1600 int ret; 1601 1602 btrfs_read_buffer(eb, gen); 1603 1604 level = btrfs_header_level(eb); 1605 1606 if (level != 0) 1607 return 0; 1608 1609 path = btrfs_alloc_path(); 1610 BUG_ON(!path); 1611 1612 nritems = btrfs_header_nritems(eb); 1613 for (i = 0; i < nritems; i++) { 1614 btrfs_item_key_to_cpu(eb, &key, i); 1615 1616 /* inode keys are done during the first stage */ 1617 if (key.type == BTRFS_INODE_ITEM_KEY && 1618 wc->stage == LOG_WALK_REPLAY_INODES) { 1619 struct btrfs_inode_item *inode_item; 1620 u32 mode; 1621 1622 inode_item = btrfs_item_ptr(eb, i, 1623 struct btrfs_inode_item); 1624 mode = btrfs_inode_mode(eb, inode_item); 1625 if (S_ISDIR(mode)) { 1626 ret = replay_dir_deletes(wc->trans, 1627 root, log, path, key.objectid, 0); 1628 BUG_ON(ret); 1629 } 1630 ret = overwrite_item(wc->trans, root, path, 1631 eb, i, &key); 1632 BUG_ON(ret); 1633 1634 /* for regular files, make sure the corresponding 1635 * orphan item exists. extents past the new EOF 1636 * will be truncated later by orphan cleanup. 1637 */ 1638 if (S_ISREG(mode)) { 1639 ret = insert_orphan_item(wc->trans, root, 1640 key.objectid); 1641 BUG_ON(ret); 1642 } 1643 1644 ret = link_to_fixup_dir(wc->trans, root, 1645 path, key.objectid); 1646 BUG_ON(ret); 1647 } 1648 if (wc->stage < LOG_WALK_REPLAY_ALL) 1649 continue; 1650 1651 /* these keys are simply copied */ 1652 if (key.type == BTRFS_XATTR_ITEM_KEY) { 1653 ret = overwrite_item(wc->trans, root, path, 1654 eb, i, &key); 1655 BUG_ON(ret); 1656 } else if (key.type == BTRFS_INODE_REF_KEY) { 1657 ret = add_inode_ref(wc->trans, root, log, path, 1658 eb, i, &key); 1659 BUG_ON(ret && ret != -ENOENT); 1660 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 1661 ret = replay_one_extent(wc->trans, root, path, 1662 eb, i, &key); 1663 BUG_ON(ret); 1664 } else if (key.type == BTRFS_DIR_ITEM_KEY || 1665 key.type == BTRFS_DIR_INDEX_KEY) { 1666 ret = replay_one_dir_item(wc->trans, root, path, 1667 eb, i, &key); 1668 BUG_ON(ret); 1669 } 1670 } 1671 btrfs_free_path(path); 1672 return 0; 1673 } 1674 1675 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 1676 struct btrfs_root *root, 1677 struct btrfs_path *path, int *level, 1678 struct walk_control *wc) 1679 { 1680 u64 root_owner; 1681 u64 bytenr; 1682 u64 ptr_gen; 1683 struct extent_buffer *next; 1684 struct extent_buffer *cur; 1685 struct extent_buffer *parent; 1686 u32 blocksize; 1687 int ret = 0; 1688 1689 WARN_ON(*level < 0); 1690 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1691 1692 while (*level > 0) { 1693 WARN_ON(*level < 0); 1694 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1695 cur = path->nodes[*level]; 1696 1697 if (btrfs_header_level(cur) != *level) 1698 WARN_ON(1); 1699 1700 if (path->slots[*level] >= 1701 btrfs_header_nritems(cur)) 1702 break; 1703 1704 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 1705 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 1706 blocksize = btrfs_level_size(root, *level - 1); 1707 1708 parent = path->nodes[*level]; 1709 root_owner = btrfs_header_owner(parent); 1710 1711 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1712 if (!next) 1713 return -ENOMEM; 1714 1715 if (*level == 1) { 1716 wc->process_func(root, next, wc, ptr_gen); 1717 1718 path->slots[*level]++; 1719 if
(wc->free) { 1720 btrfs_read_buffer(next, ptr_gen); 1721 1722 btrfs_tree_lock(next); 1723 clean_tree_block(trans, root, next); 1724 btrfs_set_lock_blocking(next); 1725 btrfs_wait_tree_block_writeback(next); 1726 btrfs_tree_unlock(next); 1727 1728 WARN_ON(root_owner != 1729 BTRFS_TREE_LOG_OBJECTID); 1730 ret = btrfs_free_reserved_extent(root, 1731 bytenr, blocksize); 1732 BUG_ON(ret); 1733 } 1734 free_extent_buffer(next); 1735 continue; 1736 } 1737 btrfs_read_buffer(next, ptr_gen); 1738 1739 WARN_ON(*level <= 0); 1740 if (path->nodes[*level-1]) 1741 free_extent_buffer(path->nodes[*level-1]); 1742 path->nodes[*level-1] = next; 1743 *level = btrfs_header_level(next); 1744 path->slots[*level] = 0; 1745 cond_resched(); 1746 } 1747 WARN_ON(*level < 0); 1748 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1749 1750 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 1751 1752 cond_resched(); 1753 return 0; 1754 } 1755 1756 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 1757 struct btrfs_root *root, 1758 struct btrfs_path *path, int *level, 1759 struct walk_control *wc) 1760 { 1761 u64 root_owner; 1762 int i; 1763 int slot; 1764 int ret; 1765 1766 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1767 slot = path->slots[i]; 1768 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1769 path->slots[i]++; 1770 *level = i; 1771 WARN_ON(*level == 0); 1772 return 0; 1773 } else { 1774 struct extent_buffer *parent; 1775 if (path->nodes[*level] == root->node) 1776 parent = path->nodes[*level]; 1777 else 1778 parent = path->nodes[*level + 1]; 1779 1780 root_owner = btrfs_header_owner(parent); 1781 wc->process_func(root, path->nodes[*level], wc, 1782 btrfs_header_generation(path->nodes[*level])); 1783 if (wc->free) { 1784 struct extent_buffer *next; 1785 1786 next = path->nodes[*level]; 1787 1788 btrfs_tree_lock(next); 1789 clean_tree_block(trans, root, next); 1790 btrfs_set_lock_blocking(next); 1791 btrfs_wait_tree_block_writeback(next); 1792 btrfs_tree_unlock(next); 1793 1794 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1795 ret = btrfs_free_reserved_extent(root, 1796 path->nodes[*level]->start, 1797 path->nodes[*level]->len); 1798 BUG_ON(ret); 1799 } 1800 free_extent_buffer(path->nodes[*level]); 1801 path->nodes[*level] = NULL; 1802 *level = i + 1; 1803 } 1804 } 1805 return 1; 1806 } 1807 1808 /* 1809 * walk all the blocks in the log tree rooted at 'log', calling wc->process_func 1810 * on each one. When wc->free is set, the blocks are cleaned and their 1811 * reserved extents freed as the walk goes 1812 */ 1813 static int walk_log_tree(struct btrfs_trans_handle *trans, 1814 struct btrfs_root *log, struct walk_control *wc) 1815 { 1816 int ret = 0; 1817 int wret; 1818 int level; 1819 struct btrfs_path *path; 1820 int i; 1821 int orig_level; 1822 1823 path = btrfs_alloc_path(); 1824 BUG_ON(!path); 1825 1826 level = btrfs_header_level(log->node); 1827 orig_level = level; 1828 path->nodes[level] = log->node; 1829 extent_buffer_get(log->node); 1830 path->slots[level] = 0; 1831 1832 while (1) { 1833 wret = walk_down_log_tree(trans, log, path, &level, wc); 1834 if (wret > 0) 1835 break; 1836 if (wret < 0) 1837 ret = wret; 1838 1839 wret = walk_up_log_tree(trans, log, path, &level, wc); 1840 if (wret > 0) 1841 break; 1842 if (wret < 0) 1843 ret = wret; 1844 } 1845 1846 /* was the root node processed?
if not, catch it here */ 1847 if (path->nodes[orig_level]) { 1848 wc->process_func(log, path->nodes[orig_level], wc, 1849 btrfs_header_generation(path->nodes[orig_level])); 1850 if (wc->free) { 1851 struct extent_buffer *next; 1852 1853 next = path->nodes[orig_level]; 1854 1855 btrfs_tree_lock(next); 1856 clean_tree_block(trans, log, next); 1857 btrfs_set_lock_blocking(next); 1858 btrfs_wait_tree_block_writeback(next); 1859 btrfs_tree_unlock(next); 1860 1861 WARN_ON(log->root_key.objectid != 1862 BTRFS_TREE_LOG_OBJECTID); 1863 ret = btrfs_free_reserved_extent(log, next->start, 1864 next->len); 1865 BUG_ON(ret); 1866 } 1867 } 1868 1869 for (i = 0; i <= orig_level; i++) { 1870 if (path->nodes[i]) { 1871 free_extent_buffer(path->nodes[i]); 1872 path->nodes[i] = NULL; 1873 } 1874 } 1875 btrfs_free_path(path); 1876 return ret; 1877 } 1878 1879 /* 1880 * helper function to update the item for a given subvolumes log root 1881 * in the tree of log roots 1882 */ 1883 static int update_log_root(struct btrfs_trans_handle *trans, 1884 struct btrfs_root *log) 1885 { 1886 int ret; 1887 1888 if (log->log_transid == 1) { 1889 /* insert root item on the first sync */ 1890 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 1891 &log->root_key, &log->root_item); 1892 } else { 1893 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 1894 &log->root_key, &log->root_item); 1895 } 1896 return ret; 1897 } 1898 1899 static int wait_log_commit(struct btrfs_trans_handle *trans, 1900 struct btrfs_root *root, unsigned long transid) 1901 { 1902 DEFINE_WAIT(wait); 1903 int index = transid % 2; 1904 1905 /* 1906 * we only allow two pending log transactions at a time, 1907 * so we know that if ours is more than 2 older than the 1908 * current transaction, we're done 1909 */ 1910 do { 1911 prepare_to_wait(&root->log_commit_wait[index], 1912 &wait, TASK_UNINTERRUPTIBLE); 1913 mutex_unlock(&root->log_mutex); 1914 1915 if (root->fs_info->last_trans_log_full_commit != 1916 trans->transid && root->log_transid < transid + 2 && 1917 atomic_read(&root->log_commit[index])) 1918 schedule(); 1919 1920 finish_wait(&root->log_commit_wait[index], &wait); 1921 mutex_lock(&root->log_mutex); 1922 } while (root->log_transid < transid + 2 && 1923 atomic_read(&root->log_commit[index])); 1924 return 0; 1925 } 1926 1927 static int wait_for_writer(struct btrfs_trans_handle *trans, 1928 struct btrfs_root *root) 1929 { 1930 DEFINE_WAIT(wait); 1931 while (atomic_read(&root->log_writers)) { 1932 prepare_to_wait(&root->log_writer_wait, 1933 &wait, TASK_UNINTERRUPTIBLE); 1934 mutex_unlock(&root->log_mutex); 1935 if (root->fs_info->last_trans_log_full_commit != 1936 trans->transid && atomic_read(&root->log_writers)) 1937 schedule(); 1938 mutex_lock(&root->log_mutex); 1939 finish_wait(&root->log_writer_wait, &wait); 1940 } 1941 return 0; 1942 } 1943 1944 /* 1945 * btrfs_sync_log sends a given tree log down to the disk and 1946 * updates the super blocks to record it. When this call is done, 1947 * you know that any inodes previously logged are safely on disk only 1948 * if it returns 0. 1949 * 1950 * Any other return value means you need to call btrfs_commit_transaction. 1951 * Some of the edge cases for fsyncing directories that have had unlinks 1952 * or renames done in the past mean that sometimes the only safe 1953 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 1954 * that has happened.
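* The commit itself is two phase: the subvolume's log tree is written first, then the log root tree that indexes it is written and recorded in the super block.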
1955 */ 1956 int btrfs_sync_log(struct btrfs_trans_handle *trans, 1957 struct btrfs_root *root) 1958 { 1959 int index1; 1960 int index2; 1961 int mark; 1962 int ret; 1963 struct btrfs_root *log = root->log_root; 1964 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1965 unsigned long log_transid = 0; 1966 1967 mutex_lock(&root->log_mutex); 1968 index1 = root->log_transid % 2; 1969 if (atomic_read(&root->log_commit[index1])) { 1970 wait_log_commit(trans, root, root->log_transid); 1971 mutex_unlock(&root->log_mutex); 1972 return 0; 1973 } 1974 atomic_set(&root->log_commit[index1], 1); 1975 1976 /* wait for previous tree log sync to complete */ 1977 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 1978 wait_log_commit(trans, root, root->log_transid - 1); 1979 1980 while (1) { 1981 unsigned long batch = root->log_batch; 1982 if (root->log_multiple_pids) { 1983 mutex_unlock(&root->log_mutex); 1984 schedule_timeout_uninterruptible(1); 1985 mutex_lock(&root->log_mutex); 1986 } 1987 wait_for_writer(trans, root); 1988 if (batch == root->log_batch) 1989 break; 1990 } 1991 1992 /* bail out if we need to do a full commit */ 1993 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 1994 ret = -EAGAIN; 1995 mutex_unlock(&root->log_mutex); 1996 goto out; 1997 } 1998 1999 log_transid = root->log_transid; 2000 if (log_transid % 2 == 0) 2001 mark = EXTENT_DIRTY; 2002 else 2003 mark = EXTENT_NEW; 2004 2005 /* we start IO on all the marked extents here, but we don't actually 2006 * wait for them until later. 2007 */ 2008 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2009 BUG_ON(ret); 2010 2011 btrfs_set_root_node(&log->root_item, log->node); 2012 2013 root->log_batch = 0; 2014 root->log_transid++; 2015 log->log_transid = root->log_transid; 2016 root->log_start_pid = 0; 2017 smp_mb(); 2018 /* 2019 * IO has been started, blocks of the log tree have WRITTEN flag set 2020 * in their headers. new modifications of the log will be written to 2021 * new positions. so it's safe to allow log writers to go in. 
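	 *
	 * (illustrative detail: the parity of log_transid picks the
	 * extent bits used to track this log transaction's pages,
	 * EXTENT_DIRTY for even transids and EXTENT_NEW for odd ones,
	 * so the two in-flight log transactions never wait on each
	 * other's pages)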
2022 */ 2023 mutex_unlock(&root->log_mutex); 2024 2025 mutex_lock(&log_root_tree->log_mutex); 2026 log_root_tree->log_batch++; 2027 atomic_inc(&log_root_tree->log_writers); 2028 mutex_unlock(&log_root_tree->log_mutex); 2029 2030 ret = update_log_root(trans, log); 2031 2032 mutex_lock(&log_root_tree->log_mutex); 2033 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2034 smp_mb(); 2035 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2036 wake_up(&log_root_tree->log_writer_wait); 2037 } 2038 2039 if (ret) { 2040 BUG_ON(ret != -ENOSPC); 2041 root->fs_info->last_trans_log_full_commit = trans->transid; 2042 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2043 mutex_unlock(&log_root_tree->log_mutex); 2044 ret = -EAGAIN; 2045 goto out; 2046 } 2047 2048 index2 = log_root_tree->log_transid % 2; 2049 if (atomic_read(&log_root_tree->log_commit[index2])) { 2050 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2051 wait_log_commit(trans, log_root_tree, 2052 log_root_tree->log_transid); 2053 mutex_unlock(&log_root_tree->log_mutex); 2054 ret = 0; 2055 goto out; 2056 } 2057 atomic_set(&log_root_tree->log_commit[index2], 1); 2058 2059 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2060 wait_log_commit(trans, log_root_tree, 2061 log_root_tree->log_transid - 1); 2062 } 2063 2064 wait_for_writer(trans, log_root_tree); 2065 2066 /* 2067 * now that we've moved on to the tree of log tree roots, 2068 * check the full commit flag again 2069 */ 2070 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2071 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2072 mutex_unlock(&log_root_tree->log_mutex); 2073 ret = -EAGAIN; 2074 goto out_wake_log_root; 2075 } 2076 2077 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2078 &log_root_tree->dirty_log_pages, 2079 EXTENT_DIRTY | EXTENT_NEW); 2080 BUG_ON(ret); 2081 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2082 2083 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2084 log_root_tree->node->start); 2085 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2086 btrfs_header_level(log_root_tree->node)); 2087 2088 log_root_tree->log_batch = 0; 2089 log_root_tree->log_transid++; 2090 smp_mb(); 2091 2092 mutex_unlock(&log_root_tree->log_mutex); 2093 2094 /* 2095 * nobody else is going to jump in and write the the ctree 2096 * super here because the log_commit atomic below is protecting 2097 * us. We must be called with a transaction handle pinning 2098 * the running transaction open, so a full commit can't hop 2099 * in and cause problems either. 
2100 */ 2101 write_ctree_super(trans, root->fs_info->tree_root, 1); 2102 ret = 0; 2103 2104 mutex_lock(&root->log_mutex); 2105 if (root->last_log_commit < log_transid) 2106 root->last_log_commit = log_transid; 2107 mutex_unlock(&root->log_mutex); 2108 2109 out_wake_log_root: 2110 atomic_set(&log_root_tree->log_commit[index2], 0); 2111 smp_mb(); 2112 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2113 wake_up(&log_root_tree->log_commit_wait[index2]); 2114 out: 2115 atomic_set(&root->log_commit[index1], 0); 2116 smp_mb(); 2117 if (waitqueue_active(&root->log_commit_wait[index1])) 2118 wake_up(&root->log_commit_wait[index1]); 2119 return ret; 2120 } 2121 2122 static void free_log_tree(struct btrfs_trans_handle *trans, 2123 struct btrfs_root *log) 2124 { 2125 int ret; 2126 u64 start; 2127 u64 end; 2128 struct walk_control wc = { 2129 .free = 1, 2130 .process_func = process_one_buffer 2131 }; 2132 2133 ret = walk_log_tree(trans, log, &wc); 2134 BUG_ON(ret); 2135 2136 while (1) { 2137 ret = find_first_extent_bit(&log->dirty_log_pages, 2138 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); 2139 if (ret) 2140 break; 2141 2142 clear_extent_bits(&log->dirty_log_pages, start, end, 2143 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2144 } 2145 2146 free_extent_buffer(log->node); 2147 kfree(log); 2148 } 2149 2150 /* 2151 * free all the extents used by the tree log. This should be called 2152 * at commit time of the full transaction 2153 */ 2154 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2155 { 2156 if (root->log_root) { 2157 free_log_tree(trans, root->log_root); 2158 root->log_root = NULL; 2159 } 2160 return 0; 2161 } 2162 2163 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 2164 struct btrfs_fs_info *fs_info) 2165 { 2166 if (fs_info->log_root_tree) { 2167 free_log_tree(trans, fs_info->log_root_tree); 2168 fs_info->log_root_tree = NULL; 2169 } 2170 return 0; 2171 } 2172 2173 /* 2174 * If both a file and directory are logged, and unlinks or renames are 2175 * mixed in, we have a few interesting corners: 2176 * 2177 * create file X in dir Y 2178 * link file X to X.link in dir Y 2179 * fsync file X 2180 * unlink file X but leave X.link 2181 * fsync dir Y 2182 * 2183 * After a crash we would expect only X.link to exist. But file X 2184 * didn't get fsync'd again so the log has back refs for X and X.link. 2185 * 2186 * We solve this by removing directory entries and inode backrefs from the 2187 * log when a file that was logged in the current transaction is 2188 * unlinked. Any later fsync will include the updated log entries, and 2189 * we'll be able to reconstruct the proper directory items from backrefs. 2190 * 2191 * This optimizations allows us to avoid relogging the entire inode 2192 * or the entire directory. 
2193 */ 2194 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2195 struct btrfs_root *root, 2196 const char *name, int name_len, 2197 struct inode *dir, u64 index) 2198 { 2199 struct btrfs_root *log; 2200 struct btrfs_dir_item *di; 2201 struct btrfs_path *path; 2202 int ret; 2203 int err = 0; 2204 int bytes_del = 0; 2205 2206 if (BTRFS_I(dir)->logged_trans < trans->transid) 2207 return 0; 2208 2209 ret = join_running_log_trans(root); 2210 if (ret) 2211 return 0; 2212 2213 mutex_lock(&BTRFS_I(dir)->log_mutex); 2214 2215 log = root->log_root; 2216 path = btrfs_alloc_path(); 2217 if (!path) 2218 return -ENOMEM; 2219 2220 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2221 name, name_len, -1); 2222 if (IS_ERR(di)) { 2223 err = PTR_ERR(di); 2224 goto fail; 2225 } 2226 if (di) { 2227 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2228 bytes_del += name_len; 2229 BUG_ON(ret); 2230 } 2231 btrfs_release_path(log, path); 2232 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2233 index, name, name_len, -1); 2234 if (IS_ERR(di)) { 2235 err = PTR_ERR(di); 2236 goto fail; 2237 } 2238 if (di) { 2239 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2240 bytes_del += name_len; 2241 BUG_ON(ret); 2242 } 2243 2244 /* update the directory size in the log to reflect the names 2245 * we have removed 2246 */ 2247 if (bytes_del) { 2248 struct btrfs_key key; 2249 2250 key.objectid = dir->i_ino; 2251 key.offset = 0; 2252 key.type = BTRFS_INODE_ITEM_KEY; 2253 btrfs_release_path(log, path); 2254 2255 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2256 if (ret < 0) { 2257 err = ret; 2258 goto fail; 2259 } 2260 if (ret == 0) { 2261 struct btrfs_inode_item *item; 2262 u64 i_size; 2263 2264 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2265 struct btrfs_inode_item); 2266 i_size = btrfs_inode_size(path->nodes[0], item); 2267 if (i_size > bytes_del) 2268 i_size -= bytes_del; 2269 else 2270 i_size = 0; 2271 btrfs_set_inode_size(path->nodes[0], item, i_size); 2272 btrfs_mark_buffer_dirty(path->nodes[0]); 2273 } else 2274 ret = 0; 2275 btrfs_release_path(log, path); 2276 } 2277 fail: 2278 btrfs_free_path(path); 2279 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2280 if (ret == -ENOSPC) { 2281 root->fs_info->last_trans_log_full_commit = trans->transid; 2282 ret = 0; 2283 } 2284 btrfs_end_log_trans(root); 2285 2286 return err; 2287 } 2288 2289 /* see comments for btrfs_del_dir_entries_in_log */ 2290 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2291 struct btrfs_root *root, 2292 const char *name, int name_len, 2293 struct inode *inode, u64 dirid) 2294 { 2295 struct btrfs_root *log; 2296 u64 index; 2297 int ret; 2298 2299 if (BTRFS_I(inode)->logged_trans < trans->transid) 2300 return 0; 2301 2302 ret = join_running_log_trans(root); 2303 if (ret) 2304 return 0; 2305 log = root->log_root; 2306 mutex_lock(&BTRFS_I(inode)->log_mutex); 2307 2308 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2309 dirid, &index); 2310 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2311 if (ret == -ENOSPC) { 2312 root->fs_info->last_trans_log_full_commit = trans->transid; 2313 ret = 0; 2314 } 2315 btrfs_end_log_trans(root); 2316 2317 return ret; 2318 } 2319 2320 /* 2321 * creates a range item in the log for 'dirid'. first_offset and 2322 * last_offset tell us which parts of the key space the log should 2323 * be considered authoritative for. 
2324 */ 2325 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2326 struct btrfs_root *log, 2327 struct btrfs_path *path, 2328 int key_type, u64 dirid, 2329 u64 first_offset, u64 last_offset) 2330 { 2331 int ret; 2332 struct btrfs_key key; 2333 struct btrfs_dir_log_item *item; 2334 2335 key.objectid = dirid; 2336 key.offset = first_offset; 2337 if (key_type == BTRFS_DIR_ITEM_KEY) 2338 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2339 else 2340 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2341 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2342 if (ret) 2343 return ret; 2344 2345 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2346 struct btrfs_dir_log_item); 2347 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2348 btrfs_mark_buffer_dirty(path->nodes[0]); 2349 btrfs_release_path(log, path); 2350 return 0; 2351 } 2352 2353 /* 2354 * log all the items included in the current transaction for a given 2355 * directory. This also creates the range items in the log tree required 2356 * to replay anything deleted before the fsync 2357 */ 2358 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2359 struct btrfs_root *root, struct inode *inode, 2360 struct btrfs_path *path, 2361 struct btrfs_path *dst_path, int key_type, 2362 u64 min_offset, u64 *last_offset_ret) 2363 { 2364 struct btrfs_key min_key; 2365 struct btrfs_key max_key; 2366 struct btrfs_root *log = root->log_root; 2367 struct extent_buffer *src; 2368 int err = 0; 2369 int ret; 2370 int i; 2371 int nritems; 2372 u64 first_offset = min_offset; 2373 u64 last_offset = (u64)-1; 2374 2375 log = root->log_root; 2376 max_key.objectid = inode->i_ino; 2377 max_key.offset = (u64)-1; 2378 max_key.type = key_type; 2379 2380 min_key.objectid = inode->i_ino; 2381 min_key.type = key_type; 2382 min_key.offset = min_offset; 2383 2384 path->keep_locks = 1; 2385 2386 ret = btrfs_search_forward(root, &min_key, &max_key, 2387 path, 0, trans->transid); 2388 2389 /* 2390 * we didn't find anything from this transaction, see if there 2391 * is anything at all 2392 */ 2393 if (ret != 0 || min_key.objectid != inode->i_ino || 2394 min_key.type != key_type) { 2395 min_key.objectid = inode->i_ino; 2396 min_key.type = key_type; 2397 min_key.offset = (u64)-1; 2398 btrfs_release_path(root, path); 2399 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2400 if (ret < 0) { 2401 btrfs_release_path(root, path); 2402 return ret; 2403 } 2404 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2405 2406 /* if ret == 0 there are items for this type, 2407 * create a range to tell us the last key of this type. 2408 * otherwise, there are no items in this directory after 2409 * *min_offset, and we create a range to indicate that. 
2410 */ 2411 if (ret == 0) { 2412 struct btrfs_key tmp; 2413 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2414 path->slots[0]); 2415 if (key_type == tmp.type) 2416 first_offset = max(min_offset, tmp.offset) + 1; 2417 } 2418 goto done; 2419 } 2420 2421 /* go backward to find any previous key */ 2422 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2423 if (ret == 0) { 2424 struct btrfs_key tmp; 2425 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2426 if (key_type == tmp.type) { 2427 first_offset = tmp.offset; 2428 ret = overwrite_item(trans, log, dst_path, 2429 path->nodes[0], path->slots[0], 2430 &tmp); 2431 if (ret) { 2432 err = ret; 2433 goto done; 2434 } 2435 } 2436 } 2437 btrfs_release_path(root, path); 2438 2439 /* find the first key from this transaction again */ 2440 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2441 if (ret != 0) { 2442 WARN_ON(1); 2443 goto done; 2444 } 2445 2446 /* 2447 * we have a block from this transaction, log every item in it 2448 * from our directory 2449 */ 2450 while (1) { 2451 struct btrfs_key tmp; 2452 src = path->nodes[0]; 2453 nritems = btrfs_header_nritems(src); 2454 for (i = path->slots[0]; i < nritems; i++) { 2455 btrfs_item_key_to_cpu(src, &min_key, i); 2456 2457 if (min_key.objectid != inode->i_ino || 2458 min_key.type != key_type) 2459 goto done; 2460 ret = overwrite_item(trans, log, dst_path, src, i, 2461 &min_key); 2462 if (ret) { 2463 err = ret; 2464 goto done; 2465 } 2466 } 2467 path->slots[0] = nritems; 2468 2469 /* 2470 * look ahead to the next item and see if it is also 2471 * from this directory and from this transaction 2472 */ 2473 ret = btrfs_next_leaf(root, path); 2474 if (ret == 1) { 2475 last_offset = (u64)-1; 2476 goto done; 2477 } 2478 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2479 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2480 last_offset = (u64)-1; 2481 goto done; 2482 } 2483 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2484 ret = overwrite_item(trans, log, dst_path, 2485 path->nodes[0], path->slots[0], 2486 &tmp); 2487 if (ret) 2488 err = ret; 2489 else 2490 last_offset = tmp.offset; 2491 goto done; 2492 } 2493 } 2494 done: 2495 btrfs_release_path(root, path); 2496 btrfs_release_path(log, dst_path); 2497 2498 if (err == 0) { 2499 *last_offset_ret = last_offset; 2500 /* 2501 * insert the log range keys to indicate where the log 2502 * is valid 2503 */ 2504 ret = insert_dir_log_key(trans, log, path, key_type, 2505 inode->i_ino, first_offset, 2506 last_offset); 2507 if (ret) 2508 err = ret; 2509 } 2510 return err; 2511 } 2512 2513 /* 2514 * logging directories is very similar to logging inodes, We find all the items 2515 * from the current transaction and write them to the log. 2516 * 2517 * The recovery code scans the directory in the subvolume, and if it finds a 2518 * key in the range logged that is not present in the log tree, then it means 2519 * that dir entry was unlinked during the transaction. 2520 * 2521 * In order for that scan to work, we must include one key smaller than 2522 * the smallest logged by this transaction and one key larger than the largest 2523 * key logged by this transaction. 
2524 */ 2525 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2526 struct btrfs_root *root, struct inode *inode, 2527 struct btrfs_path *path, 2528 struct btrfs_path *dst_path) 2529 { 2530 u64 min_key; 2531 u64 max_key; 2532 int ret; 2533 int key_type = BTRFS_DIR_ITEM_KEY; 2534 2535 again: 2536 min_key = 0; 2537 max_key = 0; 2538 while (1) { 2539 ret = log_dir_items(trans, root, inode, path, 2540 dst_path, key_type, min_key, 2541 &max_key); 2542 if (ret) 2543 return ret; 2544 if (max_key == (u64)-1) 2545 break; 2546 min_key = max_key + 1; 2547 } 2548 2549 if (key_type == BTRFS_DIR_ITEM_KEY) { 2550 key_type = BTRFS_DIR_INDEX_KEY; 2551 goto again; 2552 } 2553 return 0; 2554 } 2555 2556 /* 2557 * a helper function to drop items from the log before we relog an 2558 * inode. max_key_type indicates the highest item type to remove. 2559 * This cannot be run for file data extents because it does not 2560 * free the extents they point to. 2561 */ 2562 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2563 struct btrfs_root *log, 2564 struct btrfs_path *path, 2565 u64 objectid, int max_key_type) 2566 { 2567 int ret; 2568 struct btrfs_key key; 2569 struct btrfs_key found_key; 2570 2571 key.objectid = objectid; 2572 key.type = max_key_type; 2573 key.offset = (u64)-1; 2574 2575 while (1) { 2576 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2577 BUG_ON(ret == 0); 2578 if (ret < 0) 2579 break; 2580 2581 if (path->slots[0] == 0) 2582 break; 2583 2584 path->slots[0]--; 2585 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2586 path->slots[0]); 2587 2588 if (found_key.objectid != objectid) 2589 break; 2590 2591 ret = btrfs_del_item(trans, log, path); 2592 BUG_ON(ret); 2593 btrfs_release_path(log, path); 2594 } 2595 btrfs_release_path(log, path); 2596 return ret; 2597 } 2598 2599 static noinline int copy_items(struct btrfs_trans_handle *trans, 2600 struct btrfs_root *log, 2601 struct btrfs_path *dst_path, 2602 struct extent_buffer *src, 2603 int start_slot, int nr, int inode_only) 2604 { 2605 unsigned long src_offset; 2606 unsigned long dst_offset; 2607 struct btrfs_file_extent_item *extent; 2608 struct btrfs_inode_item *inode_item; 2609 int ret; 2610 struct btrfs_key *ins_keys; 2611 u32 *ins_sizes; 2612 char *ins_data; 2613 int i; 2614 struct list_head ordered_sums; 2615 2616 INIT_LIST_HEAD(&ordered_sums); 2617 2618 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2619 nr * sizeof(u32), GFP_NOFS); 2620 if (!ins_data) 2621 return -ENOMEM; 2622 2623 ins_sizes = (u32 *)ins_data; 2624 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2625 2626 for (i = 0; i < nr; i++) { 2627 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 2628 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 2629 } 2630 ret = btrfs_insert_empty_items(trans, log, dst_path, 2631 ins_keys, ins_sizes, nr); 2632 if (ret) { 2633 kfree(ins_data); 2634 return ret; 2635 } 2636 2637 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2638 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2639 dst_path->slots[0]); 2640 2641 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 2642 2643 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 2644 src_offset, ins_sizes[i]); 2645 2646 if (inode_only == LOG_INODE_EXISTS && 2647 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 2648 inode_item = btrfs_item_ptr(dst_path->nodes[0], 2649 dst_path->slots[0], 2650 struct btrfs_inode_item); 2651 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); 2652 2653 /* set the generation 
to zero so the recovery code
			 * can tell the difference between logging
			 * just to say 'this inode exists' and logging
			 * to say 'update this inode with these values'
			 */
			btrfs_set_inode_generation(dst_path->nodes[0],
						   inode_item, 0);
		}
		/* take a reference on file data extents so that truncates
		 * or deletes of this inode don't have to relog the inode
		 * again
		 */
		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
			int found_type;
			extent = btrfs_item_ptr(src, start_slot + i,
						struct btrfs_file_extent_item);

			found_type = btrfs_file_extent_type(src, extent);
			if (found_type == BTRFS_FILE_EXTENT_REG ||
			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				u64 ds, dl, cs, cl;
				ds = btrfs_file_extent_disk_bytenr(src,
								extent);
				/* ds == 0 is a hole */
				if (ds == 0)
					continue;

				dl = btrfs_file_extent_disk_num_bytes(src,
								extent);
				cs = btrfs_file_extent_offset(src, extent);
				cl = btrfs_file_extent_num_bytes(src,
								extent);
				if (btrfs_file_extent_compression(src,
								  extent)) {
					cs = 0;
					cl = dl;
				}

				ret = btrfs_lookup_csums_range(
						log->fs_info->csum_root,
						ds + cs, ds + cs + cl - 1,
						&ordered_sums);
				BUG_ON(ret);
			}
		}
	}

	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
	btrfs_release_path(log, dst_path);
	kfree(ins_data);

	/*
	 * we have to do this after the loop above to avoid changing the
	 * log tree while trying to change the log tree.
	 */
	ret = 0;
	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						   struct btrfs_ordered_sum,
						   list);
		if (!ret)
			ret = btrfs_csum_file_blocks(trans, log, sums);
		list_del(&sums->list);
		kfree(sums);
	}
	return ret;
}

/* log a single inode in the tree log.
 * At least one parent directory for this inode must exist in the tree
 * or be logged already.
 *
 * Any items from this inode changed by the current transaction are copied
 * to the log tree. An extra reference is taken on any extents in this
 * file, allowing us to avoid a whole pile of corner cases around logging
 * blocks that have been removed from the tree.
 *
 * See LOG_INODE_ALL and related defines for a description of what inode_only
 * does.
 *
 * This handles both files and directories.
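 *
 * (illustrative call, assuming the caller already holds a log
 * transaction via start_log_trans() or join_running_log_trans():
 *
 *	ret = btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
 *
 * parent directories are then relogged with LOG_INODE_EXISTS by
 * btrfs_log_inode_parent() below)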
2734 */ 2735 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 2736 struct btrfs_root *root, struct inode *inode, 2737 int inode_only) 2738 { 2739 struct btrfs_path *path; 2740 struct btrfs_path *dst_path; 2741 struct btrfs_key min_key; 2742 struct btrfs_key max_key; 2743 struct btrfs_root *log = root->log_root; 2744 struct extent_buffer *src = NULL; 2745 int err = 0; 2746 int ret; 2747 int nritems; 2748 int ins_start_slot = 0; 2749 int ins_nr; 2750 2751 log = root->log_root; 2752 2753 path = btrfs_alloc_path(); 2754 if (!path) 2755 return -ENOMEM; 2756 dst_path = btrfs_alloc_path(); 2757 if (!dst_path) { 2758 btrfs_free_path(path); 2759 return -ENOMEM; 2760 } 2761 2762 min_key.objectid = inode->i_ino; 2763 min_key.type = BTRFS_INODE_ITEM_KEY; 2764 min_key.offset = 0; 2765 2766 max_key.objectid = inode->i_ino; 2767 2768 /* today the code can only do partial logging of directories */ 2769 if (!S_ISDIR(inode->i_mode)) 2770 inode_only = LOG_INODE_ALL; 2771 2772 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 2773 max_key.type = BTRFS_XATTR_ITEM_KEY; 2774 else 2775 max_key.type = (u8)-1; 2776 max_key.offset = (u64)-1; 2777 2778 mutex_lock(&BTRFS_I(inode)->log_mutex); 2779 2780 /* 2781 * a brute force approach to making sure we get the most uptodate 2782 * copies of everything. 2783 */ 2784 if (S_ISDIR(inode->i_mode)) { 2785 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 2786 2787 if (inode_only == LOG_INODE_EXISTS) 2788 max_key_type = BTRFS_XATTR_ITEM_KEY; 2789 ret = drop_objectid_items(trans, log, path, 2790 inode->i_ino, max_key_type); 2791 } else { 2792 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2793 } 2794 if (ret) { 2795 err = ret; 2796 goto out_unlock; 2797 } 2798 path->keep_locks = 1; 2799 2800 while (1) { 2801 ins_nr = 0; 2802 ret = btrfs_search_forward(root, &min_key, &max_key, 2803 path, 0, trans->transid); 2804 if (ret != 0) 2805 break; 2806 again: 2807 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2808 if (min_key.objectid != inode->i_ino) 2809 break; 2810 if (min_key.type > max_key.type) 2811 break; 2812 2813 src = path->nodes[0]; 2814 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2815 ins_nr++; 2816 goto next_slot; 2817 } else if (!ins_nr) { 2818 ins_start_slot = path->slots[0]; 2819 ins_nr = 1; 2820 goto next_slot; 2821 } 2822 2823 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2824 ins_nr, inode_only); 2825 if (ret) { 2826 err = ret; 2827 goto out_unlock; 2828 } 2829 ins_nr = 1; 2830 ins_start_slot = path->slots[0]; 2831 next_slot: 2832 2833 nritems = btrfs_header_nritems(path->nodes[0]); 2834 path->slots[0]++; 2835 if (path->slots[0] < nritems) { 2836 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 2837 path->slots[0]); 2838 goto again; 2839 } 2840 if (ins_nr) { 2841 ret = copy_items(trans, log, dst_path, src, 2842 ins_start_slot, 2843 ins_nr, inode_only); 2844 if (ret) { 2845 err = ret; 2846 goto out_unlock; 2847 } 2848 ins_nr = 0; 2849 } 2850 btrfs_release_path(root, path); 2851 2852 if (min_key.offset < (u64)-1) 2853 min_key.offset++; 2854 else if (min_key.type < (u8)-1) 2855 min_key.type++; 2856 else if (min_key.objectid < (u64)-1) 2857 min_key.objectid++; 2858 else 2859 break; 2860 } 2861 if (ins_nr) { 2862 ret = copy_items(trans, log, dst_path, src, 2863 ins_start_slot, 2864 ins_nr, inode_only); 2865 if (ret) { 2866 err = ret; 2867 goto out_unlock; 2868 } 2869 ins_nr = 0; 2870 } 2871 WARN_ON(ins_nr); 2872 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2873 
btrfs_release_path(root, path); 2874 btrfs_release_path(log, dst_path); 2875 ret = log_directory_changes(trans, root, inode, path, dst_path); 2876 if (ret) { 2877 err = ret; 2878 goto out_unlock; 2879 } 2880 } 2881 BTRFS_I(inode)->logged_trans = trans->transid; 2882 out_unlock: 2883 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2884 2885 btrfs_free_path(path); 2886 btrfs_free_path(dst_path); 2887 return err; 2888 } 2889 2890 /* 2891 * follow the dentry parent pointers up the chain and see if any 2892 * of the directories in it require a full commit before they can 2893 * be logged. Returns zero if nothing special needs to be done or 1 if 2894 * a full commit is required. 2895 */ 2896 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 2897 struct inode *inode, 2898 struct dentry *parent, 2899 struct super_block *sb, 2900 u64 last_committed) 2901 { 2902 int ret = 0; 2903 struct btrfs_root *root; 2904 struct dentry *old_parent = NULL; 2905 2906 /* 2907 * for regular files, if its inode is already on disk, we don't 2908 * have to worry about the parents at all. This is because 2909 * we can use the last_unlink_trans field to record renames 2910 * and other fun in this file. 2911 */ 2912 if (S_ISREG(inode->i_mode) && 2913 BTRFS_I(inode)->generation <= last_committed && 2914 BTRFS_I(inode)->last_unlink_trans <= last_committed) 2915 goto out; 2916 2917 if (!S_ISDIR(inode->i_mode)) { 2918 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2919 goto out; 2920 inode = parent->d_inode; 2921 } 2922 2923 while (1) { 2924 BTRFS_I(inode)->logged_trans = trans->transid; 2925 smp_mb(); 2926 2927 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 2928 root = BTRFS_I(inode)->root; 2929 2930 /* 2931 * make sure any commits to the log are forced 2932 * to be full commits 2933 */ 2934 root->fs_info->last_trans_log_full_commit = 2935 trans->transid; 2936 ret = 1; 2937 break; 2938 } 2939 2940 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 2941 break; 2942 2943 if (IS_ROOT(parent)) 2944 break; 2945 2946 parent = dget_parent(parent); 2947 dput(old_parent); 2948 old_parent = parent; 2949 inode = parent->d_inode; 2950 2951 } 2952 dput(old_parent); 2953 out: 2954 return ret; 2955 } 2956 2957 static int inode_in_log(struct btrfs_trans_handle *trans, 2958 struct inode *inode) 2959 { 2960 struct btrfs_root *root = BTRFS_I(inode)->root; 2961 int ret = 0; 2962 2963 mutex_lock(&root->log_mutex); 2964 if (BTRFS_I(inode)->logged_trans == trans->transid && 2965 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) 2966 ret = 1; 2967 mutex_unlock(&root->log_mutex); 2968 return ret; 2969 } 2970 2971 2972 /* 2973 * helper function around btrfs_log_inode to make sure newly created 2974 * parent directories also end up in the log. A minimal inode and backref 2975 * only logging is done of any parent directories that are older than 2976 * the last committed transaction 2977 */ 2978 int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 2979 struct btrfs_root *root, struct inode *inode, 2980 struct dentry *parent, int exists_only) 2981 { 2982 int inode_only = exists_only ? 
LOG_INODE_EXISTS : LOG_INODE_ALL; 2983 struct super_block *sb; 2984 struct dentry *old_parent = NULL; 2985 int ret = 0; 2986 u64 last_committed = root->fs_info->last_trans_committed; 2987 2988 sb = inode->i_sb; 2989 2990 if (btrfs_test_opt(root, NOTREELOG)) { 2991 ret = 1; 2992 goto end_no_trans; 2993 } 2994 2995 if (root->fs_info->last_trans_log_full_commit > 2996 root->fs_info->last_trans_committed) { 2997 ret = 1; 2998 goto end_no_trans; 2999 } 3000 3001 if (root != BTRFS_I(inode)->root || 3002 btrfs_root_refs(&root->root_item) == 0) { 3003 ret = 1; 3004 goto end_no_trans; 3005 } 3006 3007 ret = check_parent_dirs_for_sync(trans, inode, parent, 3008 sb, last_committed); 3009 if (ret) 3010 goto end_no_trans; 3011 3012 if (inode_in_log(trans, inode)) { 3013 ret = BTRFS_NO_LOG_SYNC; 3014 goto end_no_trans; 3015 } 3016 3017 ret = start_log_trans(trans, root); 3018 if (ret) 3019 goto end_trans; 3020 3021 ret = btrfs_log_inode(trans, root, inode, inode_only); 3022 if (ret) 3023 goto end_trans; 3024 3025 /* 3026 * for regular files, if its inode is already on disk, we don't 3027 * have to worry about the parents at all. This is because 3028 * we can use the last_unlink_trans field to record renames 3029 * and other fun in this file. 3030 */ 3031 if (S_ISREG(inode->i_mode) && 3032 BTRFS_I(inode)->generation <= last_committed && 3033 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 3034 ret = 0; 3035 goto end_trans; 3036 } 3037 3038 inode_only = LOG_INODE_EXISTS; 3039 while (1) { 3040 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3041 break; 3042 3043 inode = parent->d_inode; 3044 if (root != BTRFS_I(inode)->root) 3045 break; 3046 3047 if (BTRFS_I(inode)->generation > 3048 root->fs_info->last_trans_committed) { 3049 ret = btrfs_log_inode(trans, root, inode, inode_only); 3050 if (ret) 3051 goto end_trans; 3052 } 3053 if (IS_ROOT(parent)) 3054 break; 3055 3056 parent = dget_parent(parent); 3057 dput(old_parent); 3058 old_parent = parent; 3059 } 3060 ret = 0; 3061 end_trans: 3062 dput(old_parent); 3063 if (ret < 0) { 3064 BUG_ON(ret != -ENOSPC); 3065 root->fs_info->last_trans_log_full_commit = trans->transid; 3066 ret = 1; 3067 } 3068 btrfs_end_log_trans(root); 3069 end_no_trans: 3070 return ret; 3071 } 3072 3073 /* 3074 * it is not safe to log dentry if the chunk root has added new 3075 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 3076 * If this returns 1, you must commit the transaction to safely get your 3077 * data on disk. 
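 *
 * A hedged sketch of the expected fsync-path usage (not the verbatim
 * caller, which also special-cases BTRFS_NO_LOG_SYNC):
 *
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root);
 *	if (ret)
 *		btrfs_commit_transaction(trans, root);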
3078 */ 3079 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3080 struct btrfs_root *root, struct dentry *dentry) 3081 { 3082 struct dentry *parent = dget_parent(dentry); 3083 int ret; 3084 3085 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 3086 dput(parent); 3087 3088 return ret; 3089 } 3090 3091 /* 3092 * should be called during mount to recover any replay any log trees 3093 * from the FS 3094 */ 3095 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 3096 { 3097 int ret; 3098 struct btrfs_path *path; 3099 struct btrfs_trans_handle *trans; 3100 struct btrfs_key key; 3101 struct btrfs_key found_key; 3102 struct btrfs_key tmp_key; 3103 struct btrfs_root *log; 3104 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 3105 struct walk_control wc = { 3106 .process_func = process_one_buffer, 3107 .stage = 0, 3108 }; 3109 3110 fs_info->log_root_recovering = 1; 3111 path = btrfs_alloc_path(); 3112 BUG_ON(!path); 3113 3114 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3115 BUG_ON(IS_ERR(trans)); 3116 3117 wc.trans = trans; 3118 wc.pin = 1; 3119 3120 walk_log_tree(trans, log_root_tree, &wc); 3121 3122 again: 3123 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3124 key.offset = (u64)-1; 3125 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 3126 3127 while (1) { 3128 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 3129 if (ret < 0) 3130 break; 3131 if (ret > 0) { 3132 if (path->slots[0] == 0) 3133 break; 3134 path->slots[0]--; 3135 } 3136 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3137 path->slots[0]); 3138 btrfs_release_path(log_root_tree, path); 3139 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3140 break; 3141 3142 log = btrfs_read_fs_root_no_radix(log_root_tree, 3143 &found_key); 3144 BUG_ON(!log); 3145 3146 3147 tmp_key.objectid = found_key.offset; 3148 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3149 tmp_key.offset = (u64)-1; 3150 3151 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3152 BUG_ON(!wc.replay_dest); 3153 3154 wc.replay_dest->log_root = log; 3155 btrfs_record_root_in_trans(trans, wc.replay_dest); 3156 ret = walk_log_tree(trans, log, &wc); 3157 BUG_ON(ret); 3158 3159 if (wc.stage == LOG_WALK_REPLAY_ALL) { 3160 ret = fixup_inode_link_counts(trans, wc.replay_dest, 3161 path); 3162 BUG_ON(ret); 3163 } 3164 3165 key.offset = found_key.offset - 1; 3166 wc.replay_dest->log_root = NULL; 3167 free_extent_buffer(log->node); 3168 free_extent_buffer(log->commit_root); 3169 kfree(log); 3170 3171 if (found_key.offset == 0) 3172 break; 3173 } 3174 btrfs_release_path(log_root_tree, path); 3175 3176 /* step one is to pin it all, step two is to replay just inodes */ 3177 if (wc.pin) { 3178 wc.pin = 0; 3179 wc.process_func = replay_one_buffer; 3180 wc.stage = LOG_WALK_REPLAY_INODES; 3181 goto again; 3182 } 3183 /* step three is to replay everything */ 3184 if (wc.stage < LOG_WALK_REPLAY_ALL) { 3185 wc.stage++; 3186 goto again; 3187 } 3188 3189 btrfs_free_path(path); 3190 3191 free_extent_buffer(log_root_tree->node); 3192 log_root_tree->log_root = NULL; 3193 fs_info->log_root_recovering = 0; 3194 3195 /* step 4: commit the transaction, which also unpins the blocks */ 3196 btrfs_commit_transaction(trans, fs_info->tree_root); 3197 3198 kfree(log_root_tree); 3199 return 0; 3200 } 3201 3202 /* 3203 * there are some corner cases where we want to force a full 3204 * commit instead of allowing a directory to be logged. 
3205 * 3206 * They revolve around files there were unlinked from the directory, and 3207 * this function updates the parent directory so that a full commit is 3208 * properly done if it is fsync'd later after the unlinks are done. 3209 */ 3210 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 3211 struct inode *dir, struct inode *inode, 3212 int for_rename) 3213 { 3214 /* 3215 * when we're logging a file, if it hasn't been renamed 3216 * or unlinked, and its inode is fully committed on disk, 3217 * we don't have to worry about walking up the directory chain 3218 * to log its parents. 3219 * 3220 * So, we use the last_unlink_trans field to put this transid 3221 * into the file. When the file is logged we check it and 3222 * don't log the parents if the file is fully on disk. 3223 */ 3224 if (S_ISREG(inode->i_mode)) 3225 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3226 3227 /* 3228 * if this directory was already logged any new 3229 * names for this file/dir will get recorded 3230 */ 3231 smp_mb(); 3232 if (BTRFS_I(dir)->logged_trans == trans->transid) 3233 return; 3234 3235 /* 3236 * if the inode we're about to unlink was logged, 3237 * the log will be properly updated for any new names 3238 */ 3239 if (BTRFS_I(inode)->logged_trans == trans->transid) 3240 return; 3241 3242 /* 3243 * when renaming files across directories, if the directory 3244 * there we're unlinking from gets fsync'd later on, there's 3245 * no way to find the destination directory later and fsync it 3246 * properly. So, we have to be conservative and force commits 3247 * so the new name gets discovered. 3248 */ 3249 if (for_rename) 3250 goto record; 3251 3252 /* we can safely do the unlink without any special recording */ 3253 return; 3254 3255 record: 3256 BTRFS_I(dir)->last_unlink_trans = trans->transid; 3257 } 3258 3259 /* 3260 * Call this after adding a new name for a file and it will properly 3261 * update the log to reflect the new name. 3262 * 3263 * It will return zero if all goes well, and it will return 1 if a 3264 * full transaction commit is required. 3265 */ 3266 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 3267 struct inode *inode, struct inode *old_dir, 3268 struct dentry *parent) 3269 { 3270 struct btrfs_root * root = BTRFS_I(inode)->root; 3271 3272 /* 3273 * this will force the logging code to walk the dentry chain 3274 * up for the file 3275 */ 3276 if (S_ISREG(inode->i_mode)) 3277 BTRFS_I(inode)->last_unlink_trans = trans->transid; 3278 3279 /* 3280 * if this inode hasn't been logged and directory we're renaming it 3281 * from hasn't been logged, we don't need to log it 3282 */ 3283 if (BTRFS_I(inode)->logged_trans <= 3284 root->fs_info->last_trans_committed && 3285 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 3286 root->fs_info->last_trans_committed)) 3287 return 0; 3288 3289 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 3290 } 3291 3292