/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compat.h"
#include "tree-log.h"
#include "hash.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY 0
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root); that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in RAM, once to create all the inodes logged in the tree, and once
 * to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root)
{
	int ret;
	int err = 0;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		if (!root->log_start_pid) {
			root->log_start_pid = current->pid;
			root->log_multiple_pids = false;
		} else if (root->log_start_pid != current->pid) {
			root->log_multiple_pids = true;
		}

		atomic_inc(&root->log_batch);
		atomic_inc(&root->log_writers);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	root->log_multiple_pids = false;
	root->log_start_pid = current->pid;
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree) {
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
		if (ret)
			err = ret;
	}
	if (err == 0 && !root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			err = ret;
	}
	mutex_unlock(&root->fs_info->tree_log_mutex);
	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return err;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction in
 * progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	/* pinning the log always succeeds */
	return 0;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
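
/*
 * Every path that bumps root->log_writers (start_log_trans,
 * join_running_log_trans, btrfs_pin_log_trans) must be balanced by a
 * matching btrfs_end_log_trans() call, e.g. (illustrative):
 *
 *	if (join_running_log_trans(root) == 0) {
 *		... mutate the log tree ...
 *		btrfs_end_log_trans(root);
 *	}
 */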

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	if (wc->pin)
		btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						eb->start, eb->len);

	if (btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return 0;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(trans, root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(trans, root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0)
			goto no_copy;

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 saved_nbytes;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	saved_nbytes = inode_get_bytes(inode);
	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	BUG_ON(ret);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		BUG_ON(ret);
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_extent(root, ins.objectid,
						  ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				BUG_ON(ret);
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				BUG_ON(ret);
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			BUG_ON(ret);
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				BUG_ON(ret);
				list_del(&sums->list);
				kfree(sums);
			}
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		BUG_ON(ret);
	}

	inode_set_bytes(inode, saved_nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		kfree(name);
		return -EIO;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	BUG_ON(ret);

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	BUG_ON(ret);
	kfree(name);

	iput(inode);

	btrfs_run_delayed_items(trans, root);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct inode *dir, struct inode *inode,
				  struct extent_buffer *eb,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			BUG_ON(!victim_name);

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				btrfs_inc_nlink(inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				BUG_ON(ret);
				btrfs_run_delayed_items(trans, root);
				kfree(victim_name);
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}
		BUG_ON(ret);

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			/* cur_offset is a byte offset into the item */
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					btrfs_inc_nlink(inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
								 victim_parent,
								 inode,
								 victim_name,
								 victim_name_len);
					btrfs_run_delayed_items(trans, root);
				}
				BUG_ON(ret);
				iput(victim_parent);
				kfree(victim_name);
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			BUG_ON(ret);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		BUG_ON(ret);
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		BUG_ON(ret);
	}
	btrfs_release_path(path);

	return 0;
}
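
/*
 * Layout reminder for the two parsers below: an INODE_REF item body is
 * a packed sequence of
 *
 *	struct btrfs_inode_ref { index; name_len; } + name bytes
 *
 * repeated once per link, while an INODE_EXTREF body is a sequence of
 *
 *	struct btrfs_inode_extref { parent; index; name_len; } + name bytes
 */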

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir;
	struct inode *inode;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir)
		return -ENOENT;

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		iput(dir);
		return -EIO;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				/* drop our references instead of leaking them
				 * with a bare return */
				kfree(name);
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				  ref_index, name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      dir, inode, eb,
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret == 1) {
					ret = 0;
					goto out;
				}
				BUG_ON(ret);
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, name, namelen,
					     0, ref_index);
			BUG_ON(ret);

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
	BUG_ON(ret);
	ret = 0;

out:
	btrfs_release_path(path);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 offset)
{
	int ret;
	ret = btrfs_find_orphan_item(root, offset);
	if (ret > 0)
		ret = btrfs_insert_orphan_item(trans, root, offset);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		/* restart at the front of each extref item we find */
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	if (ret == -ENOENT)
		ret = 0;

	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			BUG_ON(ret);
		}
		ret = insert_orphan_item(trans, root, ino);
		BUG_ON(ret);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		BUG_ON(ret);

		iput(inode);

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
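
/*
 * Fixup entries are keyed as (see link_to_fixup_dir below):
 *
 *	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID
 *	key.type     = BTRFS_ORPHAN_ITEM_KEY
 *	key.offset   = objectid of the inode to check
 */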

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			btrfs_inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG();
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}
	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		iput(dir);
		return -ENOMEM;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		BUG();
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	BUG_ON(ret);

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	return 0;

insert:
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);

	BUG_ON(ret && ret != -ENOENT);
	goto out;
}

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		BUG_ON(ret);
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
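
/*
 * For example (illustrative): a dir range item with key.offset == 3
 * and dir_log_end == 9 says the log is authoritative for offsets 3
 * through 9 of that directory; a subvolume entry in that range with no
 * matching name in the log was deleted before the fsync and must be
 * removed at replay time.
 */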

/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di)) {
			ret = -EIO;
			goto out;
		}

		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				   name_len);
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
							     log_path,
							     dir_key->objectid,
							     dir_key->offset,
							     name, name_len, 0);
		}
		if (IS_ERR_OR_NULL(log_di)) {
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(path);
			btrfs_release_path(log_path);
			inode = read_one_inode(root, location.objectid);
			if (!inode) {
				kfree(name);
				return -EIO;
			}

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			BUG_ON(ret);
			btrfs_inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			BUG_ON(ret);

			btrfs_run_delayed_items(trans, root);

			kfree(name);
			iput(inode);

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		}
		btrfs_release_path(log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	return ret;
}

/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that the log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			if (ret != 0)
				break;
		}

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			BUG_ON(ret);
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(path);
		goto again;
	}
out:
	btrfs_release_path(path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

/*
 * the process_func used to replay items from the log tree.  This
 * gets called in two different stages.  The first stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The second stage copies all the other item types from the log into
 * the subvolume.  The two stage approach is slower, but gets rid of
 * lots of complexity around inodes referencing other inodes that exist
 * only in the log (references come from either directory items or inode
 * back refs).
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int level;
	int i;
	int ret;

	ret = btrfs_read_buffer(eb, gen);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					    struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				BUG_ON(ret);
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);

			/* for regular files, make sure the corresponding
			 * orphan item exists.  extents past the new EOF
			 * will be truncated later by orphan cleanup.
			 */
			if (S_ISREG(mode)) {
				ret = insert_orphan_item(wc->trans, root,
							 key.objectid);
				BUG_ON(ret);
			}

			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			BUG_ON(ret);
		}
		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_INODE_REF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			BUG_ON(ret && ret != -ENOENT);
		} else if (key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			BUG_ON(ret && ret != -ENOENT);
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			BUG_ON(ret);
		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
			   key.type == BTRFS_DIR_INDEX_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			BUG_ON(ret);
		}
	}
	btrfs_free_path(path);
	return 0;
}

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_path *path, int *level,
				       struct walk_control *wc)
{
	u64 root_owner;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
		if (!next)
			return -ENOMEM;

		if (*level == 1) {
			ret = wc->process_func(root, next, wc, ptr_gen);
			if (ret)
				return ret;

			path->slots[*level]++;
			if (wc->free) {
				ret = btrfs_read_buffer(next, ptr_gen);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}

				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
							 bytenr, blocksize);
				BUG_ON(ret); /* -ENOMEM or logic errors */
			}
			free_extent_buffer(next);
			continue;
		}
		ret = btrfs_read_buffer(next, ptr_gen);
		if (ret) {
			free_extent_buffer(next);
			return ret;
		}

		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

	cond_resched();
	return 0;
}

static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path, int *level,
				 struct walk_control *wc)
{
	u64 root_owner;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			ret = wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (ret)
				return ret;

			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				BUG_ON(ret);
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}

/*
 * walk up and down the log tree rooted at 'log', calling wc->process_func
 * on every buffer.  When wc->free is set, processed blocks are also
 * cleaned and their reserved extents freed.
 */
static int walk_log_tree(struct btrfs_trans_handle *trans,
			 struct btrfs_root *log, struct walk_control *wc)
{
	int ret = 0;
	int wret;
	int level;
	struct btrfs_path *path;
	int i;
	int orig_level;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(log->node);
	orig_level = level;
	path->nodes[level] = log->node;
	extent_buffer_get(log->node);
	path->slots[level] = 0;

	while (1) {
		wret = walk_down_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}

		wret = walk_up_log_tree(trans, log, path, &level, wc);
		if (wret > 0)
			break;
		if (wret < 0) {
			ret = wret;
			goto out;
		}
	}

	/* was the root node processed? if not, catch it here */
	if (path->nodes[orig_level]) {
		ret = wc->process_func(log, path->nodes[orig_level], wc,
			 btrfs_header_generation(path->nodes[orig_level]));
		if (ret)
			goto out;
		if (wc->free) {
			struct extent_buffer *next;

			next = path->nodes[orig_level];

			btrfs_tree_lock(next);
			btrfs_set_lock_blocking(next);
			clean_tree_block(trans, log, next);
			btrfs_wait_tree_block_writeback(next);
			btrfs_tree_unlock(next);

			WARN_ON(log->root_key.objectid !=
				BTRFS_TREE_LOG_OBJECTID);
			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
							 next->len);
			BUG_ON(ret); /* -ENOMEM or logic errors */
		}
	}

out:
	for (i = 0; i <= orig_level; i++) {
		if (path->nodes[i]) {
			free_extent_buffer(path->nodes[i]);
			path->nodes[i] = NULL;
		}
	}
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to update the item for a given subvolume's log root
 * in the tree of log roots
 */
static int update_log_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *log)
{
	int ret;

	if (log->log_transid == 1) {
		/* insert root item on the first sync */
		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
				&log->root_key, &log->root_item);
	} else {
		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				&log->root_key, &log->root_item);
	}
	return ret;
}

static int wait_log_commit(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, unsigned long transid)
{
	DEFINE_WAIT(wait);
	int index = transid % 2;

	/*
	 * we only allow two pending log transactions at a time,
	 * so we know that if ours is more than 2 older than the
	 * current transaction, we're done
	 */
	do {
		prepare_to_wait(&root->log_commit_wait[index],
				&wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);

		if (root->fs_info->last_trans_log_full_commit !=
		    trans->transid && root->log_transid < transid + 2 &&
		    atomic_read(&root->log_commit[index]))
			schedule();

		finish_wait(&root->log_commit_wait[index], &wait);
		mutex_lock(&root->log_mutex);
	} while (root->fs_info->last_trans_log_full_commit !=
		 trans->transid && root->log_transid < transid + 2 &&
		 atomic_read(&root->log_commit[index]));
	return 0;
}
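/*
 * Log commits are double buffered: root->log_commit[transid % 2] flags
 * the slot that is currently committing, so one log transaction can be
 * written out while the next one is being filled.  wait_log_commit()
 * above sleeps until the slot for the given transid has finished (or a
 * full transaction commit has been forced).
 */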
static void wait_for_writer(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	DEFINE_WAIT(wait);
	while (root->fs_info->last_trans_log_full_commit !=
	       trans->transid && atomic_read(&root->log_writers)) {
		prepare_to_wait(&root->log_writer_wait,
				&wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(&root->log_mutex);
		if (root->fs_info->last_trans_log_full_commit !=
		    trans->transid && atomic_read(&root->log_writers))
			schedule();
		mutex_lock(&root->log_mutex);
		finish_wait(&root->log_writer_wait, &wait);
	}
}

/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it.  When this call is done,
 * you know that any inodes previously logged are safely on disk only
 * if it returns 0.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
	unsigned long log_transid = 0;

	mutex_lock(&root->log_mutex);
	log_transid = root->log_transid;
	index1 = root->log_transid % 2;
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(trans, root, root->log_transid);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(trans, root, root->log_transid - 1);
	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out;
		 * otherwise wait a bit so more log writers can batch in
		 */
		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(trans, root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		ret = -EAGAIN;
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
	 */
	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	btrfs_set_root_node(&log->root_item, log->node);

	root->log_transid++;
	log->log_transid = root->log_transid;
	root->log_start_pid = 0;
	smp_mb();
	/*
	 * IO has been started, blocks of the log tree have WRITTEN flag set
	 * in their headers. new modifications of the log will be written to
	 * new positions. so it's safe to allow log writers to go in.
	 */
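	/*
	 * From here on we work on the log root tree, which holds one root
	 * item per subvolume log; update_log_root() below records our new
	 * log root in it, and the same double-buffered commit scheme is
	 * then applied to the log root tree itself.
	 */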
	mutex_unlock(&root->log_mutex);

	mutex_lock(&log_root_tree->log_mutex);
	atomic_inc(&log_root_tree->log_batch);
	atomic_inc(&log_root_tree->log_writers);
	mutex_unlock(&log_root_tree->log_mutex);

	ret = update_log_root(trans, log);

	mutex_lock(&log_root_tree->log_mutex);
	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
		smp_mb();
		if (waitqueue_active(&log_root_tree->log_writer_wait))
			wake_up(&log_root_tree->log_writer_wait);
	}

	if (ret) {
		if (ret != -ENOSPC) {
			btrfs_abort_transaction(trans, root, ret);
			mutex_unlock(&log_root_tree->log_mutex);
			goto out;
		}
		root->fs_info->last_trans_log_full_commit = trans->transid;
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out;
	}

	index2 = log_root_tree->log_transid % 2;
	if (atomic_read(&log_root_tree->log_commit[index2])) {
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		wait_log_commit(trans, log_root_tree,
				log_root_tree->log_transid);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = 0;
		goto out;
	}
	atomic_set(&log_root_tree->log_commit[index2], 1);

	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
		wait_log_commit(trans, log_root_tree,
				log_root_tree->log_transid - 1);
	}

	wait_for_writer(trans, log_root_tree);

	/*
	 * now that we've moved on to the tree of log tree roots,
	 * check the full commit flag again
	 */
	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out_wake_log_root;
	}

	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
				&log_root_tree->dirty_log_pages,
				EXTENT_DIRTY | EXTENT_NEW);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
	btrfs_wait_logged_extents(log, log_transid);

	btrfs_set_super_log_root(root->fs_info->super_for_commit,
				 log_root_tree->node->start);
	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
				       btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	smp_mb();

	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us.  We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
	 */
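	/*
	 * Writing the super block is what makes the log commit durable:
	 * after this, a crash will find the new log root through the super
	 * block's log_root pointer and replay it on the next mount.
	 */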
	btrfs_scrub_pause_super(root);
	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
	btrfs_scrub_continue_super(root);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	atomic_set(&log_root_tree->log_commit[index2], 0);
	smp_mb();
	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	atomic_set(&root->log_commit[index1], 0);
	smp_mb();
	if (waitqueue_active(&root->log_commit_wait[index1]))
		wake_up(&root->log_commit_wait[index1]);
	return ret;
}

static void free_log_tree(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log)
{
	int ret;
	u64 start;
	u64 end;
	struct walk_control wc = {
		.free = 1,
		.process_func = process_one_buffer
	};

	if (trans) {
		ret = walk_log_tree(trans, log, &wc);
		BUG_ON(ret);
	}

	while (1) {
		ret = find_first_extent_bit(&log->dirty_log_pages,
				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
				NULL);
		if (ret)
			break;

		clear_extent_bits(&log->dirty_log_pages, start, end,
				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
	}

	/*
	 * We may have short-circuited the log tree with the full commit logic
	 * and left ordered extents on our list, so clear these out to keep us
	 * from leaking inodes and memory.
	 */
	btrfs_free_logged_extents(log, 0);
	btrfs_free_logged_extents(log, 1);

	free_extent_buffer(log->node);
	kfree(log);
}

/*
 * free all the extents used by the tree log.  This should be called
 * at commit time of the full transaction
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	if (root->log_root) {
		free_log_tree(trans, root->log_root);
		root->log_root = NULL;
	}
	return 0;
}

int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	if (fs_info->log_root_tree) {
		free_log_tree(trans, fs_info->log_root_tree);
		fs_info->log_root_tree = NULL;
	}
	return 0;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist.  But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked.  Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
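 *
 * Concretely, the unlink of file X above ends up calling
 * btrfs_del_dir_entries_in_log() below to drop X's dir items from the
 * log of dir Y, and btrfs_del_inode_ref_in_log() to drop the back
 * reference, so a later fsync of Y writes a log that matches the
 * subvolume.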
2543 */ 2544 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2545 struct btrfs_root *root, 2546 const char *name, int name_len, 2547 struct inode *dir, u64 index) 2548 { 2549 struct btrfs_root *log; 2550 struct btrfs_dir_item *di; 2551 struct btrfs_path *path; 2552 int ret; 2553 int err = 0; 2554 int bytes_del = 0; 2555 u64 dir_ino = btrfs_ino(dir); 2556 2557 if (BTRFS_I(dir)->logged_trans < trans->transid) 2558 return 0; 2559 2560 ret = join_running_log_trans(root); 2561 if (ret) 2562 return 0; 2563 2564 mutex_lock(&BTRFS_I(dir)->log_mutex); 2565 2566 log = root->log_root; 2567 path = btrfs_alloc_path(); 2568 if (!path) { 2569 err = -ENOMEM; 2570 goto out_unlock; 2571 } 2572 2573 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2574 name, name_len, -1); 2575 if (IS_ERR(di)) { 2576 err = PTR_ERR(di); 2577 goto fail; 2578 } 2579 if (di) { 2580 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2581 bytes_del += name_len; 2582 BUG_ON(ret); 2583 } 2584 btrfs_release_path(path); 2585 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2586 index, name, name_len, -1); 2587 if (IS_ERR(di)) { 2588 err = PTR_ERR(di); 2589 goto fail; 2590 } 2591 if (di) { 2592 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2593 bytes_del += name_len; 2594 BUG_ON(ret); 2595 } 2596 2597 /* update the directory size in the log to reflect the names 2598 * we have removed 2599 */ 2600 if (bytes_del) { 2601 struct btrfs_key key; 2602 2603 key.objectid = dir_ino; 2604 key.offset = 0; 2605 key.type = BTRFS_INODE_ITEM_KEY; 2606 btrfs_release_path(path); 2607 2608 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2609 if (ret < 0) { 2610 err = ret; 2611 goto fail; 2612 } 2613 if (ret == 0) { 2614 struct btrfs_inode_item *item; 2615 u64 i_size; 2616 2617 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2618 struct btrfs_inode_item); 2619 i_size = btrfs_inode_size(path->nodes[0], item); 2620 if (i_size > bytes_del) 2621 i_size -= bytes_del; 2622 else 2623 i_size = 0; 2624 btrfs_set_inode_size(path->nodes[0], item, i_size); 2625 btrfs_mark_buffer_dirty(path->nodes[0]); 2626 } else 2627 ret = 0; 2628 btrfs_release_path(path); 2629 } 2630 fail: 2631 btrfs_free_path(path); 2632 out_unlock: 2633 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2634 if (ret == -ENOSPC) { 2635 root->fs_info->last_trans_log_full_commit = trans->transid; 2636 ret = 0; 2637 } else if (ret < 0) 2638 btrfs_abort_transaction(trans, root, ret); 2639 2640 btrfs_end_log_trans(root); 2641 2642 return err; 2643 } 2644 2645 /* see comments for btrfs_del_dir_entries_in_log */ 2646 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2647 struct btrfs_root *root, 2648 const char *name, int name_len, 2649 struct inode *inode, u64 dirid) 2650 { 2651 struct btrfs_root *log; 2652 u64 index; 2653 int ret; 2654 2655 if (BTRFS_I(inode)->logged_trans < trans->transid) 2656 return 0; 2657 2658 ret = join_running_log_trans(root); 2659 if (ret) 2660 return 0; 2661 log = root->log_root; 2662 mutex_lock(&BTRFS_I(inode)->log_mutex); 2663 2664 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 2665 dirid, &index); 2666 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2667 if (ret == -ENOSPC) { 2668 root->fs_info->last_trans_log_full_commit = trans->transid; 2669 ret = 0; 2670 } else if (ret < 0 && ret != -ENOENT) 2671 btrfs_abort_transaction(trans, root, ret); 2672 btrfs_end_log_trans(root); 2673 2674 return ret; 2675 } 2676 2677 /* 2678 * creates a range item in the log for 'dirid'. 
first_offset and 2679 * last_offset tell us which parts of the key space the log should 2680 * be considered authoritative for. 2681 */ 2682 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2683 struct btrfs_root *log, 2684 struct btrfs_path *path, 2685 int key_type, u64 dirid, 2686 u64 first_offset, u64 last_offset) 2687 { 2688 int ret; 2689 struct btrfs_key key; 2690 struct btrfs_dir_log_item *item; 2691 2692 key.objectid = dirid; 2693 key.offset = first_offset; 2694 if (key_type == BTRFS_DIR_ITEM_KEY) 2695 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2696 else 2697 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2698 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2699 if (ret) 2700 return ret; 2701 2702 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2703 struct btrfs_dir_log_item); 2704 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2705 btrfs_mark_buffer_dirty(path->nodes[0]); 2706 btrfs_release_path(path); 2707 return 0; 2708 } 2709 2710 /* 2711 * log all the items included in the current transaction for a given 2712 * directory. This also creates the range items in the log tree required 2713 * to replay anything deleted before the fsync 2714 */ 2715 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2716 struct btrfs_root *root, struct inode *inode, 2717 struct btrfs_path *path, 2718 struct btrfs_path *dst_path, int key_type, 2719 u64 min_offset, u64 *last_offset_ret) 2720 { 2721 struct btrfs_key min_key; 2722 struct btrfs_key max_key; 2723 struct btrfs_root *log = root->log_root; 2724 struct extent_buffer *src; 2725 int err = 0; 2726 int ret; 2727 int i; 2728 int nritems; 2729 u64 first_offset = min_offset; 2730 u64 last_offset = (u64)-1; 2731 u64 ino = btrfs_ino(inode); 2732 2733 log = root->log_root; 2734 max_key.objectid = ino; 2735 max_key.offset = (u64)-1; 2736 max_key.type = key_type; 2737 2738 min_key.objectid = ino; 2739 min_key.type = key_type; 2740 min_key.offset = min_offset; 2741 2742 path->keep_locks = 1; 2743 2744 ret = btrfs_search_forward(root, &min_key, &max_key, 2745 path, trans->transid); 2746 2747 /* 2748 * we didn't find anything from this transaction, see if there 2749 * is anything at all 2750 */ 2751 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 2752 min_key.objectid = ino; 2753 min_key.type = key_type; 2754 min_key.offset = (u64)-1; 2755 btrfs_release_path(path); 2756 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2757 if (ret < 0) { 2758 btrfs_release_path(path); 2759 return ret; 2760 } 2761 ret = btrfs_previous_item(root, path, ino, key_type); 2762 2763 /* if ret == 0 there are items for this type, 2764 * create a range to tell us the last key of this type. 2765 * otherwise, there are no items in this directory after 2766 * *min_offset, and we create a range to indicate that. 
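		 *
		 * For example, with min_offset == 10 and a directory whose
		 * highest old index is 5, we end up logging the range item
		 * [11, (u64)-1]: the log is authoritative for that whole
		 * span even though it contains no entries there.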
		 */
		if (ret == 0) {
			struct btrfs_key tmp;
			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
					      path->slots[0]);
			if (key_type == tmp.type)
				first_offset = max(min_offset, tmp.offset) + 1;
		}
		goto done;
	}

	/* go backward to find any previous key */
	ret = btrfs_previous_item(root, path, ino, key_type);
	if (ret == 0) {
		struct btrfs_key tmp;
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (key_type == tmp.type) {
			first_offset = tmp.offset;
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret) {
				err = ret;
				goto done;
			}
		}
	}
	btrfs_release_path(path);

	/* find the first key from this transaction again */
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (ret != 0) {
		WARN_ON(1);
		goto done;
	}

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			btrfs_item_key_to_cpu(src, &min_key, i);

			if (min_key.objectid != ino || min_key.type != key_type)
				goto done;
			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			if (ret) {
				err = ret;
				goto done;
			}
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret == 1) {
			last_offset = (u64)-1;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}

/*
 * logging directories is very similar to logging inodes.  We find all
 * the items from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest logged by this transaction and one key larger than the largest
 * key logged by this transaction.
2879 */ 2880 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2881 struct btrfs_root *root, struct inode *inode, 2882 struct btrfs_path *path, 2883 struct btrfs_path *dst_path) 2884 { 2885 u64 min_key; 2886 u64 max_key; 2887 int ret; 2888 int key_type = BTRFS_DIR_ITEM_KEY; 2889 2890 again: 2891 min_key = 0; 2892 max_key = 0; 2893 while (1) { 2894 ret = log_dir_items(trans, root, inode, path, 2895 dst_path, key_type, min_key, 2896 &max_key); 2897 if (ret) 2898 return ret; 2899 if (max_key == (u64)-1) 2900 break; 2901 min_key = max_key + 1; 2902 } 2903 2904 if (key_type == BTRFS_DIR_ITEM_KEY) { 2905 key_type = BTRFS_DIR_INDEX_KEY; 2906 goto again; 2907 } 2908 return 0; 2909 } 2910 2911 /* 2912 * a helper function to drop items from the log before we relog an 2913 * inode. max_key_type indicates the highest item type to remove. 2914 * This cannot be run for file data extents because it does not 2915 * free the extents they point to. 2916 */ 2917 static int drop_objectid_items(struct btrfs_trans_handle *trans, 2918 struct btrfs_root *log, 2919 struct btrfs_path *path, 2920 u64 objectid, int max_key_type) 2921 { 2922 int ret; 2923 struct btrfs_key key; 2924 struct btrfs_key found_key; 2925 int start_slot; 2926 2927 key.objectid = objectid; 2928 key.type = max_key_type; 2929 key.offset = (u64)-1; 2930 2931 while (1) { 2932 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2933 BUG_ON(ret == 0); 2934 if (ret < 0) 2935 break; 2936 2937 if (path->slots[0] == 0) 2938 break; 2939 2940 path->slots[0]--; 2941 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2942 path->slots[0]); 2943 2944 if (found_key.objectid != objectid) 2945 break; 2946 2947 found_key.offset = 0; 2948 found_key.type = 0; 2949 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 2950 &start_slot); 2951 2952 ret = btrfs_del_items(trans, log, path, start_slot, 2953 path->slots[0] - start_slot + 1); 2954 /* 2955 * If start slot isn't 0 then we don't need to re-search, we've 2956 * found the last guy with the objectid in this tree. 
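		 *
		 * In other words, each pass of this loop deletes one
		 * contiguous run of matching items from a single leaf;
		 * only when that run began at slot 0 can earlier leaves
		 * still hold items with the same objectid.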
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (log_inode_only) {
		/* set the generation to zero so the recovery code
		 * can tell the difference between logging just to
		 * say 'this inode exists' and logging to say
		 * 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(leaf, item, 0, &token);
		btrfs_set_token_inode_size(leaf, item, 0, &token);
	} else {
		btrfs_set_token_inode_generation(leaf, item,
						 BTRFS_I(inode)->generation,
						 &token);
		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
	}

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);

	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct inode *inode)
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_key key;
	int ret;

	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
	ret = btrfs_insert_empty_item(trans, log, path, &key,
				      sizeof(*inode_item));
	if (ret && ret != -EEXIST)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
	btrfs_release_path(path);
	return 0;
}

static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct inode *inode,
			       struct btrfs_path *dst_path,
			       struct extent_buffer *src,
			       int start_slot, int nr, int inode_only)
{
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	int ret;
	struct btrfs_key *ins_keys;
	u32 *ins_sizes;
	char *ins_data;
int i; 3058 struct list_head ordered_sums; 3059 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3060 3061 INIT_LIST_HEAD(&ordered_sums); 3062 3063 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3064 nr * sizeof(u32), GFP_NOFS); 3065 if (!ins_data) 3066 return -ENOMEM; 3067 3068 ins_sizes = (u32 *)ins_data; 3069 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3070 3071 for (i = 0; i < nr; i++) { 3072 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3073 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3074 } 3075 ret = btrfs_insert_empty_items(trans, log, dst_path, 3076 ins_keys, ins_sizes, nr); 3077 if (ret) { 3078 kfree(ins_data); 3079 return ret; 3080 } 3081 3082 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3083 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3084 dst_path->slots[0]); 3085 3086 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3087 3088 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3089 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3090 dst_path->slots[0], 3091 struct btrfs_inode_item); 3092 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3093 inode, inode_only == LOG_INODE_EXISTS); 3094 } else { 3095 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3096 src_offset, ins_sizes[i]); 3097 } 3098 3099 /* take a reference on file data extents so that truncates 3100 * or deletes of this inode don't have to relog the inode 3101 * again 3102 */ 3103 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && 3104 !skip_csum) { 3105 int found_type; 3106 extent = btrfs_item_ptr(src, start_slot + i, 3107 struct btrfs_file_extent_item); 3108 3109 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3110 continue; 3111 3112 found_type = btrfs_file_extent_type(src, extent); 3113 if (found_type == BTRFS_FILE_EXTENT_REG) { 3114 u64 ds, dl, cs, cl; 3115 ds = btrfs_file_extent_disk_bytenr(src, 3116 extent); 3117 /* ds == 0 is a hole */ 3118 if (ds == 0) 3119 continue; 3120 3121 dl = btrfs_file_extent_disk_num_bytes(src, 3122 extent); 3123 cs = btrfs_file_extent_offset(src, extent); 3124 cl = btrfs_file_extent_num_bytes(src, 3125 extent); 3126 if (btrfs_file_extent_compression(src, 3127 extent)) { 3128 cs = 0; 3129 cl = dl; 3130 } 3131 3132 ret = btrfs_lookup_csums_range( 3133 log->fs_info->csum_root, 3134 ds + cs, ds + cs + cl - 1, 3135 &ordered_sums, 0); 3136 BUG_ON(ret); 3137 } 3138 } 3139 } 3140 3141 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3142 btrfs_release_path(dst_path); 3143 kfree(ins_data); 3144 3145 /* 3146 * we have to do this after the loop above to avoid changing the 3147 * log tree while trying to change the log tree. 
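	 *
	 * The csums gathered by btrfs_lookup_csums_range() above sit on
	 * the local ordered_sums list; btrfs_csum_file_blocks() below
	 * copies each of them into the log so replay can verify the
	 * file data.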
	 */
	ret = 0;
	while (!list_empty(&ordered_sums)) {
		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
						   struct btrfs_ordered_sum,
						   list);
		if (!ret)
			ret = btrfs_csum_file_blocks(trans, log, sums);
		list_del(&sums->list);
		kfree(sums);
	}
	return ret;
}

static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct extent_map *em1, *em2;

	em1 = list_entry(a, struct extent_map, list);
	em2 = list_entry(b, struct extent_map, list);

	if (em1->start < em2->start)
		return -1;
	else if (em1->start > em2->start)
		return 1;
	return 0;
}

static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode,
				 struct extent_map *em,
				 struct btrfs_path *path)
{
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	struct btrfs_key key, new_key;
	struct btrfs_map_token token;
	u64 extent_end;
	u64 extent_offset = 0;
	int extent_type;
	int del_slot = 0;
	int del_nr = 0;
	int ret = 0;

	while (1) {
		btrfs_init_map_token(&token);
		leaf = path->nodes[0];
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			if (del_nr) {
				ret = btrfs_del_items(trans, root, path,
						      del_slot, del_nr);
				if (ret)
					return ret;
				del_nr = 0;
			}

			ret = btrfs_next_leaf_write(trans, root, path, 1);
			if (ret < 0)
				return ret;
			if (ret > 0)
				return 0;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != btrfs_ino(inode) ||
		    key.type != BTRFS_EXTENT_DATA_KEY ||
		    key.offset >= em->start + em->len)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			extent_offset = btrfs_token_file_extent_offset(leaf,
								fi, &token);
			extent_end = key.offset +
				btrfs_token_file_extent_num_bytes(leaf, fi,
								  &token);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
		} else {
			BUG();
		}

		if (extent_end <= em->len + em->start) {
			if (!del_nr)
				del_slot = path->slots[0];
			del_nr++;
			continue;
		}

		/*
		 * Ok so we'll ignore previous items if we log a new extent,
		 * which can lead to overlapping extents.  So if we have an
		 * existing extent we want to adjust, we _have_ to check the
		 * next item to make sure we even need this extent anymore;
		 * this keeps us from panicking in set_item_key_safe.
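		 *
		 *   |--------- em (newly logged) ---------|
		 *                         |---- existing ----|
		 *
		 * In that case the existing item survives, but its key
		 * offset is bumped to em->start + em->len and its file
		 * extent offset/num_bytes are adjusted below so only the
		 * non-overlapping tail remains.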
3250 */ 3251 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { 3252 struct btrfs_key tmp_key; 3253 3254 btrfs_item_key_to_cpu(leaf, &tmp_key, 3255 path->slots[0] + 1); 3256 if (tmp_key.objectid == btrfs_ino(inode) && 3257 tmp_key.type == BTRFS_EXTENT_DATA_KEY && 3258 tmp_key.offset <= em->start + em->len) { 3259 if (!del_nr) 3260 del_slot = path->slots[0]; 3261 del_nr++; 3262 continue; 3263 } 3264 } 3265 3266 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); 3267 memcpy(&new_key, &key, sizeof(new_key)); 3268 new_key.offset = em->start + em->len; 3269 btrfs_set_item_key_safe(trans, root, path, &new_key); 3270 extent_offset += em->start + em->len - key.offset; 3271 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, 3272 &token); 3273 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - 3274 (em->start + em->len), 3275 &token); 3276 btrfs_mark_buffer_dirty(leaf); 3277 } 3278 3279 if (del_nr) 3280 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 3281 3282 return ret; 3283 } 3284 3285 static int log_one_extent(struct btrfs_trans_handle *trans, 3286 struct inode *inode, struct btrfs_root *root, 3287 struct extent_map *em, struct btrfs_path *path) 3288 { 3289 struct btrfs_root *log = root->log_root; 3290 struct btrfs_file_extent_item *fi; 3291 struct extent_buffer *leaf; 3292 struct btrfs_ordered_extent *ordered; 3293 struct list_head ordered_sums; 3294 struct btrfs_map_token token; 3295 struct btrfs_key key; 3296 u64 mod_start = em->mod_start; 3297 u64 mod_len = em->mod_len; 3298 u64 csum_offset; 3299 u64 csum_len; 3300 u64 extent_offset = em->start - em->orig_start; 3301 u64 block_len; 3302 int ret; 3303 int index = log->log_transid % 2; 3304 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3305 3306 insert: 3307 INIT_LIST_HEAD(&ordered_sums); 3308 btrfs_init_map_token(&token); 3309 key.objectid = btrfs_ino(inode); 3310 key.type = BTRFS_EXTENT_DATA_KEY; 3311 key.offset = em->start; 3312 path->really_keep_locks = 1; 3313 3314 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3315 if (ret && ret != -EEXIST) { 3316 path->really_keep_locks = 0; 3317 return ret; 3318 } 3319 leaf = path->nodes[0]; 3320 fi = btrfs_item_ptr(leaf, path->slots[0], 3321 struct btrfs_file_extent_item); 3322 3323 /* 3324 * If we are overwriting an inline extent with a real one then we need 3325 * to just delete the inline extent as it may not be large enough to 3326 * have the entire file_extent_item. 
3327 */ 3328 if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == 3329 BTRFS_FILE_EXTENT_INLINE) { 3330 ret = btrfs_del_item(trans, log, path); 3331 btrfs_release_path(path); 3332 if (ret) { 3333 path->really_keep_locks = 0; 3334 return ret; 3335 } 3336 goto insert; 3337 } 3338 3339 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3340 &token); 3341 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3342 skip_csum = true; 3343 btrfs_set_token_file_extent_type(leaf, fi, 3344 BTRFS_FILE_EXTENT_PREALLOC, 3345 &token); 3346 } else { 3347 btrfs_set_token_file_extent_type(leaf, fi, 3348 BTRFS_FILE_EXTENT_REG, 3349 &token); 3350 if (em->block_start == 0) 3351 skip_csum = true; 3352 } 3353 3354 block_len = max(em->block_len, em->orig_block_len); 3355 if (em->compress_type != BTRFS_COMPRESS_NONE) { 3356 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3357 em->block_start, 3358 &token); 3359 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3360 &token); 3361 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3362 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3363 em->block_start - 3364 extent_offset, &token); 3365 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3366 &token); 3367 } else { 3368 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3369 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3370 &token); 3371 } 3372 3373 btrfs_set_token_file_extent_offset(leaf, fi, 3374 em->start - em->orig_start, 3375 &token); 3376 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3377 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); 3378 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3379 &token); 3380 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3381 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3382 btrfs_mark_buffer_dirty(leaf); 3383 3384 /* 3385 * Have to check the extent to the right of us to make sure it doesn't 3386 * fall in our current range. We're ok if the previous extent is in our 3387 * range since the recovery stuff will run us in key order and thus just 3388 * drop the part we overwrote. 3389 */ 3390 ret = drop_adjacent_extents(trans, log, inode, em, path); 3391 btrfs_release_path(path); 3392 path->really_keep_locks = 0; 3393 if (ret) { 3394 return ret; 3395 } 3396 3397 if (skip_csum) 3398 return 0; 3399 3400 if (em->compress_type) { 3401 csum_offset = 0; 3402 csum_len = block_len; 3403 } 3404 3405 /* 3406 * First check and see if our csums are on our outstanding ordered 3407 * extents. 3408 */ 3409 again: 3410 spin_lock_irq(&log->log_extents_lock[index]); 3411 list_for_each_entry(ordered, &log->logged_list[index], log_list) { 3412 struct btrfs_ordered_sum *sum; 3413 3414 if (!mod_len) 3415 break; 3416 3417 if (ordered->inode != inode) 3418 continue; 3419 3420 if (ordered->file_offset + ordered->len <= mod_start || 3421 mod_start + mod_len <= ordered->file_offset) 3422 continue; 3423 3424 /* 3425 * We are going to copy all the csums on this ordered extent, so 3426 * go ahead and adjust mod_start and mod_len in case this 3427 * ordered extent has already been logged. 
3428 */ 3429 if (ordered->file_offset > mod_start) { 3430 if (ordered->file_offset + ordered->len >= 3431 mod_start + mod_len) 3432 mod_len = ordered->file_offset - mod_start; 3433 /* 3434 * If we have this case 3435 * 3436 * |--------- logged extent ---------| 3437 * |----- ordered extent ----| 3438 * 3439 * Just don't mess with mod_start and mod_len, we'll 3440 * just end up logging more csums than we need and it 3441 * will be ok. 3442 */ 3443 } else { 3444 if (ordered->file_offset + ordered->len < 3445 mod_start + mod_len) { 3446 mod_len = (mod_start + mod_len) - 3447 (ordered->file_offset + ordered->len); 3448 mod_start = ordered->file_offset + 3449 ordered->len; 3450 } else { 3451 mod_len = 0; 3452 } 3453 } 3454 3455 /* 3456 * To keep us from looping for the above case of an ordered 3457 * extent that falls inside of the logged extent. 3458 */ 3459 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3460 &ordered->flags)) 3461 continue; 3462 atomic_inc(&ordered->refs); 3463 spin_unlock_irq(&log->log_extents_lock[index]); 3464 /* 3465 * we've dropped the lock, we must either break or 3466 * start over after this. 3467 */ 3468 3469 wait_event(ordered->wait, ordered->csum_bytes_left == 0); 3470 3471 list_for_each_entry(sum, &ordered->list, list) { 3472 ret = btrfs_csum_file_blocks(trans, log, sum); 3473 if (ret) { 3474 btrfs_put_ordered_extent(ordered); 3475 goto unlocked; 3476 } 3477 } 3478 btrfs_put_ordered_extent(ordered); 3479 goto again; 3480 3481 } 3482 spin_unlock_irq(&log->log_extents_lock[index]); 3483 unlocked: 3484 3485 if (!mod_len || ret) 3486 return ret; 3487 3488 csum_offset = mod_start - em->start; 3489 csum_len = mod_len; 3490 3491 /* block start is already adjusted for the file extent offset. */ 3492 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3493 em->block_start + csum_offset, 3494 em->block_start + csum_offset + 3495 csum_len - 1, &ordered_sums, 0); 3496 if (ret) 3497 return ret; 3498 3499 while (!list_empty(&ordered_sums)) { 3500 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3501 struct btrfs_ordered_sum, 3502 list); 3503 if (!ret) 3504 ret = btrfs_csum_file_blocks(trans, log, sums); 3505 list_del(&sums->list); 3506 kfree(sums); 3507 } 3508 3509 return ret; 3510 } 3511 3512 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3513 struct btrfs_root *root, 3514 struct inode *inode, 3515 struct btrfs_path *path) 3516 { 3517 struct extent_map *em, *n; 3518 struct list_head extents; 3519 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3520 u64 test_gen; 3521 int ret = 0; 3522 int num = 0; 3523 3524 INIT_LIST_HEAD(&extents); 3525 3526 write_lock(&tree->lock); 3527 test_gen = root->fs_info->last_trans_committed; 3528 3529 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3530 list_del_init(&em->list); 3531 3532 /* 3533 * Just an arbitrary number, this can be really CPU intensive 3534 * once we start getting a lot of extents, and really once we 3535 * have a bunch of extents we just want to commit since it will 3536 * be faster. 
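		 *
		 * Hitting the cap below makes us return -EFBIG; the callers
		 * treat that like any other logging failure and fall back
		 * to a full transaction commit.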
3537 */ 3538 if (++num > 32768) { 3539 list_del_init(&tree->modified_extents); 3540 ret = -EFBIG; 3541 goto process; 3542 } 3543 3544 if (em->generation <= test_gen) 3545 continue; 3546 /* Need a ref to keep it from getting evicted from cache */ 3547 atomic_inc(&em->refs); 3548 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3549 list_add_tail(&em->list, &extents); 3550 num++; 3551 } 3552 3553 list_sort(NULL, &extents, extent_cmp); 3554 3555 process: 3556 while (!list_empty(&extents)) { 3557 em = list_entry(extents.next, struct extent_map, list); 3558 3559 list_del_init(&em->list); 3560 3561 /* 3562 * If we had an error we just need to delete everybody from our 3563 * private list. 3564 */ 3565 if (ret) { 3566 clear_em_logging(tree, em); 3567 free_extent_map(em); 3568 continue; 3569 } 3570 3571 write_unlock(&tree->lock); 3572 3573 ret = log_one_extent(trans, inode, root, em, path); 3574 write_lock(&tree->lock); 3575 clear_em_logging(tree, em); 3576 free_extent_map(em); 3577 } 3578 WARN_ON(!list_empty(&extents)); 3579 write_unlock(&tree->lock); 3580 3581 btrfs_release_path(path); 3582 return ret; 3583 } 3584 3585 /* log a single inode in the tree log. 3586 * At least one parent directory for this inode must exist in the tree 3587 * or be logged already. 3588 * 3589 * Any items from this inode changed by the current transaction are copied 3590 * to the log tree. An extra reference is taken on any extents in this 3591 * file, allowing us to avoid a whole pile of corner cases around logging 3592 * blocks that have been removed from the tree. 3593 * 3594 * See LOG_INODE_ALL and related defines for a description of what inode_only 3595 * does. 3596 * 3597 * This handles both files and directories. 3598 */ 3599 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3600 struct btrfs_root *root, struct inode *inode, 3601 int inode_only) 3602 { 3603 struct btrfs_path *path; 3604 struct btrfs_path *dst_path; 3605 struct btrfs_key min_key; 3606 struct btrfs_key max_key; 3607 struct btrfs_root *log = root->log_root; 3608 struct extent_buffer *src = NULL; 3609 int err = 0; 3610 int ret; 3611 int nritems; 3612 int ins_start_slot = 0; 3613 int ins_nr; 3614 bool fast_search = false; 3615 u64 ino = btrfs_ino(inode); 3616 3617 log = root->log_root; 3618 3619 path = btrfs_alloc_path(); 3620 if (!path) 3621 return -ENOMEM; 3622 dst_path = btrfs_alloc_path(); 3623 if (!dst_path) { 3624 btrfs_free_path(path); 3625 return -ENOMEM; 3626 } 3627 3628 min_key.objectid = ino; 3629 min_key.type = BTRFS_INODE_ITEM_KEY; 3630 min_key.offset = 0; 3631 3632 max_key.objectid = ino; 3633 3634 3635 /* today the code can only do partial logging of directories */ 3636 if (S_ISDIR(inode->i_mode) || 3637 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3638 &BTRFS_I(inode)->runtime_flags) && 3639 inode_only == LOG_INODE_EXISTS)) 3640 max_key.type = BTRFS_XATTR_ITEM_KEY; 3641 else 3642 max_key.type = (u8)-1; 3643 max_key.offset = (u64)-1; 3644 3645 /* Only run delayed items if we are a dir or a new file */ 3646 if (S_ISDIR(inode->i_mode) || 3647 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { 3648 ret = btrfs_commit_inode_delayed_items(trans, inode); 3649 if (ret) { 3650 btrfs_free_path(path); 3651 btrfs_free_path(dst_path); 3652 return ret; 3653 } 3654 } 3655 3656 mutex_lock(&BTRFS_I(inode)->log_mutex); 3657 3658 btrfs_get_logged_extents(log, inode); 3659 3660 /* 3661 * a brute force approach to making sure we get the most uptodate 3662 * copies of everything. 
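	 *
	 * Depending on the inode state that means: truncating everything
	 * previously logged (BTRFS_INODE_NEEDS_FULL_SYNC), dropping and
	 * recopying the items (BTRFS_INODE_COPY_EVERYTHING), or logging
	 * just the inode item and letting the fast extent logging below
	 * pick up the changed extents.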
3663 */ 3664 if (S_ISDIR(inode->i_mode)) { 3665 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 3666 3667 if (inode_only == LOG_INODE_EXISTS) 3668 max_key_type = BTRFS_XATTR_ITEM_KEY; 3669 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3670 } else { 3671 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3672 &BTRFS_I(inode)->runtime_flags)) { 3673 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3674 &BTRFS_I(inode)->runtime_flags); 3675 ret = btrfs_truncate_inode_items(trans, log, 3676 inode, 0, 0); 3677 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3678 &BTRFS_I(inode)->runtime_flags)) { 3679 if (inode_only == LOG_INODE_ALL) 3680 fast_search = true; 3681 max_key.type = BTRFS_XATTR_ITEM_KEY; 3682 ret = drop_objectid_items(trans, log, path, ino, 3683 max_key.type); 3684 } else { 3685 if (inode_only == LOG_INODE_ALL) 3686 fast_search = true; 3687 ret = log_inode_item(trans, log, dst_path, inode); 3688 if (ret) { 3689 err = ret; 3690 goto out_unlock; 3691 } 3692 goto log_extents; 3693 } 3694 3695 } 3696 if (ret) { 3697 err = ret; 3698 goto out_unlock; 3699 } 3700 path->keep_locks = 1; 3701 3702 while (1) { 3703 ins_nr = 0; 3704 ret = btrfs_search_forward(root, &min_key, &max_key, 3705 path, trans->transid); 3706 if (ret != 0) 3707 break; 3708 again: 3709 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 3710 if (min_key.objectid != ino) 3711 break; 3712 if (min_key.type > max_key.type) 3713 break; 3714 3715 src = path->nodes[0]; 3716 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 3717 ins_nr++; 3718 goto next_slot; 3719 } else if (!ins_nr) { 3720 ins_start_slot = path->slots[0]; 3721 ins_nr = 1; 3722 goto next_slot; 3723 } 3724 3725 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3726 ins_nr, inode_only); 3727 if (ret) { 3728 err = ret; 3729 goto out_unlock; 3730 } 3731 ins_nr = 1; 3732 ins_start_slot = path->slots[0]; 3733 next_slot: 3734 3735 nritems = btrfs_header_nritems(path->nodes[0]); 3736 path->slots[0]++; 3737 if (path->slots[0] < nritems) { 3738 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 3739 path->slots[0]); 3740 goto again; 3741 } 3742 if (ins_nr) { 3743 ret = copy_items(trans, inode, dst_path, src, 3744 ins_start_slot, 3745 ins_nr, inode_only); 3746 if (ret) { 3747 err = ret; 3748 goto out_unlock; 3749 } 3750 ins_nr = 0; 3751 } 3752 btrfs_release_path(path); 3753 3754 if (min_key.offset < (u64)-1) 3755 min_key.offset++; 3756 else if (min_key.type < (u8)-1) 3757 min_key.type++; 3758 else if (min_key.objectid < (u64)-1) 3759 min_key.objectid++; 3760 else 3761 break; 3762 } 3763 if (ins_nr) { 3764 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3765 ins_nr, inode_only); 3766 if (ret) { 3767 err = ret; 3768 goto out_unlock; 3769 } 3770 ins_nr = 0; 3771 } 3772 3773 log_extents: 3774 if (fast_search) { 3775 btrfs_release_path(dst_path); 3776 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 3777 if (ret) { 3778 err = ret; 3779 goto out_unlock; 3780 } 3781 } else { 3782 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3783 struct extent_map *em, *n; 3784 3785 write_lock(&tree->lock); 3786 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3787 list_del_init(&em->list); 3788 write_unlock(&tree->lock); 3789 } 3790 3791 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3792 btrfs_release_path(path); 3793 btrfs_release_path(dst_path); 3794 ret = log_directory_changes(trans, root, inode, path, dst_path); 3795 if (ret) { 3796 err = ret; 3797 goto out_unlock; 3798 } 3799 
	}
	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
	if (err)
		btrfs_free_logged_extents(log, log->log_transid);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}

/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories in it require a full commit before they can
 * be logged.  Returns zero if nothing special needs to be done or 1 if
 * a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct btrfs_root *root;
	struct dentry *old_parent = NULL;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all.  This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
		goto out;

	if (!S_ISDIR(inode->i_mode)) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			goto out;
		inode = parent->d_inode;
	}

	while (1) {
		BTRFS_I(inode)->logged_trans = trans->transid;
		smp_mb();

		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
			root = BTRFS_I(inode)->root;

			/*
			 * make sure any commits to the log are forced
			 * to be full commits
			 */
			root->fs_info->last_trans_log_full_commit =
				trans->transid;
			ret = 1;
			break;
		}

		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			break;

		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
		inode = parent->d_inode;
	}
	dput(old_parent);
out:
	return ret;
}

/*
 * helper function around btrfs_log_inode to make sure newly created
 * parent directories also end up in the log.  A minimal, inode-and-backref
 * only logging is done for any parent directories that are newer than
 * the last committed transaction
 */
int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   struct dentry *parent, int exists_only)
{
	int inode_only = exists_only ?
LOG_INODE_EXISTS : LOG_INODE_ALL; 3890 struct super_block *sb; 3891 struct dentry *old_parent = NULL; 3892 int ret = 0; 3893 u64 last_committed = root->fs_info->last_trans_committed; 3894 3895 sb = inode->i_sb; 3896 3897 if (btrfs_test_opt(root, NOTREELOG)) { 3898 ret = 1; 3899 goto end_no_trans; 3900 } 3901 3902 if (root->fs_info->last_trans_log_full_commit > 3903 root->fs_info->last_trans_committed) { 3904 ret = 1; 3905 goto end_no_trans; 3906 } 3907 3908 if (root != BTRFS_I(inode)->root || 3909 btrfs_root_refs(&root->root_item) == 0) { 3910 ret = 1; 3911 goto end_no_trans; 3912 } 3913 3914 ret = check_parent_dirs_for_sync(trans, inode, parent, 3915 sb, last_committed); 3916 if (ret) 3917 goto end_no_trans; 3918 3919 if (btrfs_inode_in_log(inode, trans->transid)) { 3920 ret = BTRFS_NO_LOG_SYNC; 3921 goto end_no_trans; 3922 } 3923 3924 ret = start_log_trans(trans, root); 3925 if (ret) 3926 goto end_trans; 3927 3928 ret = btrfs_log_inode(trans, root, inode, inode_only); 3929 if (ret) 3930 goto end_trans; 3931 3932 /* 3933 * for regular files, if its inode is already on disk, we don't 3934 * have to worry about the parents at all. This is because 3935 * we can use the last_unlink_trans field to record renames 3936 * and other fun in this file. 3937 */ 3938 if (S_ISREG(inode->i_mode) && 3939 BTRFS_I(inode)->generation <= last_committed && 3940 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 3941 ret = 0; 3942 goto end_trans; 3943 } 3944 3945 inode_only = LOG_INODE_EXISTS; 3946 while (1) { 3947 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3948 break; 3949 3950 inode = parent->d_inode; 3951 if (root != BTRFS_I(inode)->root) 3952 break; 3953 3954 if (BTRFS_I(inode)->generation > 3955 root->fs_info->last_trans_committed) { 3956 ret = btrfs_log_inode(trans, root, inode, inode_only); 3957 if (ret) 3958 goto end_trans; 3959 } 3960 if (IS_ROOT(parent)) 3961 break; 3962 3963 parent = dget_parent(parent); 3964 dput(old_parent); 3965 old_parent = parent; 3966 } 3967 ret = 0; 3968 end_trans: 3969 dput(old_parent); 3970 if (ret < 0) { 3971 root->fs_info->last_trans_log_full_commit = trans->transid; 3972 ret = 1; 3973 } 3974 btrfs_end_log_trans(root); 3975 end_no_trans: 3976 return ret; 3977 } 3978 3979 /* 3980 * it is not safe to log dentry if the chunk root has added new 3981 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 3982 * If this returns 1, you must commit the transaction to safely get your 3983 * data on disk. 
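 *
 * A rough sketch of the caller's side (illustrative only; the real
 * fsync path also handles BTRFS_NO_LOG_SYNC and transaction errors):
 *
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root);
 *	if (ret)
 *		btrfs_commit_transaction(trans, root);
 *	else
 *		btrfs_end_transaction(trans, root);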
/*
 * it is not safe to log a dentry if the chunk root has added new
 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
	dput(parent);

	return ret;
}
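
/*
 * Orientation for the replay loop in btrfs_recover_log_trees() below
 * (a descriptive sketch of the keys involved, not extra behavior):
 *
 *	log root item:		(BTRFS_TREE_LOG_OBJECTID, ROOT_ITEM, subvol id)
 *	replay destination:	(subvol id, ROOT_ITEM, -1)
 *
 * The offset of each log root item names the subvolume it was logged
 * for, which is why found_key.offset seeds tmp_key.objectid when the
 * destination root is looked up.
 */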
/*
 * should be called during mount to recover and replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	fs_info->log_root_recovering = 1;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_error(fs_info, ret, "Failed to pin buffers while "
			    "recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root_no_radix(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);
			btrfs_error(fs_info, ret, "Couldn't read target root "
				    "for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);
		BUG_ON(ret);

		if (wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
			BUG_ON(ret);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	fs_info->log_root_recovering = 0;

	/* step 4: commit the transaction, which also unpins the blocks */
	btrfs_commit_transaction(trans, fs_info->tree_root);

	kfree(log_root_tree);
	return 0;

error:
	btrfs_free_path(path);
	return ret;
}

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file.  When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this directory was already logged, any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (BTRFS_I(dir)->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (BTRFS_I(inode)->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly.  So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct inode *inode, struct inode *old_dir,
		       struct dentry *parent)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're
	 * renaming it from hasn't been logged, we don't need to log it
	 */
	if (BTRFS_I(inode)->logged_trans <=
	    root->fs_info->last_trans_committed &&
	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
		    root->fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
}
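
/*
 * Illustrative call site for btrfs_log_new_name() (a simplified and
 * hypothetical sketch; the real callers are the rename and link paths
 * in inode.c):
 *
 *	after the new name is fully inserted in the subvolume tree:
 *		ret = btrfs_log_new_name(trans, inode, old_dir, parent);
 *		if (ret)
 *			the caller must force a full transaction commit
 *			before reporting the operation as durable
 */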