1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include <linux/slab.h> 21 #include <linux/list_sort.h> 22 #include "ctree.h" 23 #include "transaction.h" 24 #include "disk-io.h" 25 #include "locking.h" 26 #include "print-tree.h" 27 #include "backref.h" 28 #include "compat.h" 29 #include "tree-log.h" 30 #include "hash.h" 31 32 /* magic values for the inode_only field in btrfs_log_inode: 33 * 34 * LOG_INODE_ALL means to log everything 35 * LOG_INODE_EXISTS means to log just enough to recreate the inode 36 * during log replay 37 */ 38 #define LOG_INODE_ALL 0 39 #define LOG_INODE_EXISTS 1 40 41 /* 42 * directory trouble cases 43 * 44 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 45 * log, we must force a full commit before doing an fsync of the directory 46 * where the unlink was done. 47 * ---> record transid of last unlink/rename per directory 48 * 49 * mkdir foo/some_dir 50 * normal commit 51 * rename foo/some_dir foo2/some_dir 52 * mkdir foo/some_dir 53 * fsync foo/some_dir/some_file 54 * 55 * The fsync above will unlink the original some_dir without recording 56 * it in its new location (foo2). After a crash, some_dir will be gone 57 * unless the fsync of some_file forces a full commit 58 * 59 * 2) we must log any new names for any file or dir that is in the fsync 60 * log. ---> check inode while renaming/linking. 61 * 62 * 2a) we must log any new names for any file or dir during rename 63 * when the directory they are being removed from was logged. 64 * ---> check inode and old parent dir during rename 65 * 66 * 2a is actually the more important variant. With the extra logging 67 * a crash might unlink the old name without recreating the new one 68 * 69 * 3) after a crash, we must go through any directories with a link count 70 * of zero and redo the rm -rf 71 * 72 * mkdir f1/foo 73 * normal commit 74 * rm -rf f1/foo 75 * fsync(f1) 76 * 77 * The directory f1 was fully removed from the FS, but fsync was never 78 * called on f1, only its parent dir. After a crash the rm -rf must 79 * be replayed. This must be able to recurse down the entire 80 * directory tree. The inode link count fixup code takes care of the 81 * ugly details. 82 */ 83 84 /* 85 * stages for the tree walking. The first 86 * stage (0) is to only pin down the blocks we find 87 * the second stage (1) is to make sure that all the inodes 88 * we find in the log are created in the subvolume. 89 * 90 * The last stage is to deal with directories and links and extents 91 * and all the other fun semantics 92 */ 93 #define LOG_WALK_PIN_ONLY 0 94 #define LOG_WALK_REPLAY_INODES 1 95 #define LOG_WALK_REPLAY_ALL 2 96 97 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 98 struct btrfs_root *root, struct inode *inode, 99 int inode_only); 100 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 101 struct btrfs_root *root, 102 struct btrfs_path *path, u64 objectid); 103 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 104 struct btrfs_root *root, 105 struct btrfs_root *log, 106 struct btrfs_path *path, 107 u64 dirid, int del_all); 108 109 /* 110 * tree logging is a special write ahead log used to make sure that 111 * fsyncs and O_SYNCs can happen without doing full tree commits. 112 * 113 * Full tree commits are expensive because they require commonly 114 * modified blocks to be recowed, creating many dirty pages in the 115 * extent tree an 4x-6x higher write load than ext3. 116 * 117 * Instead of doing a tree commit on every fsync, we use the 118 * key ranges and transaction ids to find items for a given file or directory 119 * that have changed in this transaction. Those items are copied into 120 * a special tree (one per subvolume root), that tree is written to disk 121 * and then the fsync is considered complete. 122 * 123 * After a crash, items are copied out of the log-tree back into the 124 * subvolume tree. Any file data extents found are recorded in the extent 125 * allocation tree, and the log-tree freed. 126 * 127 * The log tree is read three times, once to pin down all the extents it is 128 * using in ram and once, once to create all the inodes logged in the tree 129 * and once to do all the other items. 130 */ 131 132 /* 133 * start a sub transaction and setup the log tree 134 * this increments the log tree writer count to make the people 135 * syncing the tree wait for us to finish 136 */ 137 static int start_log_trans(struct btrfs_trans_handle *trans, 138 struct btrfs_root *root) 139 { 140 int ret; 141 int err = 0; 142 143 mutex_lock(&root->log_mutex); 144 if (root->log_root) { 145 if (!root->log_start_pid) { 146 root->log_start_pid = current->pid; 147 root->log_multiple_pids = false; 148 } else if (root->log_start_pid != current->pid) { 149 root->log_multiple_pids = true; 150 } 151 152 atomic_inc(&root->log_batch); 153 atomic_inc(&root->log_writers); 154 mutex_unlock(&root->log_mutex); 155 return 0; 156 } 157 root->log_multiple_pids = false; 158 root->log_start_pid = current->pid; 159 mutex_lock(&root->fs_info->tree_log_mutex); 160 if (!root->fs_info->log_root_tree) { 161 ret = btrfs_init_log_root_tree(trans, root->fs_info); 162 if (ret) 163 err = ret; 164 } 165 if (err == 0 && !root->log_root) { 166 ret = btrfs_add_log_tree(trans, root); 167 if (ret) 168 err = ret; 169 } 170 mutex_unlock(&root->fs_info->tree_log_mutex); 171 atomic_inc(&root->log_batch); 172 atomic_inc(&root->log_writers); 173 mutex_unlock(&root->log_mutex); 174 return err; 175 } 176 177 /* 178 * returns 0 if there was a log transaction running and we were able 179 * to join, or returns -ENOENT if there were not transactions 180 * in progress 181 */ 182 static int join_running_log_trans(struct btrfs_root *root) 183 { 184 int ret = -ENOENT; 185 186 smp_mb(); 187 if (!root->log_root) 188 return -ENOENT; 189 190 mutex_lock(&root->log_mutex); 191 if (root->log_root) { 192 ret = 0; 193 atomic_inc(&root->log_writers); 194 } 195 mutex_unlock(&root->log_mutex); 196 return ret; 197 } 198 199 /* 200 * This either makes the current running log transaction wait 201 * until you call btrfs_end_log_trans() or it makes any future 202 * log transactions wait until you call btrfs_end_log_trans() 203 */ 204 int btrfs_pin_log_trans(struct btrfs_root *root) 205 { 206 int ret = -ENOENT; 207 208 mutex_lock(&root->log_mutex); 209 atomic_inc(&root->log_writers); 210 mutex_unlock(&root->log_mutex); 211 return ret; 212 } 213 214 /* 215 * indicate we're done making changes to the log tree 216 * and wake up anyone waiting to do a sync 217 */ 218 void btrfs_end_log_trans(struct btrfs_root *root) 219 { 220 if (atomic_dec_and_test(&root->log_writers)) { 221 smp_mb(); 222 if (waitqueue_active(&root->log_writer_wait)) 223 wake_up(&root->log_writer_wait); 224 } 225 } 226 227 228 /* 229 * the walk control struct is used to pass state down the chain when 230 * processing the log tree. The stage field tells us which part 231 * of the log tree processing we are currently doing. The others 232 * are state fields used for that specific part 233 */ 234 struct walk_control { 235 /* should we free the extent on disk when done? This is used 236 * at transaction commit time while freeing a log tree 237 */ 238 int free; 239 240 /* should we write out the extent buffer? This is used 241 * while flushing the log tree to disk during a sync 242 */ 243 int write; 244 245 /* should we wait for the extent buffer io to finish? Also used 246 * while flushing the log tree to disk for a sync 247 */ 248 int wait; 249 250 /* pin only walk, we record which extents on disk belong to the 251 * log trees 252 */ 253 int pin; 254 255 /* what stage of the replay code we're currently in */ 256 int stage; 257 258 /* the root we are currently replaying */ 259 struct btrfs_root *replay_dest; 260 261 /* the trans handle for the current replay */ 262 struct btrfs_trans_handle *trans; 263 264 /* the function that gets used to process blocks we find in the 265 * tree. Note the extent_buffer might not be up to date when it is 266 * passed in, and it must be checked or read if you need the data 267 * inside it 268 */ 269 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 270 struct walk_control *wc, u64 gen); 271 }; 272 273 /* 274 * process_func used to pin down extents, write them or wait on them 275 */ 276 static int process_one_buffer(struct btrfs_root *log, 277 struct extent_buffer *eb, 278 struct walk_control *wc, u64 gen) 279 { 280 int ret = 0; 281 282 if (wc->pin) 283 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 284 eb->start, eb->len); 285 286 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 287 if (wc->write) 288 btrfs_write_tree_block(eb); 289 if (wc->wait) 290 btrfs_wait_tree_block_writeback(eb); 291 } 292 return ret; 293 } 294 295 /* 296 * Item overwrite used by replay and tree logging. eb, slot and key all refer 297 * to the src data we are copying out. 298 * 299 * root is the tree we are copying into, and path is a scratch 300 * path for use in this function (it should be released on entry and 301 * will be released on exit). 302 * 303 * If the key is already in the destination tree the existing item is 304 * overwritten. If the existing item isn't big enough, it is extended. 305 * If it is too large, it is truncated. 306 * 307 * If the key isn't in the destination yet, a new item is inserted. 308 */ 309 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 310 struct btrfs_root *root, 311 struct btrfs_path *path, 312 struct extent_buffer *eb, int slot, 313 struct btrfs_key *key) 314 { 315 int ret; 316 u32 item_size; 317 u64 saved_i_size = 0; 318 int save_old_i_size = 0; 319 unsigned long src_ptr; 320 unsigned long dst_ptr; 321 int overwrite_root = 0; 322 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 323 324 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 325 overwrite_root = 1; 326 327 item_size = btrfs_item_size_nr(eb, slot); 328 src_ptr = btrfs_item_ptr_offset(eb, slot); 329 330 /* look for the key in the destination tree */ 331 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 332 if (ret < 0) 333 return ret; 334 335 if (ret == 0) { 336 char *src_copy; 337 char *dst_copy; 338 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 339 path->slots[0]); 340 if (dst_size != item_size) 341 goto insert; 342 343 if (item_size == 0) { 344 btrfs_release_path(path); 345 return 0; 346 } 347 dst_copy = kmalloc(item_size, GFP_NOFS); 348 src_copy = kmalloc(item_size, GFP_NOFS); 349 if (!dst_copy || !src_copy) { 350 btrfs_release_path(path); 351 kfree(dst_copy); 352 kfree(src_copy); 353 return -ENOMEM; 354 } 355 356 read_extent_buffer(eb, src_copy, src_ptr, item_size); 357 358 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 359 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 360 item_size); 361 ret = memcmp(dst_copy, src_copy, item_size); 362 363 kfree(dst_copy); 364 kfree(src_copy); 365 /* 366 * they have the same contents, just return, this saves 367 * us from cowing blocks in the destination tree and doing 368 * extra writes that may not have been done by a previous 369 * sync 370 */ 371 if (ret == 0) { 372 btrfs_release_path(path); 373 return 0; 374 } 375 376 /* 377 * We need to load the old nbytes into the inode so when we 378 * replay the extents we've logged we get the right nbytes. 379 */ 380 if (inode_item) { 381 struct btrfs_inode_item *item; 382 u64 nbytes; 383 384 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 385 struct btrfs_inode_item); 386 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 387 item = btrfs_item_ptr(eb, slot, 388 struct btrfs_inode_item); 389 btrfs_set_inode_nbytes(eb, item, nbytes); 390 } 391 } else if (inode_item) { 392 struct btrfs_inode_item *item; 393 394 /* 395 * New inode, set nbytes to 0 so that the nbytes comes out 396 * properly when we replay the extents. 397 */ 398 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 399 btrfs_set_inode_nbytes(eb, item, 0); 400 } 401 insert: 402 btrfs_release_path(path); 403 /* try to insert the key into the destination tree */ 404 ret = btrfs_insert_empty_item(trans, root, path, 405 key, item_size); 406 407 /* make sure any existing item is the correct size */ 408 if (ret == -EEXIST) { 409 u32 found_size; 410 found_size = btrfs_item_size_nr(path->nodes[0], 411 path->slots[0]); 412 if (found_size > item_size) 413 btrfs_truncate_item(root, path, item_size, 1); 414 else if (found_size < item_size) 415 btrfs_extend_item(root, path, 416 item_size - found_size); 417 } else if (ret) { 418 return ret; 419 } 420 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 421 path->slots[0]); 422 423 /* don't overwrite an existing inode if the generation number 424 * was logged as zero. This is done when the tree logging code 425 * is just logging an inode to make sure it exists after recovery. 426 * 427 * Also, don't overwrite i_size on directories during replay. 428 * log replay inserts and removes directory items based on the 429 * state of the tree found in the subvolume, and i_size is modified 430 * as it goes 431 */ 432 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 433 struct btrfs_inode_item *src_item; 434 struct btrfs_inode_item *dst_item; 435 436 src_item = (struct btrfs_inode_item *)src_ptr; 437 dst_item = (struct btrfs_inode_item *)dst_ptr; 438 439 if (btrfs_inode_generation(eb, src_item) == 0) 440 goto no_copy; 441 442 if (overwrite_root && 443 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 444 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 445 save_old_i_size = 1; 446 saved_i_size = btrfs_inode_size(path->nodes[0], 447 dst_item); 448 } 449 } 450 451 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 452 src_ptr, item_size); 453 454 if (save_old_i_size) { 455 struct btrfs_inode_item *dst_item; 456 dst_item = (struct btrfs_inode_item *)dst_ptr; 457 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 458 } 459 460 /* make sure the generation is filled in */ 461 if (key->type == BTRFS_INODE_ITEM_KEY) { 462 struct btrfs_inode_item *dst_item; 463 dst_item = (struct btrfs_inode_item *)dst_ptr; 464 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 465 btrfs_set_inode_generation(path->nodes[0], dst_item, 466 trans->transid); 467 } 468 } 469 no_copy: 470 btrfs_mark_buffer_dirty(path->nodes[0]); 471 btrfs_release_path(path); 472 return 0; 473 } 474 475 /* 476 * simple helper to read an inode off the disk from a given root 477 * This can only be called for subvolume roots and not for the log 478 */ 479 static noinline struct inode *read_one_inode(struct btrfs_root *root, 480 u64 objectid) 481 { 482 struct btrfs_key key; 483 struct inode *inode; 484 485 key.objectid = objectid; 486 key.type = BTRFS_INODE_ITEM_KEY; 487 key.offset = 0; 488 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 489 if (IS_ERR(inode)) { 490 inode = NULL; 491 } else if (is_bad_inode(inode)) { 492 iput(inode); 493 inode = NULL; 494 } 495 return inode; 496 } 497 498 /* replays a single extent in 'eb' at 'slot' with 'key' into the 499 * subvolume 'root'. path is released on entry and should be released 500 * on exit. 501 * 502 * extents in the log tree have not been allocated out of the extent 503 * tree yet. So, this completes the allocation, taking a reference 504 * as required if the extent already exists or creating a new extent 505 * if it isn't in the extent allocation tree yet. 506 * 507 * The extent is inserted into the file, dropping any existing extents 508 * from the file that overlap the new one. 509 */ 510 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 511 struct btrfs_root *root, 512 struct btrfs_path *path, 513 struct extent_buffer *eb, int slot, 514 struct btrfs_key *key) 515 { 516 int found_type; 517 u64 extent_end; 518 u64 start = key->offset; 519 u64 nbytes = 0; 520 struct btrfs_file_extent_item *item; 521 struct inode *inode = NULL; 522 unsigned long size; 523 int ret = 0; 524 525 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 526 found_type = btrfs_file_extent_type(eb, item); 527 528 if (found_type == BTRFS_FILE_EXTENT_REG || 529 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 530 nbytes = btrfs_file_extent_num_bytes(eb, item); 531 extent_end = start + nbytes; 532 533 /* 534 * We don't add to the inodes nbytes if we are prealloc or a 535 * hole. 536 */ 537 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 538 nbytes = 0; 539 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 540 size = btrfs_file_extent_inline_len(eb, item); 541 nbytes = btrfs_file_extent_ram_bytes(eb, item); 542 extent_end = ALIGN(start + size, root->sectorsize); 543 } else { 544 ret = 0; 545 goto out; 546 } 547 548 inode = read_one_inode(root, key->objectid); 549 if (!inode) { 550 ret = -EIO; 551 goto out; 552 } 553 554 /* 555 * first check to see if we already have this extent in the 556 * file. This must be done before the btrfs_drop_extents run 557 * so we don't try to drop this extent. 558 */ 559 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 560 start, 0); 561 562 if (ret == 0 && 563 (found_type == BTRFS_FILE_EXTENT_REG || 564 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 565 struct btrfs_file_extent_item cmp1; 566 struct btrfs_file_extent_item cmp2; 567 struct btrfs_file_extent_item *existing; 568 struct extent_buffer *leaf; 569 570 leaf = path->nodes[0]; 571 existing = btrfs_item_ptr(leaf, path->slots[0], 572 struct btrfs_file_extent_item); 573 574 read_extent_buffer(eb, &cmp1, (unsigned long)item, 575 sizeof(cmp1)); 576 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 577 sizeof(cmp2)); 578 579 /* 580 * we already have a pointer to this exact extent, 581 * we don't have to do anything 582 */ 583 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 584 btrfs_release_path(path); 585 goto out; 586 } 587 } 588 btrfs_release_path(path); 589 590 /* drop any overlapping extents */ 591 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 592 if (ret) 593 goto out; 594 595 if (found_type == BTRFS_FILE_EXTENT_REG || 596 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 597 u64 offset; 598 unsigned long dest_offset; 599 struct btrfs_key ins; 600 601 ret = btrfs_insert_empty_item(trans, root, path, key, 602 sizeof(*item)); 603 if (ret) 604 goto out; 605 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 606 path->slots[0]); 607 copy_extent_buffer(path->nodes[0], eb, dest_offset, 608 (unsigned long)item, sizeof(*item)); 609 610 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 611 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 612 ins.type = BTRFS_EXTENT_ITEM_KEY; 613 offset = key->offset - btrfs_file_extent_offset(eb, item); 614 615 if (ins.objectid > 0) { 616 u64 csum_start; 617 u64 csum_end; 618 LIST_HEAD(ordered_sums); 619 /* 620 * is this extent already allocated in the extent 621 * allocation tree? If so, just add a reference 622 */ 623 ret = btrfs_lookup_extent(root, ins.objectid, 624 ins.offset); 625 if (ret == 0) { 626 ret = btrfs_inc_extent_ref(trans, root, 627 ins.objectid, ins.offset, 628 0, root->root_key.objectid, 629 key->objectid, offset, 0); 630 if (ret) 631 goto out; 632 } else { 633 /* 634 * insert the extent pointer in the extent 635 * allocation tree 636 */ 637 ret = btrfs_alloc_logged_file_extent(trans, 638 root, root->root_key.objectid, 639 key->objectid, offset, &ins); 640 if (ret) 641 goto out; 642 } 643 btrfs_release_path(path); 644 645 if (btrfs_file_extent_compression(eb, item)) { 646 csum_start = ins.objectid; 647 csum_end = csum_start + ins.offset; 648 } else { 649 csum_start = ins.objectid + 650 btrfs_file_extent_offset(eb, item); 651 csum_end = csum_start + 652 btrfs_file_extent_num_bytes(eb, item); 653 } 654 655 ret = btrfs_lookup_csums_range(root->log_root, 656 csum_start, csum_end - 1, 657 &ordered_sums, 0); 658 if (ret) 659 goto out; 660 while (!list_empty(&ordered_sums)) { 661 struct btrfs_ordered_sum *sums; 662 sums = list_entry(ordered_sums.next, 663 struct btrfs_ordered_sum, 664 list); 665 if (!ret) 666 ret = btrfs_csum_file_blocks(trans, 667 root->fs_info->csum_root, 668 sums); 669 list_del(&sums->list); 670 kfree(sums); 671 } 672 if (ret) 673 goto out; 674 } else { 675 btrfs_release_path(path); 676 } 677 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 678 /* inline extents are easy, we just overwrite them */ 679 ret = overwrite_item(trans, root, path, eb, slot, key); 680 if (ret) 681 goto out; 682 } 683 684 inode_add_bytes(inode, nbytes); 685 ret = btrfs_update_inode(trans, root, inode); 686 out: 687 if (inode) 688 iput(inode); 689 return ret; 690 } 691 692 /* 693 * when cleaning up conflicts between the directory names in the 694 * subvolume, directory names in the log and directory names in the 695 * inode back references, we may have to unlink inodes from directories. 696 * 697 * This is a helper function to do the unlink of a specific directory 698 * item 699 */ 700 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 701 struct btrfs_root *root, 702 struct btrfs_path *path, 703 struct inode *dir, 704 struct btrfs_dir_item *di) 705 { 706 struct inode *inode; 707 char *name; 708 int name_len; 709 struct extent_buffer *leaf; 710 struct btrfs_key location; 711 int ret; 712 713 leaf = path->nodes[0]; 714 715 btrfs_dir_item_key_to_cpu(leaf, di, &location); 716 name_len = btrfs_dir_name_len(leaf, di); 717 name = kmalloc(name_len, GFP_NOFS); 718 if (!name) 719 return -ENOMEM; 720 721 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 722 btrfs_release_path(path); 723 724 inode = read_one_inode(root, location.objectid); 725 if (!inode) { 726 ret = -EIO; 727 goto out; 728 } 729 730 ret = link_to_fixup_dir(trans, root, path, location.objectid); 731 if (ret) 732 goto out; 733 734 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 735 if (ret) 736 goto out; 737 btrfs_run_delayed_items(trans, root); 738 out: 739 kfree(name); 740 iput(inode); 741 return ret; 742 } 743 744 /* 745 * helper function to see if a given name and sequence number found 746 * in an inode back reference are already in a directory and correctly 747 * point to this inode 748 */ 749 static noinline int inode_in_dir(struct btrfs_root *root, 750 struct btrfs_path *path, 751 u64 dirid, u64 objectid, u64 index, 752 const char *name, int name_len) 753 { 754 struct btrfs_dir_item *di; 755 struct btrfs_key location; 756 int match = 0; 757 758 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 759 index, name, name_len, 0); 760 if (di && !IS_ERR(di)) { 761 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 762 if (location.objectid != objectid) 763 goto out; 764 } else 765 goto out; 766 btrfs_release_path(path); 767 768 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 769 if (di && !IS_ERR(di)) { 770 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 771 if (location.objectid != objectid) 772 goto out; 773 } else 774 goto out; 775 match = 1; 776 out: 777 btrfs_release_path(path); 778 return match; 779 } 780 781 /* 782 * helper function to check a log tree for a named back reference in 783 * an inode. This is used to decide if a back reference that is 784 * found in the subvolume conflicts with what we find in the log. 785 * 786 * inode backreferences may have multiple refs in a single item, 787 * during replay we process one reference at a time, and we don't 788 * want to delete valid links to a file from the subvolume if that 789 * link is also in the log. 790 */ 791 static noinline int backref_in_log(struct btrfs_root *log, 792 struct btrfs_key *key, 793 u64 ref_objectid, 794 char *name, int namelen) 795 { 796 struct btrfs_path *path; 797 struct btrfs_inode_ref *ref; 798 unsigned long ptr; 799 unsigned long ptr_end; 800 unsigned long name_ptr; 801 int found_name_len; 802 int item_size; 803 int ret; 804 int match = 0; 805 806 path = btrfs_alloc_path(); 807 if (!path) 808 return -ENOMEM; 809 810 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 811 if (ret != 0) 812 goto out; 813 814 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 815 816 if (key->type == BTRFS_INODE_EXTREF_KEY) { 817 if (btrfs_find_name_in_ext_backref(path, ref_objectid, 818 name, namelen, NULL)) 819 match = 1; 820 821 goto out; 822 } 823 824 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 825 ptr_end = ptr + item_size; 826 while (ptr < ptr_end) { 827 ref = (struct btrfs_inode_ref *)ptr; 828 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 829 if (found_name_len == namelen) { 830 name_ptr = (unsigned long)(ref + 1); 831 ret = memcmp_extent_buffer(path->nodes[0], name, 832 name_ptr, namelen); 833 if (ret == 0) { 834 match = 1; 835 goto out; 836 } 837 } 838 ptr = (unsigned long)(ref + 1) + found_name_len; 839 } 840 out: 841 btrfs_free_path(path); 842 return match; 843 } 844 845 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 846 struct btrfs_root *root, 847 struct btrfs_path *path, 848 struct btrfs_root *log_root, 849 struct inode *dir, struct inode *inode, 850 struct extent_buffer *eb, 851 u64 inode_objectid, u64 parent_objectid, 852 u64 ref_index, char *name, int namelen, 853 int *search_done) 854 { 855 int ret; 856 char *victim_name; 857 int victim_name_len; 858 struct extent_buffer *leaf; 859 struct btrfs_dir_item *di; 860 struct btrfs_key search_key; 861 struct btrfs_inode_extref *extref; 862 863 again: 864 /* Search old style refs */ 865 search_key.objectid = inode_objectid; 866 search_key.type = BTRFS_INODE_REF_KEY; 867 search_key.offset = parent_objectid; 868 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 869 if (ret == 0) { 870 struct btrfs_inode_ref *victim_ref; 871 unsigned long ptr; 872 unsigned long ptr_end; 873 874 leaf = path->nodes[0]; 875 876 /* are we trying to overwrite a back ref for the root directory 877 * if so, just jump out, we're done 878 */ 879 if (search_key.objectid == search_key.offset) 880 return 1; 881 882 /* check all the names in this back reference to see 883 * if they are in the log. if so, we allow them to stay 884 * otherwise they must be unlinked as a conflict 885 */ 886 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 887 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 888 while (ptr < ptr_end) { 889 victim_ref = (struct btrfs_inode_ref *)ptr; 890 victim_name_len = btrfs_inode_ref_name_len(leaf, 891 victim_ref); 892 victim_name = kmalloc(victim_name_len, GFP_NOFS); 893 if (!victim_name) 894 return -ENOMEM; 895 896 read_extent_buffer(leaf, victim_name, 897 (unsigned long)(victim_ref + 1), 898 victim_name_len); 899 900 if (!backref_in_log(log_root, &search_key, 901 parent_objectid, 902 victim_name, 903 victim_name_len)) { 904 btrfs_inc_nlink(inode); 905 btrfs_release_path(path); 906 907 ret = btrfs_unlink_inode(trans, root, dir, 908 inode, victim_name, 909 victim_name_len); 910 kfree(victim_name); 911 if (ret) 912 return ret; 913 btrfs_run_delayed_items(trans, root); 914 *search_done = 1; 915 goto again; 916 } 917 kfree(victim_name); 918 919 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 920 } 921 922 /* 923 * NOTE: we have searched root tree and checked the 924 * coresponding ref, it does not need to check again. 925 */ 926 *search_done = 1; 927 } 928 btrfs_release_path(path); 929 930 /* Same search but for extended refs */ 931 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 932 inode_objectid, parent_objectid, 0, 933 0); 934 if (!IS_ERR_OR_NULL(extref)) { 935 u32 item_size; 936 u32 cur_offset = 0; 937 unsigned long base; 938 struct inode *victim_parent; 939 940 leaf = path->nodes[0]; 941 942 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 943 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 944 945 while (cur_offset < item_size) { 946 extref = (struct btrfs_inode_extref *)base + cur_offset; 947 948 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 949 950 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 951 goto next; 952 953 victim_name = kmalloc(victim_name_len, GFP_NOFS); 954 if (!victim_name) 955 return -ENOMEM; 956 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 957 victim_name_len); 958 959 search_key.objectid = inode_objectid; 960 search_key.type = BTRFS_INODE_EXTREF_KEY; 961 search_key.offset = btrfs_extref_hash(parent_objectid, 962 victim_name, 963 victim_name_len); 964 ret = 0; 965 if (!backref_in_log(log_root, &search_key, 966 parent_objectid, victim_name, 967 victim_name_len)) { 968 ret = -ENOENT; 969 victim_parent = read_one_inode(root, 970 parent_objectid); 971 if (victim_parent) { 972 btrfs_inc_nlink(inode); 973 btrfs_release_path(path); 974 975 ret = btrfs_unlink_inode(trans, root, 976 victim_parent, 977 inode, 978 victim_name, 979 victim_name_len); 980 btrfs_run_delayed_items(trans, root); 981 } 982 iput(victim_parent); 983 kfree(victim_name); 984 if (ret) 985 return ret; 986 *search_done = 1; 987 goto again; 988 } 989 kfree(victim_name); 990 if (ret) 991 return ret; 992 next: 993 cur_offset += victim_name_len + sizeof(*extref); 994 } 995 *search_done = 1; 996 } 997 btrfs_release_path(path); 998 999 /* look for a conflicting sequence number */ 1000 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1001 ref_index, name, namelen, 0); 1002 if (di && !IS_ERR(di)) { 1003 ret = drop_one_dir_item(trans, root, path, dir, di); 1004 if (ret) 1005 return ret; 1006 } 1007 btrfs_release_path(path); 1008 1009 /* look for a conflicing name */ 1010 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1011 name, namelen, 0); 1012 if (di && !IS_ERR(di)) { 1013 ret = drop_one_dir_item(trans, root, path, dir, di); 1014 if (ret) 1015 return ret; 1016 } 1017 btrfs_release_path(path); 1018 1019 return 0; 1020 } 1021 1022 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1023 u32 *namelen, char **name, u64 *index, 1024 u64 *parent_objectid) 1025 { 1026 struct btrfs_inode_extref *extref; 1027 1028 extref = (struct btrfs_inode_extref *)ref_ptr; 1029 1030 *namelen = btrfs_inode_extref_name_len(eb, extref); 1031 *name = kmalloc(*namelen, GFP_NOFS); 1032 if (*name == NULL) 1033 return -ENOMEM; 1034 1035 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1036 *namelen); 1037 1038 *index = btrfs_inode_extref_index(eb, extref); 1039 if (parent_objectid) 1040 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1041 1042 return 0; 1043 } 1044 1045 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1046 u32 *namelen, char **name, u64 *index) 1047 { 1048 struct btrfs_inode_ref *ref; 1049 1050 ref = (struct btrfs_inode_ref *)ref_ptr; 1051 1052 *namelen = btrfs_inode_ref_name_len(eb, ref); 1053 *name = kmalloc(*namelen, GFP_NOFS); 1054 if (*name == NULL) 1055 return -ENOMEM; 1056 1057 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1058 1059 *index = btrfs_inode_ref_index(eb, ref); 1060 1061 return 0; 1062 } 1063 1064 /* 1065 * replay one inode back reference item found in the log tree. 1066 * eb, slot and key refer to the buffer and key found in the log tree. 1067 * root is the destination we are replaying into, and path is for temp 1068 * use by this function. (it should be released on return). 1069 */ 1070 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1071 struct btrfs_root *root, 1072 struct btrfs_root *log, 1073 struct btrfs_path *path, 1074 struct extent_buffer *eb, int slot, 1075 struct btrfs_key *key) 1076 { 1077 struct inode *dir; 1078 struct inode *inode; 1079 unsigned long ref_ptr; 1080 unsigned long ref_end; 1081 char *name; 1082 int namelen; 1083 int ret; 1084 int search_done = 0; 1085 int log_ref_ver = 0; 1086 u64 parent_objectid; 1087 u64 inode_objectid; 1088 u64 ref_index = 0; 1089 int ref_struct_size; 1090 1091 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1092 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1093 1094 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1095 struct btrfs_inode_extref *r; 1096 1097 ref_struct_size = sizeof(struct btrfs_inode_extref); 1098 log_ref_ver = 1; 1099 r = (struct btrfs_inode_extref *)ref_ptr; 1100 parent_objectid = btrfs_inode_extref_parent(eb, r); 1101 } else { 1102 ref_struct_size = sizeof(struct btrfs_inode_ref); 1103 parent_objectid = key->offset; 1104 } 1105 inode_objectid = key->objectid; 1106 1107 /* 1108 * it is possible that we didn't log all the parent directories 1109 * for a given inode. If we don't find the dir, just don't 1110 * copy the back ref in. The link count fixup code will take 1111 * care of the rest 1112 */ 1113 dir = read_one_inode(root, parent_objectid); 1114 if (!dir) 1115 return -ENOENT; 1116 1117 inode = read_one_inode(root, inode_objectid); 1118 if (!inode) { 1119 iput(dir); 1120 return -EIO; 1121 } 1122 1123 while (ref_ptr < ref_end) { 1124 if (log_ref_ver) { 1125 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1126 &ref_index, &parent_objectid); 1127 /* 1128 * parent object can change from one array 1129 * item to another. 1130 */ 1131 if (!dir) 1132 dir = read_one_inode(root, parent_objectid); 1133 if (!dir) 1134 return -ENOENT; 1135 } else { 1136 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1137 &ref_index); 1138 } 1139 if (ret) 1140 return ret; 1141 1142 /* if we already have a perfect match, we're done */ 1143 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), 1144 ref_index, name, namelen)) { 1145 /* 1146 * look for a conflicting back reference in the 1147 * metadata. if we find one we have to unlink that name 1148 * of the file before we add our new link. Later on, we 1149 * overwrite any existing back reference, and we don't 1150 * want to create dangling pointers in the directory. 1151 */ 1152 1153 if (!search_done) { 1154 ret = __add_inode_ref(trans, root, path, log, 1155 dir, inode, eb, 1156 inode_objectid, 1157 parent_objectid, 1158 ref_index, name, namelen, 1159 &search_done); 1160 if (ret == 1) { 1161 ret = 0; 1162 goto out; 1163 } 1164 if (ret) 1165 goto out; 1166 } 1167 1168 /* insert our name */ 1169 ret = btrfs_add_link(trans, dir, inode, name, namelen, 1170 0, ref_index); 1171 if (ret) 1172 goto out; 1173 1174 btrfs_update_inode(trans, root, inode); 1175 } 1176 1177 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1178 kfree(name); 1179 if (log_ref_ver) { 1180 iput(dir); 1181 dir = NULL; 1182 } 1183 } 1184 1185 /* finally write the back reference in the inode */ 1186 ret = overwrite_item(trans, root, path, eb, slot, key); 1187 out: 1188 btrfs_release_path(path); 1189 iput(dir); 1190 iput(inode); 1191 return ret; 1192 } 1193 1194 static int insert_orphan_item(struct btrfs_trans_handle *trans, 1195 struct btrfs_root *root, u64 offset) 1196 { 1197 int ret; 1198 ret = btrfs_find_orphan_item(root, offset); 1199 if (ret > 0) 1200 ret = btrfs_insert_orphan_item(trans, root, offset); 1201 return ret; 1202 } 1203 1204 static int count_inode_extrefs(struct btrfs_root *root, 1205 struct inode *inode, struct btrfs_path *path) 1206 { 1207 int ret = 0; 1208 int name_len; 1209 unsigned int nlink = 0; 1210 u32 item_size; 1211 u32 cur_offset = 0; 1212 u64 inode_objectid = btrfs_ino(inode); 1213 u64 offset = 0; 1214 unsigned long ptr; 1215 struct btrfs_inode_extref *extref; 1216 struct extent_buffer *leaf; 1217 1218 while (1) { 1219 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1220 &extref, &offset); 1221 if (ret) 1222 break; 1223 1224 leaf = path->nodes[0]; 1225 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1226 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1227 1228 while (cur_offset < item_size) { 1229 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1230 name_len = btrfs_inode_extref_name_len(leaf, extref); 1231 1232 nlink++; 1233 1234 cur_offset += name_len + sizeof(*extref); 1235 } 1236 1237 offset++; 1238 btrfs_release_path(path); 1239 } 1240 btrfs_release_path(path); 1241 1242 if (ret < 0) 1243 return ret; 1244 return nlink; 1245 } 1246 1247 static int count_inode_refs(struct btrfs_root *root, 1248 struct inode *inode, struct btrfs_path *path) 1249 { 1250 int ret; 1251 struct btrfs_key key; 1252 unsigned int nlink = 0; 1253 unsigned long ptr; 1254 unsigned long ptr_end; 1255 int name_len; 1256 u64 ino = btrfs_ino(inode); 1257 1258 key.objectid = ino; 1259 key.type = BTRFS_INODE_REF_KEY; 1260 key.offset = (u64)-1; 1261 1262 while (1) { 1263 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1264 if (ret < 0) 1265 break; 1266 if (ret > 0) { 1267 if (path->slots[0] == 0) 1268 break; 1269 path->slots[0]--; 1270 } 1271 btrfs_item_key_to_cpu(path->nodes[0], &key, 1272 path->slots[0]); 1273 if (key.objectid != ino || 1274 key.type != BTRFS_INODE_REF_KEY) 1275 break; 1276 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1277 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1278 path->slots[0]); 1279 while (ptr < ptr_end) { 1280 struct btrfs_inode_ref *ref; 1281 1282 ref = (struct btrfs_inode_ref *)ptr; 1283 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1284 ref); 1285 ptr = (unsigned long)(ref + 1) + name_len; 1286 nlink++; 1287 } 1288 1289 if (key.offset == 0) 1290 break; 1291 key.offset--; 1292 btrfs_release_path(path); 1293 } 1294 btrfs_release_path(path); 1295 1296 return nlink; 1297 } 1298 1299 /* 1300 * There are a few corners where the link count of the file can't 1301 * be properly maintained during replay. So, instead of adding 1302 * lots of complexity to the log code, we just scan the backrefs 1303 * for any file that has been through replay. 1304 * 1305 * The scan will update the link count on the inode to reflect the 1306 * number of back refs found. If it goes down to zero, the iput 1307 * will free the inode. 1308 */ 1309 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1310 struct btrfs_root *root, 1311 struct inode *inode) 1312 { 1313 struct btrfs_path *path; 1314 int ret; 1315 u64 nlink = 0; 1316 u64 ino = btrfs_ino(inode); 1317 1318 path = btrfs_alloc_path(); 1319 if (!path) 1320 return -ENOMEM; 1321 1322 ret = count_inode_refs(root, inode, path); 1323 if (ret < 0) 1324 goto out; 1325 1326 nlink = ret; 1327 1328 ret = count_inode_extrefs(root, inode, path); 1329 if (ret == -ENOENT) 1330 ret = 0; 1331 1332 if (ret < 0) 1333 goto out; 1334 1335 nlink += ret; 1336 1337 ret = 0; 1338 1339 if (nlink != inode->i_nlink) { 1340 set_nlink(inode, nlink); 1341 btrfs_update_inode(trans, root, inode); 1342 } 1343 BTRFS_I(inode)->index_cnt = (u64)-1; 1344 1345 if (inode->i_nlink == 0) { 1346 if (S_ISDIR(inode->i_mode)) { 1347 ret = replay_dir_deletes(trans, root, NULL, path, 1348 ino, 1); 1349 if (ret) 1350 goto out; 1351 } 1352 ret = insert_orphan_item(trans, root, ino); 1353 } 1354 1355 out: 1356 btrfs_free_path(path); 1357 return ret; 1358 } 1359 1360 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1361 struct btrfs_root *root, 1362 struct btrfs_path *path) 1363 { 1364 int ret; 1365 struct btrfs_key key; 1366 struct inode *inode; 1367 1368 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1369 key.type = BTRFS_ORPHAN_ITEM_KEY; 1370 key.offset = (u64)-1; 1371 while (1) { 1372 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1373 if (ret < 0) 1374 break; 1375 1376 if (ret == 1) { 1377 if (path->slots[0] == 0) 1378 break; 1379 path->slots[0]--; 1380 } 1381 1382 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1383 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1384 key.type != BTRFS_ORPHAN_ITEM_KEY) 1385 break; 1386 1387 ret = btrfs_del_item(trans, root, path); 1388 if (ret) 1389 goto out; 1390 1391 btrfs_release_path(path); 1392 inode = read_one_inode(root, key.offset); 1393 if (!inode) 1394 return -EIO; 1395 1396 ret = fixup_inode_link_count(trans, root, inode); 1397 iput(inode); 1398 if (ret) 1399 goto out; 1400 1401 /* 1402 * fixup on a directory may create new entries, 1403 * make sure we always look for the highset possible 1404 * offset 1405 */ 1406 key.offset = (u64)-1; 1407 } 1408 ret = 0; 1409 out: 1410 btrfs_release_path(path); 1411 return ret; 1412 } 1413 1414 1415 /* 1416 * record a given inode in the fixup dir so we can check its link 1417 * count when replay is done. The link count is incremented here 1418 * so the inode won't go away until we check it 1419 */ 1420 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1421 struct btrfs_root *root, 1422 struct btrfs_path *path, 1423 u64 objectid) 1424 { 1425 struct btrfs_key key; 1426 int ret = 0; 1427 struct inode *inode; 1428 1429 inode = read_one_inode(root, objectid); 1430 if (!inode) 1431 return -EIO; 1432 1433 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1434 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1435 key.offset = objectid; 1436 1437 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1438 1439 btrfs_release_path(path); 1440 if (ret == 0) { 1441 if (!inode->i_nlink) 1442 set_nlink(inode, 1); 1443 else 1444 btrfs_inc_nlink(inode); 1445 ret = btrfs_update_inode(trans, root, inode); 1446 } else if (ret == -EEXIST) { 1447 ret = 0; 1448 } else { 1449 BUG(); /* Logic Error */ 1450 } 1451 iput(inode); 1452 1453 return ret; 1454 } 1455 1456 /* 1457 * when replaying the log for a directory, we only insert names 1458 * for inodes that actually exist. This means an fsync on a directory 1459 * does not implicitly fsync all the new files in it 1460 */ 1461 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1462 struct btrfs_root *root, 1463 struct btrfs_path *path, 1464 u64 dirid, u64 index, 1465 char *name, int name_len, u8 type, 1466 struct btrfs_key *location) 1467 { 1468 struct inode *inode; 1469 struct inode *dir; 1470 int ret; 1471 1472 inode = read_one_inode(root, location->objectid); 1473 if (!inode) 1474 return -ENOENT; 1475 1476 dir = read_one_inode(root, dirid); 1477 if (!dir) { 1478 iput(inode); 1479 return -EIO; 1480 } 1481 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1482 1483 /* FIXME, put inode into FIXUP list */ 1484 1485 iput(inode); 1486 iput(dir); 1487 return ret; 1488 } 1489 1490 /* 1491 * take a single entry in a log directory item and replay it into 1492 * the subvolume. 1493 * 1494 * if a conflicting item exists in the subdirectory already, 1495 * the inode it points to is unlinked and put into the link count 1496 * fix up tree. 1497 * 1498 * If a name from the log points to a file or directory that does 1499 * not exist in the FS, it is skipped. fsyncs on directories 1500 * do not force down inodes inside that directory, just changes to the 1501 * names or unlinks in a directory. 1502 */ 1503 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1504 struct btrfs_root *root, 1505 struct btrfs_path *path, 1506 struct extent_buffer *eb, 1507 struct btrfs_dir_item *di, 1508 struct btrfs_key *key) 1509 { 1510 char *name; 1511 int name_len; 1512 struct btrfs_dir_item *dst_di; 1513 struct btrfs_key found_key; 1514 struct btrfs_key log_key; 1515 struct inode *dir; 1516 u8 log_type; 1517 int exists; 1518 int ret = 0; 1519 1520 dir = read_one_inode(root, key->objectid); 1521 if (!dir) 1522 return -EIO; 1523 1524 name_len = btrfs_dir_name_len(eb, di); 1525 name = kmalloc(name_len, GFP_NOFS); 1526 if (!name) 1527 return -ENOMEM; 1528 1529 log_type = btrfs_dir_type(eb, di); 1530 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1531 name_len); 1532 1533 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1534 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1535 if (exists == 0) 1536 exists = 1; 1537 else 1538 exists = 0; 1539 btrfs_release_path(path); 1540 1541 if (key->type == BTRFS_DIR_ITEM_KEY) { 1542 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1543 name, name_len, 1); 1544 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1545 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1546 key->objectid, 1547 key->offset, name, 1548 name_len, 1); 1549 } else { 1550 /* Corruption */ 1551 ret = -EINVAL; 1552 goto out; 1553 } 1554 if (IS_ERR_OR_NULL(dst_di)) { 1555 /* we need a sequence number to insert, so we only 1556 * do inserts for the BTRFS_DIR_INDEX_KEY types 1557 */ 1558 if (key->type != BTRFS_DIR_INDEX_KEY) 1559 goto out; 1560 goto insert; 1561 } 1562 1563 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1564 /* the existing item matches the logged item */ 1565 if (found_key.objectid == log_key.objectid && 1566 found_key.type == log_key.type && 1567 found_key.offset == log_key.offset && 1568 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1569 goto out; 1570 } 1571 1572 /* 1573 * don't drop the conflicting directory entry if the inode 1574 * for the new entry doesn't exist 1575 */ 1576 if (!exists) 1577 goto out; 1578 1579 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1580 if (ret) 1581 goto out; 1582 1583 if (key->type == BTRFS_DIR_INDEX_KEY) 1584 goto insert; 1585 out: 1586 btrfs_release_path(path); 1587 kfree(name); 1588 iput(dir); 1589 return ret; 1590 1591 insert: 1592 btrfs_release_path(path); 1593 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1594 name, name_len, log_type, &log_key); 1595 if (ret && ret != -ENOENT) 1596 goto out; 1597 ret = 0; 1598 goto out; 1599 } 1600 1601 /* 1602 * find all the names in a directory item and reconcile them into 1603 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1604 * one name in a directory item, but the same code gets used for 1605 * both directory index types 1606 */ 1607 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1608 struct btrfs_root *root, 1609 struct btrfs_path *path, 1610 struct extent_buffer *eb, int slot, 1611 struct btrfs_key *key) 1612 { 1613 int ret; 1614 u32 item_size = btrfs_item_size_nr(eb, slot); 1615 struct btrfs_dir_item *di; 1616 int name_len; 1617 unsigned long ptr; 1618 unsigned long ptr_end; 1619 1620 ptr = btrfs_item_ptr_offset(eb, slot); 1621 ptr_end = ptr + item_size; 1622 while (ptr < ptr_end) { 1623 di = (struct btrfs_dir_item *)ptr; 1624 if (verify_dir_item(root, eb, di)) 1625 return -EIO; 1626 name_len = btrfs_dir_name_len(eb, di); 1627 ret = replay_one_name(trans, root, path, eb, di, key); 1628 if (ret) 1629 return ret; 1630 ptr = (unsigned long)(di + 1); 1631 ptr += name_len; 1632 } 1633 return 0; 1634 } 1635 1636 /* 1637 * directory replay has two parts. There are the standard directory 1638 * items in the log copied from the subvolume, and range items 1639 * created in the log while the subvolume was logged. 1640 * 1641 * The range items tell us which parts of the key space the log 1642 * is authoritative for. During replay, if a key in the subvolume 1643 * directory is in a logged range item, but not actually in the log 1644 * that means it was deleted from the directory before the fsync 1645 * and should be removed. 1646 */ 1647 static noinline int find_dir_range(struct btrfs_root *root, 1648 struct btrfs_path *path, 1649 u64 dirid, int key_type, 1650 u64 *start_ret, u64 *end_ret) 1651 { 1652 struct btrfs_key key; 1653 u64 found_end; 1654 struct btrfs_dir_log_item *item; 1655 int ret; 1656 int nritems; 1657 1658 if (*start_ret == (u64)-1) 1659 return 1; 1660 1661 key.objectid = dirid; 1662 key.type = key_type; 1663 key.offset = *start_ret; 1664 1665 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1666 if (ret < 0) 1667 goto out; 1668 if (ret > 0) { 1669 if (path->slots[0] == 0) 1670 goto out; 1671 path->slots[0]--; 1672 } 1673 if (ret != 0) 1674 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1675 1676 if (key.type != key_type || key.objectid != dirid) { 1677 ret = 1; 1678 goto next; 1679 } 1680 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1681 struct btrfs_dir_log_item); 1682 found_end = btrfs_dir_log_end(path->nodes[0], item); 1683 1684 if (*start_ret >= key.offset && *start_ret <= found_end) { 1685 ret = 0; 1686 *start_ret = key.offset; 1687 *end_ret = found_end; 1688 goto out; 1689 } 1690 ret = 1; 1691 next: 1692 /* check the next slot in the tree to see if it is a valid item */ 1693 nritems = btrfs_header_nritems(path->nodes[0]); 1694 if (path->slots[0] >= nritems) { 1695 ret = btrfs_next_leaf(root, path); 1696 if (ret) 1697 goto out; 1698 } else { 1699 path->slots[0]++; 1700 } 1701 1702 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1703 1704 if (key.type != key_type || key.objectid != dirid) { 1705 ret = 1; 1706 goto out; 1707 } 1708 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1709 struct btrfs_dir_log_item); 1710 found_end = btrfs_dir_log_end(path->nodes[0], item); 1711 *start_ret = key.offset; 1712 *end_ret = found_end; 1713 ret = 0; 1714 out: 1715 btrfs_release_path(path); 1716 return ret; 1717 } 1718 1719 /* 1720 * this looks for a given directory item in the log. If the directory 1721 * item is not in the log, the item is removed and the inode it points 1722 * to is unlinked 1723 */ 1724 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1725 struct btrfs_root *root, 1726 struct btrfs_root *log, 1727 struct btrfs_path *path, 1728 struct btrfs_path *log_path, 1729 struct inode *dir, 1730 struct btrfs_key *dir_key) 1731 { 1732 int ret; 1733 struct extent_buffer *eb; 1734 int slot; 1735 u32 item_size; 1736 struct btrfs_dir_item *di; 1737 struct btrfs_dir_item *log_di; 1738 int name_len; 1739 unsigned long ptr; 1740 unsigned long ptr_end; 1741 char *name; 1742 struct inode *inode; 1743 struct btrfs_key location; 1744 1745 again: 1746 eb = path->nodes[0]; 1747 slot = path->slots[0]; 1748 item_size = btrfs_item_size_nr(eb, slot); 1749 ptr = btrfs_item_ptr_offset(eb, slot); 1750 ptr_end = ptr + item_size; 1751 while (ptr < ptr_end) { 1752 di = (struct btrfs_dir_item *)ptr; 1753 if (verify_dir_item(root, eb, di)) { 1754 ret = -EIO; 1755 goto out; 1756 } 1757 1758 name_len = btrfs_dir_name_len(eb, di); 1759 name = kmalloc(name_len, GFP_NOFS); 1760 if (!name) { 1761 ret = -ENOMEM; 1762 goto out; 1763 } 1764 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1765 name_len); 1766 log_di = NULL; 1767 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1768 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1769 dir_key->objectid, 1770 name, name_len, 0); 1771 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1772 log_di = btrfs_lookup_dir_index_item(trans, log, 1773 log_path, 1774 dir_key->objectid, 1775 dir_key->offset, 1776 name, name_len, 0); 1777 } 1778 if (IS_ERR_OR_NULL(log_di)) { 1779 btrfs_dir_item_key_to_cpu(eb, di, &location); 1780 btrfs_release_path(path); 1781 btrfs_release_path(log_path); 1782 inode = read_one_inode(root, location.objectid); 1783 if (!inode) { 1784 kfree(name); 1785 return -EIO; 1786 } 1787 1788 ret = link_to_fixup_dir(trans, root, 1789 path, location.objectid); 1790 if (ret) { 1791 kfree(name); 1792 iput(inode); 1793 goto out; 1794 } 1795 1796 btrfs_inc_nlink(inode); 1797 ret = btrfs_unlink_inode(trans, root, dir, inode, 1798 name, name_len); 1799 if (!ret) 1800 btrfs_run_delayed_items(trans, root); 1801 kfree(name); 1802 iput(inode); 1803 if (ret) 1804 goto out; 1805 1806 /* there might still be more names under this key 1807 * check and repeat if required 1808 */ 1809 ret = btrfs_search_slot(NULL, root, dir_key, path, 1810 0, 0); 1811 if (ret == 0) 1812 goto again; 1813 ret = 0; 1814 goto out; 1815 } 1816 btrfs_release_path(log_path); 1817 kfree(name); 1818 1819 ptr = (unsigned long)(di + 1); 1820 ptr += name_len; 1821 } 1822 ret = 0; 1823 out: 1824 btrfs_release_path(path); 1825 btrfs_release_path(log_path); 1826 return ret; 1827 } 1828 1829 /* 1830 * deletion replay happens before we copy any new directory items 1831 * out of the log or out of backreferences from inodes. It 1832 * scans the log to find ranges of keys that log is authoritative for, 1833 * and then scans the directory to find items in those ranges that are 1834 * not present in the log. 1835 * 1836 * Anything we don't find in the log is unlinked and removed from the 1837 * directory. 1838 */ 1839 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1840 struct btrfs_root *root, 1841 struct btrfs_root *log, 1842 struct btrfs_path *path, 1843 u64 dirid, int del_all) 1844 { 1845 u64 range_start; 1846 u64 range_end; 1847 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1848 int ret = 0; 1849 struct btrfs_key dir_key; 1850 struct btrfs_key found_key; 1851 struct btrfs_path *log_path; 1852 struct inode *dir; 1853 1854 dir_key.objectid = dirid; 1855 dir_key.type = BTRFS_DIR_ITEM_KEY; 1856 log_path = btrfs_alloc_path(); 1857 if (!log_path) 1858 return -ENOMEM; 1859 1860 dir = read_one_inode(root, dirid); 1861 /* it isn't an error if the inode isn't there, that can happen 1862 * because we replay the deletes before we copy in the inode item 1863 * from the log 1864 */ 1865 if (!dir) { 1866 btrfs_free_path(log_path); 1867 return 0; 1868 } 1869 again: 1870 range_start = 0; 1871 range_end = 0; 1872 while (1) { 1873 if (del_all) 1874 range_end = (u64)-1; 1875 else { 1876 ret = find_dir_range(log, path, dirid, key_type, 1877 &range_start, &range_end); 1878 if (ret != 0) 1879 break; 1880 } 1881 1882 dir_key.offset = range_start; 1883 while (1) { 1884 int nritems; 1885 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1886 0, 0); 1887 if (ret < 0) 1888 goto out; 1889 1890 nritems = btrfs_header_nritems(path->nodes[0]); 1891 if (path->slots[0] >= nritems) { 1892 ret = btrfs_next_leaf(root, path); 1893 if (ret) 1894 break; 1895 } 1896 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1897 path->slots[0]); 1898 if (found_key.objectid != dirid || 1899 found_key.type != dir_key.type) 1900 goto next_type; 1901 1902 if (found_key.offset > range_end) 1903 break; 1904 1905 ret = check_item_in_log(trans, root, log, path, 1906 log_path, dir, 1907 &found_key); 1908 if (ret) 1909 goto out; 1910 if (found_key.offset == (u64)-1) 1911 break; 1912 dir_key.offset = found_key.offset + 1; 1913 } 1914 btrfs_release_path(path); 1915 if (range_end == (u64)-1) 1916 break; 1917 range_start = range_end + 1; 1918 } 1919 1920 next_type: 1921 ret = 0; 1922 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1923 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1924 dir_key.type = BTRFS_DIR_INDEX_KEY; 1925 btrfs_release_path(path); 1926 goto again; 1927 } 1928 out: 1929 btrfs_release_path(path); 1930 btrfs_free_path(log_path); 1931 iput(dir); 1932 return ret; 1933 } 1934 1935 /* 1936 * the process_func used to replay items from the log tree. This 1937 * gets called in two different stages. The first stage just looks 1938 * for inodes and makes sure they are all copied into the subvolume. 1939 * 1940 * The second stage copies all the other item types from the log into 1941 * the subvolume. The two stage approach is slower, but gets rid of 1942 * lots of complexity around inodes referencing other inodes that exist 1943 * only in the log (references come from either directory items or inode 1944 * back refs). 1945 */ 1946 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 1947 struct walk_control *wc, u64 gen) 1948 { 1949 int nritems; 1950 struct btrfs_path *path; 1951 struct btrfs_root *root = wc->replay_dest; 1952 struct btrfs_key key; 1953 int level; 1954 int i; 1955 int ret; 1956 1957 ret = btrfs_read_buffer(eb, gen); 1958 if (ret) 1959 return ret; 1960 1961 level = btrfs_header_level(eb); 1962 1963 if (level != 0) 1964 return 0; 1965 1966 path = btrfs_alloc_path(); 1967 if (!path) 1968 return -ENOMEM; 1969 1970 nritems = btrfs_header_nritems(eb); 1971 for (i = 0; i < nritems; i++) { 1972 btrfs_item_key_to_cpu(eb, &key, i); 1973 1974 /* inode keys are done during the first stage */ 1975 if (key.type == BTRFS_INODE_ITEM_KEY && 1976 wc->stage == LOG_WALK_REPLAY_INODES) { 1977 struct btrfs_inode_item *inode_item; 1978 u32 mode; 1979 1980 inode_item = btrfs_item_ptr(eb, i, 1981 struct btrfs_inode_item); 1982 mode = btrfs_inode_mode(eb, inode_item); 1983 if (S_ISDIR(mode)) { 1984 ret = replay_dir_deletes(wc->trans, 1985 root, log, path, key.objectid, 0); 1986 if (ret) 1987 break; 1988 } 1989 ret = overwrite_item(wc->trans, root, path, 1990 eb, i, &key); 1991 if (ret) 1992 break; 1993 1994 /* for regular files, make sure corresponding 1995 * orhpan item exist. extents past the new EOF 1996 * will be truncated later by orphan cleanup. 1997 */ 1998 if (S_ISREG(mode)) { 1999 ret = insert_orphan_item(wc->trans, root, 2000 key.objectid); 2001 if (ret) 2002 break; 2003 } 2004 2005 ret = link_to_fixup_dir(wc->trans, root, 2006 path, key.objectid); 2007 if (ret) 2008 break; 2009 } 2010 if (wc->stage < LOG_WALK_REPLAY_ALL) 2011 continue; 2012 2013 /* these keys are simply copied */ 2014 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2015 ret = overwrite_item(wc->trans, root, path, 2016 eb, i, &key); 2017 if (ret) 2018 break; 2019 } else if (key.type == BTRFS_INODE_REF_KEY) { 2020 ret = add_inode_ref(wc->trans, root, log, path, 2021 eb, i, &key); 2022 if (ret && ret != -ENOENT) 2023 break; 2024 ret = 0; 2025 } else if (key.type == BTRFS_INODE_EXTREF_KEY) { 2026 ret = add_inode_ref(wc->trans, root, log, path, 2027 eb, i, &key); 2028 if (ret && ret != -ENOENT) 2029 break; 2030 ret = 0; 2031 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2032 ret = replay_one_extent(wc->trans, root, path, 2033 eb, i, &key); 2034 if (ret) 2035 break; 2036 } else if (key.type == BTRFS_DIR_ITEM_KEY || 2037 key.type == BTRFS_DIR_INDEX_KEY) { 2038 ret = replay_one_dir_item(wc->trans, root, path, 2039 eb, i, &key); 2040 if (ret) 2041 break; 2042 } 2043 } 2044 btrfs_free_path(path); 2045 return ret; 2046 } 2047 2048 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2049 struct btrfs_root *root, 2050 struct btrfs_path *path, int *level, 2051 struct walk_control *wc) 2052 { 2053 u64 root_owner; 2054 u64 bytenr; 2055 u64 ptr_gen; 2056 struct extent_buffer *next; 2057 struct extent_buffer *cur; 2058 struct extent_buffer *parent; 2059 u32 blocksize; 2060 int ret = 0; 2061 2062 WARN_ON(*level < 0); 2063 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2064 2065 while (*level > 0) { 2066 WARN_ON(*level < 0); 2067 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2068 cur = path->nodes[*level]; 2069 2070 if (btrfs_header_level(cur) != *level) 2071 WARN_ON(1); 2072 2073 if (path->slots[*level] >= 2074 btrfs_header_nritems(cur)) 2075 break; 2076 2077 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2078 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2079 blocksize = btrfs_level_size(root, *level - 1); 2080 2081 parent = path->nodes[*level]; 2082 root_owner = btrfs_header_owner(parent); 2083 2084 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 2085 if (!next) 2086 return -ENOMEM; 2087 2088 if (*level == 1) { 2089 ret = wc->process_func(root, next, wc, ptr_gen); 2090 if (ret) { 2091 free_extent_buffer(next); 2092 return ret; 2093 } 2094 2095 path->slots[*level]++; 2096 if (wc->free) { 2097 ret = btrfs_read_buffer(next, ptr_gen); 2098 if (ret) { 2099 free_extent_buffer(next); 2100 return ret; 2101 } 2102 2103 btrfs_tree_lock(next); 2104 btrfs_set_lock_blocking(next); 2105 clean_tree_block(trans, root, next); 2106 btrfs_wait_tree_block_writeback(next); 2107 btrfs_tree_unlock(next); 2108 2109 WARN_ON(root_owner != 2110 BTRFS_TREE_LOG_OBJECTID); 2111 ret = btrfs_free_and_pin_reserved_extent(root, 2112 bytenr, blocksize); 2113 if (ret) { 2114 free_extent_buffer(next); 2115 return ret; 2116 } 2117 } 2118 free_extent_buffer(next); 2119 continue; 2120 } 2121 ret = btrfs_read_buffer(next, ptr_gen); 2122 if (ret) { 2123 free_extent_buffer(next); 2124 return ret; 2125 } 2126 2127 WARN_ON(*level <= 0); 2128 if (path->nodes[*level-1]) 2129 free_extent_buffer(path->nodes[*level-1]); 2130 path->nodes[*level-1] = next; 2131 *level = btrfs_header_level(next); 2132 path->slots[*level] = 0; 2133 cond_resched(); 2134 } 2135 WARN_ON(*level < 0); 2136 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2137 2138 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2139 2140 cond_resched(); 2141 return 0; 2142 } 2143 2144 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2145 struct btrfs_root *root, 2146 struct btrfs_path *path, int *level, 2147 struct walk_control *wc) 2148 { 2149 u64 root_owner; 2150 int i; 2151 int slot; 2152 int ret; 2153 2154 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2155 slot = path->slots[i]; 2156 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2157 path->slots[i]++; 2158 *level = i; 2159 WARN_ON(*level == 0); 2160 return 0; 2161 } else { 2162 struct extent_buffer *parent; 2163 if (path->nodes[*level] == root->node) 2164 parent = path->nodes[*level]; 2165 else 2166 parent = path->nodes[*level + 1]; 2167 2168 root_owner = btrfs_header_owner(parent); 2169 ret = wc->process_func(root, path->nodes[*level], wc, 2170 btrfs_header_generation(path->nodes[*level])); 2171 if (ret) 2172 return ret; 2173 2174 if (wc->free) { 2175 struct extent_buffer *next; 2176 2177 next = path->nodes[*level]; 2178 2179 btrfs_tree_lock(next); 2180 btrfs_set_lock_blocking(next); 2181 clean_tree_block(trans, root, next); 2182 btrfs_wait_tree_block_writeback(next); 2183 btrfs_tree_unlock(next); 2184 2185 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2186 ret = btrfs_free_and_pin_reserved_extent(root, 2187 path->nodes[*level]->start, 2188 path->nodes[*level]->len); 2189 if (ret) 2190 return ret; 2191 } 2192 free_extent_buffer(path->nodes[*level]); 2193 path->nodes[*level] = NULL; 2194 *level = i + 1; 2195 } 2196 } 2197 return 1; 2198 } 2199 2200 /* 2201 * drop the reference count on the tree rooted at 'snap'. This traverses 2202 * the tree freeing any blocks that have a ref count of zero after being 2203 * decremented. 2204 */ 2205 static int walk_log_tree(struct btrfs_trans_handle *trans, 2206 struct btrfs_root *log, struct walk_control *wc) 2207 { 2208 int ret = 0; 2209 int wret; 2210 int level; 2211 struct btrfs_path *path; 2212 int orig_level; 2213 2214 path = btrfs_alloc_path(); 2215 if (!path) 2216 return -ENOMEM; 2217 2218 level = btrfs_header_level(log->node); 2219 orig_level = level; 2220 path->nodes[level] = log->node; 2221 extent_buffer_get(log->node); 2222 path->slots[level] = 0; 2223 2224 while (1) { 2225 wret = walk_down_log_tree(trans, log, path, &level, wc); 2226 if (wret > 0) 2227 break; 2228 if (wret < 0) { 2229 ret = wret; 2230 goto out; 2231 } 2232 2233 wret = walk_up_log_tree(trans, log, path, &level, wc); 2234 if (wret > 0) 2235 break; 2236 if (wret < 0) { 2237 ret = wret; 2238 goto out; 2239 } 2240 } 2241 2242 /* was the root node processed? if not, catch it here */ 2243 if (path->nodes[orig_level]) { 2244 ret = wc->process_func(log, path->nodes[orig_level], wc, 2245 btrfs_header_generation(path->nodes[orig_level])); 2246 if (ret) 2247 goto out; 2248 if (wc->free) { 2249 struct extent_buffer *next; 2250 2251 next = path->nodes[orig_level]; 2252 2253 btrfs_tree_lock(next); 2254 btrfs_set_lock_blocking(next); 2255 clean_tree_block(trans, log, next); 2256 btrfs_wait_tree_block_writeback(next); 2257 btrfs_tree_unlock(next); 2258 2259 WARN_ON(log->root_key.objectid != 2260 BTRFS_TREE_LOG_OBJECTID); 2261 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2262 next->len); 2263 if (ret) 2264 goto out; 2265 } 2266 } 2267 2268 out: 2269 btrfs_free_path(path); 2270 return ret; 2271 } 2272 2273 /* 2274 * helper function to update the item for a given subvolumes log root 2275 * in the tree of log roots 2276 */ 2277 static int update_log_root(struct btrfs_trans_handle *trans, 2278 struct btrfs_root *log) 2279 { 2280 int ret; 2281 2282 if (log->log_transid == 1) { 2283 /* insert root item on the first sync */ 2284 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2285 &log->root_key, &log->root_item); 2286 } else { 2287 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2288 &log->root_key, &log->root_item); 2289 } 2290 return ret; 2291 } 2292 2293 static int wait_log_commit(struct btrfs_trans_handle *trans, 2294 struct btrfs_root *root, unsigned long transid) 2295 { 2296 DEFINE_WAIT(wait); 2297 int index = transid % 2; 2298 2299 /* 2300 * we only allow two pending log transactions at a time, 2301 * so we know that if ours is more than 2 older than the 2302 * current transaction, we're done 2303 */ 2304 do { 2305 prepare_to_wait(&root->log_commit_wait[index], 2306 &wait, TASK_UNINTERRUPTIBLE); 2307 mutex_unlock(&root->log_mutex); 2308 2309 if (root->fs_info->last_trans_log_full_commit != 2310 trans->transid && root->log_transid < transid + 2 && 2311 atomic_read(&root->log_commit[index])) 2312 schedule(); 2313 2314 finish_wait(&root->log_commit_wait[index], &wait); 2315 mutex_lock(&root->log_mutex); 2316 } while (root->fs_info->last_trans_log_full_commit != 2317 trans->transid && root->log_transid < transid + 2 && 2318 atomic_read(&root->log_commit[index])); 2319 return 0; 2320 } 2321 2322 static void wait_for_writer(struct btrfs_trans_handle *trans, 2323 struct btrfs_root *root) 2324 { 2325 DEFINE_WAIT(wait); 2326 while (root->fs_info->last_trans_log_full_commit != 2327 trans->transid && atomic_read(&root->log_writers)) { 2328 prepare_to_wait(&root->log_writer_wait, 2329 &wait, TASK_UNINTERRUPTIBLE); 2330 mutex_unlock(&root->log_mutex); 2331 if (root->fs_info->last_trans_log_full_commit != 2332 trans->transid && atomic_read(&root->log_writers)) 2333 schedule(); 2334 mutex_lock(&root->log_mutex); 2335 finish_wait(&root->log_writer_wait, &wait); 2336 } 2337 } 2338 2339 /* 2340 * btrfs_sync_log does sends a given tree log down to the disk and 2341 * updates the super blocks to record it. When this call is done, 2342 * you know that any inodes previously logged are safely on disk only 2343 * if it returns 0. 2344 * 2345 * Any other return value means you need to call btrfs_commit_transaction. 2346 * Some of the edge cases for fsyncing directories that have had unlinks 2347 * or renames done in the past mean that sometimes the only safe 2348 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2349 * that has happened. 2350 */ 2351 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2352 struct btrfs_root *root) 2353 { 2354 int index1; 2355 int index2; 2356 int mark; 2357 int ret; 2358 struct btrfs_root *log = root->log_root; 2359 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2360 unsigned long log_transid = 0; 2361 2362 mutex_lock(&root->log_mutex); 2363 log_transid = root->log_transid; 2364 index1 = root->log_transid % 2; 2365 if (atomic_read(&root->log_commit[index1])) { 2366 wait_log_commit(trans, root, root->log_transid); 2367 mutex_unlock(&root->log_mutex); 2368 return 0; 2369 } 2370 atomic_set(&root->log_commit[index1], 1); 2371 2372 /* wait for previous tree log sync to complete */ 2373 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2374 wait_log_commit(trans, root, root->log_transid - 1); 2375 while (1) { 2376 int batch = atomic_read(&root->log_batch); 2377 /* when we're on an ssd, just kick the log commit out */ 2378 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2379 mutex_unlock(&root->log_mutex); 2380 schedule_timeout_uninterruptible(1); 2381 mutex_lock(&root->log_mutex); 2382 } 2383 wait_for_writer(trans, root); 2384 if (batch == atomic_read(&root->log_batch)) 2385 break; 2386 } 2387 2388 /* bail out if we need to do a full commit */ 2389 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2390 ret = -EAGAIN; 2391 btrfs_free_logged_extents(log, log_transid); 2392 mutex_unlock(&root->log_mutex); 2393 goto out; 2394 } 2395 2396 if (log_transid % 2 == 0) 2397 mark = EXTENT_DIRTY; 2398 else 2399 mark = EXTENT_NEW; 2400 2401 /* we start IO on all the marked extents here, but we don't actually 2402 * wait for them until later. 2403 */ 2404 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2405 if (ret) { 2406 btrfs_abort_transaction(trans, root, ret); 2407 btrfs_free_logged_extents(log, log_transid); 2408 mutex_unlock(&root->log_mutex); 2409 goto out; 2410 } 2411 2412 btrfs_set_root_node(&log->root_item, log->node); 2413 2414 root->log_transid++; 2415 log->log_transid = root->log_transid; 2416 root->log_start_pid = 0; 2417 smp_mb(); 2418 /* 2419 * IO has been started, blocks of the log tree have WRITTEN flag set 2420 * in their headers. new modifications of the log will be written to 2421 * new positions. so it's safe to allow log writers to go in. 2422 */ 2423 mutex_unlock(&root->log_mutex); 2424 2425 mutex_lock(&log_root_tree->log_mutex); 2426 atomic_inc(&log_root_tree->log_batch); 2427 atomic_inc(&log_root_tree->log_writers); 2428 mutex_unlock(&log_root_tree->log_mutex); 2429 2430 ret = update_log_root(trans, log); 2431 2432 mutex_lock(&log_root_tree->log_mutex); 2433 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2434 smp_mb(); 2435 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2436 wake_up(&log_root_tree->log_writer_wait); 2437 } 2438 2439 if (ret) { 2440 if (ret != -ENOSPC) { 2441 btrfs_abort_transaction(trans, root, ret); 2442 mutex_unlock(&log_root_tree->log_mutex); 2443 goto out; 2444 } 2445 root->fs_info->last_trans_log_full_commit = trans->transid; 2446 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2447 btrfs_free_logged_extents(log, log_transid); 2448 mutex_unlock(&log_root_tree->log_mutex); 2449 ret = -EAGAIN; 2450 goto out; 2451 } 2452 2453 index2 = log_root_tree->log_transid % 2; 2454 if (atomic_read(&log_root_tree->log_commit[index2])) { 2455 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2456 wait_log_commit(trans, log_root_tree, 2457 log_root_tree->log_transid); 2458 btrfs_free_logged_extents(log, log_transid); 2459 mutex_unlock(&log_root_tree->log_mutex); 2460 ret = 0; 2461 goto out; 2462 } 2463 atomic_set(&log_root_tree->log_commit[index2], 1); 2464 2465 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2466 wait_log_commit(trans, log_root_tree, 2467 log_root_tree->log_transid - 1); 2468 } 2469 2470 wait_for_writer(trans, log_root_tree); 2471 2472 /* 2473 * now that we've moved on to the tree of log tree roots, 2474 * check the full commit flag again 2475 */ 2476 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2477 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2478 btrfs_free_logged_extents(log, log_transid); 2479 mutex_unlock(&log_root_tree->log_mutex); 2480 ret = -EAGAIN; 2481 goto out_wake_log_root; 2482 } 2483 2484 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2485 &log_root_tree->dirty_log_pages, 2486 EXTENT_DIRTY | EXTENT_NEW); 2487 if (ret) { 2488 btrfs_abort_transaction(trans, root, ret); 2489 btrfs_free_logged_extents(log, log_transid); 2490 mutex_unlock(&log_root_tree->log_mutex); 2491 goto out_wake_log_root; 2492 } 2493 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2494 btrfs_wait_logged_extents(log, log_transid); 2495 2496 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2497 log_root_tree->node->start); 2498 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2499 btrfs_header_level(log_root_tree->node)); 2500 2501 log_root_tree->log_transid++; 2502 smp_mb(); 2503 2504 mutex_unlock(&log_root_tree->log_mutex); 2505 2506 /* 2507 * nobody else is going to jump in and write the the ctree 2508 * super here because the log_commit atomic below is protecting 2509 * us. We must be called with a transaction handle pinning 2510 * the running transaction open, so a full commit can't hop 2511 * in and cause problems either. 2512 */ 2513 btrfs_scrub_pause_super(root); 2514 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2515 btrfs_scrub_continue_super(root); 2516 if (ret) { 2517 btrfs_abort_transaction(trans, root, ret); 2518 goto out_wake_log_root; 2519 } 2520 2521 mutex_lock(&root->log_mutex); 2522 if (root->last_log_commit < log_transid) 2523 root->last_log_commit = log_transid; 2524 mutex_unlock(&root->log_mutex); 2525 2526 out_wake_log_root: 2527 atomic_set(&log_root_tree->log_commit[index2], 0); 2528 smp_mb(); 2529 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2530 wake_up(&log_root_tree->log_commit_wait[index2]); 2531 out: 2532 atomic_set(&root->log_commit[index1], 0); 2533 smp_mb(); 2534 if (waitqueue_active(&root->log_commit_wait[index1])) 2535 wake_up(&root->log_commit_wait[index1]); 2536 return ret; 2537 } 2538 2539 static void free_log_tree(struct btrfs_trans_handle *trans, 2540 struct btrfs_root *log) 2541 { 2542 int ret; 2543 u64 start; 2544 u64 end; 2545 struct walk_control wc = { 2546 .free = 1, 2547 .process_func = process_one_buffer 2548 }; 2549 2550 if (trans) { 2551 ret = walk_log_tree(trans, log, &wc); 2552 2553 /* I don't think this can happen but just in case */ 2554 if (ret) 2555 btrfs_abort_transaction(trans, log, ret); 2556 } 2557 2558 while (1) { 2559 ret = find_first_extent_bit(&log->dirty_log_pages, 2560 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2561 NULL); 2562 if (ret) 2563 break; 2564 2565 clear_extent_bits(&log->dirty_log_pages, start, end, 2566 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2567 } 2568 2569 /* 2570 * We may have short-circuited the log tree with the full commit logic 2571 * and left ordered extents on our list, so clear these out to keep us 2572 * from leaking inodes and memory. 2573 */ 2574 btrfs_free_logged_extents(log, 0); 2575 btrfs_free_logged_extents(log, 1); 2576 2577 free_extent_buffer(log->node); 2578 kfree(log); 2579 } 2580 2581 /* 2582 * free all the extents used by the tree log. This should be called 2583 * at commit time of the full transaction 2584 */ 2585 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2586 { 2587 if (root->log_root) { 2588 free_log_tree(trans, root->log_root); 2589 root->log_root = NULL; 2590 } 2591 return 0; 2592 } 2593 2594 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 2595 struct btrfs_fs_info *fs_info) 2596 { 2597 if (fs_info->log_root_tree) { 2598 free_log_tree(trans, fs_info->log_root_tree); 2599 fs_info->log_root_tree = NULL; 2600 } 2601 return 0; 2602 } 2603 2604 /* 2605 * If both a file and directory are logged, and unlinks or renames are 2606 * mixed in, we have a few interesting corners: 2607 * 2608 * create file X in dir Y 2609 * link file X to X.link in dir Y 2610 * fsync file X 2611 * unlink file X but leave X.link 2612 * fsync dir Y 2613 * 2614 * After a crash we would expect only X.link to exist. But file X 2615 * didn't get fsync'd again so the log has back refs for X and X.link. 2616 * 2617 * We solve this by removing directory entries and inode backrefs from the 2618 * log when a file that was logged in the current transaction is 2619 * unlinked. Any later fsync will include the updated log entries, and 2620 * we'll be able to reconstruct the proper directory items from backrefs. 2621 * 2622 * This optimizations allows us to avoid relogging the entire inode 2623 * or the entire directory. 2624 */ 2625 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2626 struct btrfs_root *root, 2627 const char *name, int name_len, 2628 struct inode *dir, u64 index) 2629 { 2630 struct btrfs_root *log; 2631 struct btrfs_dir_item *di; 2632 struct btrfs_path *path; 2633 int ret; 2634 int err = 0; 2635 int bytes_del = 0; 2636 u64 dir_ino = btrfs_ino(dir); 2637 2638 if (BTRFS_I(dir)->logged_trans < trans->transid) 2639 return 0; 2640 2641 ret = join_running_log_trans(root); 2642 if (ret) 2643 return 0; 2644 2645 mutex_lock(&BTRFS_I(dir)->log_mutex); 2646 2647 log = root->log_root; 2648 path = btrfs_alloc_path(); 2649 if (!path) { 2650 err = -ENOMEM; 2651 goto out_unlock; 2652 } 2653 2654 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2655 name, name_len, -1); 2656 if (IS_ERR(di)) { 2657 err = PTR_ERR(di); 2658 goto fail; 2659 } 2660 if (di) { 2661 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2662 bytes_del += name_len; 2663 if (ret) { 2664 err = ret; 2665 goto fail; 2666 } 2667 } 2668 btrfs_release_path(path); 2669 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2670 index, name, name_len, -1); 2671 if (IS_ERR(di)) { 2672 err = PTR_ERR(di); 2673 goto fail; 2674 } 2675 if (di) { 2676 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2677 bytes_del += name_len; 2678 if (ret) { 2679 err = ret; 2680 goto fail; 2681 } 2682 } 2683 2684 /* update the directory size in the log to reflect the names 2685 * we have removed 2686 */ 2687 if (bytes_del) { 2688 struct btrfs_key key; 2689 2690 key.objectid = dir_ino; 2691 key.offset = 0; 2692 key.type = BTRFS_INODE_ITEM_KEY; 2693 btrfs_release_path(path); 2694 2695 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2696 if (ret < 0) { 2697 err = ret; 2698 goto fail; 2699 } 2700 if (ret == 0) { 2701 struct btrfs_inode_item *item; 2702 u64 i_size; 2703 2704 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2705 struct btrfs_inode_item); 2706 i_size = btrfs_inode_size(path->nodes[0], item); 2707 if (i_size > bytes_del) 2708 i_size -= bytes_del; 2709 else 2710 i_size = 0; 2711 btrfs_set_inode_size(path->nodes[0], item, i_size); 2712 btrfs_mark_buffer_dirty(path->nodes[0]); 2713 } else 2714 ret = 0; 2715 btrfs_release_path(path); 2716 } 2717 fail: 2718 btrfs_free_path(path); 2719 out_unlock: 2720 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2721 if (ret == -ENOSPC) { 2722 root->fs_info->last_trans_log_full_commit = trans->transid; 2723 ret = 0; 2724 } else if (ret < 0) 2725 btrfs_abort_transaction(trans, root, ret); 2726 2727 btrfs_end_log_trans(root); 2728 2729 return err; 2730 } 2731 2732 /* see comments for btrfs_del_dir_entries_in_log */ 2733 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2734 struct btrfs_root *root, 2735 const char *name, int name_len, 2736 struct inode *inode, u64 dirid) 2737 { 2738 struct btrfs_root *log; 2739 u64 index; 2740 int ret; 2741 2742 if (BTRFS_I(inode)->logged_trans < trans->transid) 2743 return 0; 2744 2745 ret = join_running_log_trans(root); 2746 if (ret) 2747 return 0; 2748 log = root->log_root; 2749 mutex_lock(&BTRFS_I(inode)->log_mutex); 2750 2751 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 2752 dirid, &index); 2753 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2754 if (ret == -ENOSPC) { 2755 root->fs_info->last_trans_log_full_commit = trans->transid; 2756 ret = 0; 2757 } else if (ret < 0 && ret != -ENOENT) 2758 btrfs_abort_transaction(trans, root, ret); 2759 btrfs_end_log_trans(root); 2760 2761 return ret; 2762 } 2763 2764 /* 2765 * creates a range item in the log for 'dirid'. first_offset and 2766 * last_offset tell us which parts of the key space the log should 2767 * be considered authoritative for. 2768 */ 2769 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2770 struct btrfs_root *log, 2771 struct btrfs_path *path, 2772 int key_type, u64 dirid, 2773 u64 first_offset, u64 last_offset) 2774 { 2775 int ret; 2776 struct btrfs_key key; 2777 struct btrfs_dir_log_item *item; 2778 2779 key.objectid = dirid; 2780 key.offset = first_offset; 2781 if (key_type == BTRFS_DIR_ITEM_KEY) 2782 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2783 else 2784 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2785 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2786 if (ret) 2787 return ret; 2788 2789 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2790 struct btrfs_dir_log_item); 2791 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2792 btrfs_mark_buffer_dirty(path->nodes[0]); 2793 btrfs_release_path(path); 2794 return 0; 2795 } 2796 2797 /* 2798 * log all the items included in the current transaction for a given 2799 * directory. This also creates the range items in the log tree required 2800 * to replay anything deleted before the fsync 2801 */ 2802 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2803 struct btrfs_root *root, struct inode *inode, 2804 struct btrfs_path *path, 2805 struct btrfs_path *dst_path, int key_type, 2806 u64 min_offset, u64 *last_offset_ret) 2807 { 2808 struct btrfs_key min_key; 2809 struct btrfs_key max_key; 2810 struct btrfs_root *log = root->log_root; 2811 struct extent_buffer *src; 2812 int err = 0; 2813 int ret; 2814 int i; 2815 int nritems; 2816 u64 first_offset = min_offset; 2817 u64 last_offset = (u64)-1; 2818 u64 ino = btrfs_ino(inode); 2819 2820 log = root->log_root; 2821 max_key.objectid = ino; 2822 max_key.offset = (u64)-1; 2823 max_key.type = key_type; 2824 2825 min_key.objectid = ino; 2826 min_key.type = key_type; 2827 min_key.offset = min_offset; 2828 2829 path->keep_locks = 1; 2830 2831 ret = btrfs_search_forward(root, &min_key, &max_key, 2832 path, trans->transid); 2833 2834 /* 2835 * we didn't find anything from this transaction, see if there 2836 * is anything at all 2837 */ 2838 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 2839 min_key.objectid = ino; 2840 min_key.type = key_type; 2841 min_key.offset = (u64)-1; 2842 btrfs_release_path(path); 2843 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2844 if (ret < 0) { 2845 btrfs_release_path(path); 2846 return ret; 2847 } 2848 ret = btrfs_previous_item(root, path, ino, key_type); 2849 2850 /* if ret == 0 there are items for this type, 2851 * create a range to tell us the last key of this type. 2852 * otherwise, there are no items in this directory after 2853 * *min_offset, and we create a range to indicate that. 2854 */ 2855 if (ret == 0) { 2856 struct btrfs_key tmp; 2857 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2858 path->slots[0]); 2859 if (key_type == tmp.type) 2860 first_offset = max(min_offset, tmp.offset) + 1; 2861 } 2862 goto done; 2863 } 2864 2865 /* go backward to find any previous key */ 2866 ret = btrfs_previous_item(root, path, ino, key_type); 2867 if (ret == 0) { 2868 struct btrfs_key tmp; 2869 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2870 if (key_type == tmp.type) { 2871 first_offset = tmp.offset; 2872 ret = overwrite_item(trans, log, dst_path, 2873 path->nodes[0], path->slots[0], 2874 &tmp); 2875 if (ret) { 2876 err = ret; 2877 goto done; 2878 } 2879 } 2880 } 2881 btrfs_release_path(path); 2882 2883 /* find the first key from this transaction again */ 2884 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2885 if (ret != 0) { 2886 WARN_ON(1); 2887 goto done; 2888 } 2889 2890 /* 2891 * we have a block from this transaction, log every item in it 2892 * from our directory 2893 */ 2894 while (1) { 2895 struct btrfs_key tmp; 2896 src = path->nodes[0]; 2897 nritems = btrfs_header_nritems(src); 2898 for (i = path->slots[0]; i < nritems; i++) { 2899 btrfs_item_key_to_cpu(src, &min_key, i); 2900 2901 if (min_key.objectid != ino || min_key.type != key_type) 2902 goto done; 2903 ret = overwrite_item(trans, log, dst_path, src, i, 2904 &min_key); 2905 if (ret) { 2906 err = ret; 2907 goto done; 2908 } 2909 } 2910 path->slots[0] = nritems; 2911 2912 /* 2913 * look ahead to the next item and see if it is also 2914 * from this directory and from this transaction 2915 */ 2916 ret = btrfs_next_leaf(root, path); 2917 if (ret == 1) { 2918 last_offset = (u64)-1; 2919 goto done; 2920 } 2921 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2922 if (tmp.objectid != ino || tmp.type != key_type) { 2923 last_offset = (u64)-1; 2924 goto done; 2925 } 2926 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2927 ret = overwrite_item(trans, log, dst_path, 2928 path->nodes[0], path->slots[0], 2929 &tmp); 2930 if (ret) 2931 err = ret; 2932 else 2933 last_offset = tmp.offset; 2934 goto done; 2935 } 2936 } 2937 done: 2938 btrfs_release_path(path); 2939 btrfs_release_path(dst_path); 2940 2941 if (err == 0) { 2942 *last_offset_ret = last_offset; 2943 /* 2944 * insert the log range keys to indicate where the log 2945 * is valid 2946 */ 2947 ret = insert_dir_log_key(trans, log, path, key_type, 2948 ino, first_offset, last_offset); 2949 if (ret) 2950 err = ret; 2951 } 2952 return err; 2953 } 2954 2955 /* 2956 * logging directories is very similar to logging inodes, We find all the items 2957 * from the current transaction and write them to the log. 2958 * 2959 * The recovery code scans the directory in the subvolume, and if it finds a 2960 * key in the range logged that is not present in the log tree, then it means 2961 * that dir entry was unlinked during the transaction. 2962 * 2963 * In order for that scan to work, we must include one key smaller than 2964 * the smallest logged by this transaction and one key larger than the largest 2965 * key logged by this transaction. 2966 */ 2967 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 2968 struct btrfs_root *root, struct inode *inode, 2969 struct btrfs_path *path, 2970 struct btrfs_path *dst_path) 2971 { 2972 u64 min_key; 2973 u64 max_key; 2974 int ret; 2975 int key_type = BTRFS_DIR_ITEM_KEY; 2976 2977 again: 2978 min_key = 0; 2979 max_key = 0; 2980 while (1) { 2981 ret = log_dir_items(trans, root, inode, path, 2982 dst_path, key_type, min_key, 2983 &max_key); 2984 if (ret) 2985 return ret; 2986 if (max_key == (u64)-1) 2987 break; 2988 min_key = max_key + 1; 2989 } 2990 2991 if (key_type == BTRFS_DIR_ITEM_KEY) { 2992 key_type = BTRFS_DIR_INDEX_KEY; 2993 goto again; 2994 } 2995 return 0; 2996 } 2997 2998 /* 2999 * a helper function to drop items from the log before we relog an 3000 * inode. max_key_type indicates the highest item type to remove. 3001 * This cannot be run for file data extents because it does not 3002 * free the extents they point to. 3003 */ 3004 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3005 struct btrfs_root *log, 3006 struct btrfs_path *path, 3007 u64 objectid, int max_key_type) 3008 { 3009 int ret; 3010 struct btrfs_key key; 3011 struct btrfs_key found_key; 3012 int start_slot; 3013 3014 key.objectid = objectid; 3015 key.type = max_key_type; 3016 key.offset = (u64)-1; 3017 3018 while (1) { 3019 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3020 BUG_ON(ret == 0); /* Logic error */ 3021 if (ret < 0) 3022 break; 3023 3024 if (path->slots[0] == 0) 3025 break; 3026 3027 path->slots[0]--; 3028 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3029 path->slots[0]); 3030 3031 if (found_key.objectid != objectid) 3032 break; 3033 3034 found_key.offset = 0; 3035 found_key.type = 0; 3036 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3037 &start_slot); 3038 3039 ret = btrfs_del_items(trans, log, path, start_slot, 3040 path->slots[0] - start_slot + 1); 3041 /* 3042 * If start slot isn't 0 then we don't need to re-search, we've 3043 * found the last guy with the objectid in this tree. 3044 */ 3045 if (ret || start_slot != 0) 3046 break; 3047 btrfs_release_path(path); 3048 } 3049 btrfs_release_path(path); 3050 if (ret > 0) 3051 ret = 0; 3052 return ret; 3053 } 3054 3055 static void fill_inode_item(struct btrfs_trans_handle *trans, 3056 struct extent_buffer *leaf, 3057 struct btrfs_inode_item *item, 3058 struct inode *inode, int log_inode_only) 3059 { 3060 struct btrfs_map_token token; 3061 3062 btrfs_init_map_token(&token); 3063 3064 if (log_inode_only) { 3065 /* set the generation to zero so the recover code 3066 * can tell the difference between an logging 3067 * just to say 'this inode exists' and a logging 3068 * to say 'update this inode with these values' 3069 */ 3070 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3071 btrfs_set_token_inode_size(leaf, item, 0, &token); 3072 } else { 3073 btrfs_set_token_inode_generation(leaf, item, 3074 BTRFS_I(inode)->generation, 3075 &token); 3076 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3077 } 3078 3079 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3080 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3081 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3082 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3083 3084 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3085 inode->i_atime.tv_sec, &token); 3086 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3087 inode->i_atime.tv_nsec, &token); 3088 3089 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3090 inode->i_mtime.tv_sec, &token); 3091 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3092 inode->i_mtime.tv_nsec, &token); 3093 3094 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3095 inode->i_ctime.tv_sec, &token); 3096 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3097 inode->i_ctime.tv_nsec, &token); 3098 3099 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3100 &token); 3101 3102 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3103 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3104 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3105 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3106 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3107 } 3108 3109 static int log_inode_item(struct btrfs_trans_handle *trans, 3110 struct btrfs_root *log, struct btrfs_path *path, 3111 struct inode *inode) 3112 { 3113 struct btrfs_inode_item *inode_item; 3114 struct btrfs_key key; 3115 int ret; 3116 3117 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key)); 3118 ret = btrfs_insert_empty_item(trans, log, path, &key, 3119 sizeof(*inode_item)); 3120 if (ret && ret != -EEXIST) 3121 return ret; 3122 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3123 struct btrfs_inode_item); 3124 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3125 btrfs_release_path(path); 3126 return 0; 3127 } 3128 3129 static noinline int copy_items(struct btrfs_trans_handle *trans, 3130 struct inode *inode, 3131 struct btrfs_path *dst_path, 3132 struct extent_buffer *src, 3133 int start_slot, int nr, int inode_only) 3134 { 3135 unsigned long src_offset; 3136 unsigned long dst_offset; 3137 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3138 struct btrfs_file_extent_item *extent; 3139 struct btrfs_inode_item *inode_item; 3140 int ret; 3141 struct btrfs_key *ins_keys; 3142 u32 *ins_sizes; 3143 char *ins_data; 3144 int i; 3145 struct list_head ordered_sums; 3146 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3147 3148 INIT_LIST_HEAD(&ordered_sums); 3149 3150 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3151 nr * sizeof(u32), GFP_NOFS); 3152 if (!ins_data) 3153 return -ENOMEM; 3154 3155 ins_sizes = (u32 *)ins_data; 3156 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3157 3158 for (i = 0; i < nr; i++) { 3159 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3160 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3161 } 3162 ret = btrfs_insert_empty_items(trans, log, dst_path, 3163 ins_keys, ins_sizes, nr); 3164 if (ret) { 3165 kfree(ins_data); 3166 return ret; 3167 } 3168 3169 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3170 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3171 dst_path->slots[0]); 3172 3173 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3174 3175 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3176 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3177 dst_path->slots[0], 3178 struct btrfs_inode_item); 3179 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3180 inode, inode_only == LOG_INODE_EXISTS); 3181 } else { 3182 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3183 src_offset, ins_sizes[i]); 3184 } 3185 3186 /* take a reference on file data extents so that truncates 3187 * or deletes of this inode don't have to relog the inode 3188 * again 3189 */ 3190 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && 3191 !skip_csum) { 3192 int found_type; 3193 extent = btrfs_item_ptr(src, start_slot + i, 3194 struct btrfs_file_extent_item); 3195 3196 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3197 continue; 3198 3199 found_type = btrfs_file_extent_type(src, extent); 3200 if (found_type == BTRFS_FILE_EXTENT_REG) { 3201 u64 ds, dl, cs, cl; 3202 ds = btrfs_file_extent_disk_bytenr(src, 3203 extent); 3204 /* ds == 0 is a hole */ 3205 if (ds == 0) 3206 continue; 3207 3208 dl = btrfs_file_extent_disk_num_bytes(src, 3209 extent); 3210 cs = btrfs_file_extent_offset(src, extent); 3211 cl = btrfs_file_extent_num_bytes(src, 3212 extent); 3213 if (btrfs_file_extent_compression(src, 3214 extent)) { 3215 cs = 0; 3216 cl = dl; 3217 } 3218 3219 ret = btrfs_lookup_csums_range( 3220 log->fs_info->csum_root, 3221 ds + cs, ds + cs + cl - 1, 3222 &ordered_sums, 0); 3223 if (ret) { 3224 btrfs_release_path(dst_path); 3225 kfree(ins_data); 3226 return ret; 3227 } 3228 } 3229 } 3230 } 3231 3232 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3233 btrfs_release_path(dst_path); 3234 kfree(ins_data); 3235 3236 /* 3237 * we have to do this after the loop above to avoid changing the 3238 * log tree while trying to change the log tree. 3239 */ 3240 ret = 0; 3241 while (!list_empty(&ordered_sums)) { 3242 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3243 struct btrfs_ordered_sum, 3244 list); 3245 if (!ret) 3246 ret = btrfs_csum_file_blocks(trans, log, sums); 3247 list_del(&sums->list); 3248 kfree(sums); 3249 } 3250 return ret; 3251 } 3252 3253 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3254 { 3255 struct extent_map *em1, *em2; 3256 3257 em1 = list_entry(a, struct extent_map, list); 3258 em2 = list_entry(b, struct extent_map, list); 3259 3260 if (em1->start < em2->start) 3261 return -1; 3262 else if (em1->start > em2->start) 3263 return 1; 3264 return 0; 3265 } 3266 3267 static int log_one_extent(struct btrfs_trans_handle *trans, 3268 struct inode *inode, struct btrfs_root *root, 3269 struct extent_map *em, struct btrfs_path *path) 3270 { 3271 struct btrfs_root *log = root->log_root; 3272 struct btrfs_file_extent_item *fi; 3273 struct extent_buffer *leaf; 3274 struct btrfs_ordered_extent *ordered; 3275 struct list_head ordered_sums; 3276 struct btrfs_map_token token; 3277 struct btrfs_key key; 3278 u64 mod_start = em->mod_start; 3279 u64 mod_len = em->mod_len; 3280 u64 csum_offset; 3281 u64 csum_len; 3282 u64 extent_offset = em->start - em->orig_start; 3283 u64 block_len; 3284 int ret; 3285 int index = log->log_transid % 2; 3286 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3287 3288 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 3289 em->start + em->len, NULL, 0); 3290 if (ret) 3291 return ret; 3292 3293 INIT_LIST_HEAD(&ordered_sums); 3294 btrfs_init_map_token(&token); 3295 key.objectid = btrfs_ino(inode); 3296 key.type = BTRFS_EXTENT_DATA_KEY; 3297 key.offset = em->start; 3298 3299 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3300 if (ret) 3301 return ret; 3302 leaf = path->nodes[0]; 3303 fi = btrfs_item_ptr(leaf, path->slots[0], 3304 struct btrfs_file_extent_item); 3305 3306 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3307 &token); 3308 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3309 skip_csum = true; 3310 btrfs_set_token_file_extent_type(leaf, fi, 3311 BTRFS_FILE_EXTENT_PREALLOC, 3312 &token); 3313 } else { 3314 btrfs_set_token_file_extent_type(leaf, fi, 3315 BTRFS_FILE_EXTENT_REG, 3316 &token); 3317 if (em->block_start == 0) 3318 skip_csum = true; 3319 } 3320 3321 block_len = max(em->block_len, em->orig_block_len); 3322 if (em->compress_type != BTRFS_COMPRESS_NONE) { 3323 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3324 em->block_start, 3325 &token); 3326 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3327 &token); 3328 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3329 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3330 em->block_start - 3331 extent_offset, &token); 3332 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3333 &token); 3334 } else { 3335 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3336 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3337 &token); 3338 } 3339 3340 btrfs_set_token_file_extent_offset(leaf, fi, 3341 em->start - em->orig_start, 3342 &token); 3343 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3344 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 3345 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3346 &token); 3347 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3348 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3349 btrfs_mark_buffer_dirty(leaf); 3350 3351 btrfs_release_path(path); 3352 if (ret) { 3353 return ret; 3354 } 3355 3356 if (skip_csum) 3357 return 0; 3358 3359 if (em->compress_type) { 3360 csum_offset = 0; 3361 csum_len = block_len; 3362 } 3363 3364 /* 3365 * First check and see if our csums are on our outstanding ordered 3366 * extents. 3367 */ 3368 again: 3369 spin_lock_irq(&log->log_extents_lock[index]); 3370 list_for_each_entry(ordered, &log->logged_list[index], log_list) { 3371 struct btrfs_ordered_sum *sum; 3372 3373 if (!mod_len) 3374 break; 3375 3376 if (ordered->inode != inode) 3377 continue; 3378 3379 if (ordered->file_offset + ordered->len <= mod_start || 3380 mod_start + mod_len <= ordered->file_offset) 3381 continue; 3382 3383 /* 3384 * We are going to copy all the csums on this ordered extent, so 3385 * go ahead and adjust mod_start and mod_len in case this 3386 * ordered extent has already been logged. 3387 */ 3388 if (ordered->file_offset > mod_start) { 3389 if (ordered->file_offset + ordered->len >= 3390 mod_start + mod_len) 3391 mod_len = ordered->file_offset - mod_start; 3392 /* 3393 * If we have this case 3394 * 3395 * |--------- logged extent ---------| 3396 * |----- ordered extent ----| 3397 * 3398 * Just don't mess with mod_start and mod_len, we'll 3399 * just end up logging more csums than we need and it 3400 * will be ok. 3401 */ 3402 } else { 3403 if (ordered->file_offset + ordered->len < 3404 mod_start + mod_len) { 3405 mod_len = (mod_start + mod_len) - 3406 (ordered->file_offset + ordered->len); 3407 mod_start = ordered->file_offset + 3408 ordered->len; 3409 } else { 3410 mod_len = 0; 3411 } 3412 } 3413 3414 /* 3415 * To keep us from looping for the above case of an ordered 3416 * extent that falls inside of the logged extent. 3417 */ 3418 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3419 &ordered->flags)) 3420 continue; 3421 atomic_inc(&ordered->refs); 3422 spin_unlock_irq(&log->log_extents_lock[index]); 3423 /* 3424 * we've dropped the lock, we must either break or 3425 * start over after this. 3426 */ 3427 3428 wait_event(ordered->wait, ordered->csum_bytes_left == 0); 3429 3430 list_for_each_entry(sum, &ordered->list, list) { 3431 ret = btrfs_csum_file_blocks(trans, log, sum); 3432 if (ret) { 3433 btrfs_put_ordered_extent(ordered); 3434 goto unlocked; 3435 } 3436 } 3437 btrfs_put_ordered_extent(ordered); 3438 goto again; 3439 3440 } 3441 spin_unlock_irq(&log->log_extents_lock[index]); 3442 unlocked: 3443 3444 if (!mod_len || ret) 3445 return ret; 3446 3447 csum_offset = mod_start - em->start; 3448 csum_len = mod_len; 3449 3450 /* block start is already adjusted for the file extent offset. */ 3451 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3452 em->block_start + csum_offset, 3453 em->block_start + csum_offset + 3454 csum_len - 1, &ordered_sums, 0); 3455 if (ret) 3456 return ret; 3457 3458 while (!list_empty(&ordered_sums)) { 3459 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3460 struct btrfs_ordered_sum, 3461 list); 3462 if (!ret) 3463 ret = btrfs_csum_file_blocks(trans, log, sums); 3464 list_del(&sums->list); 3465 kfree(sums); 3466 } 3467 3468 return ret; 3469 } 3470 3471 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3472 struct btrfs_root *root, 3473 struct inode *inode, 3474 struct btrfs_path *path) 3475 { 3476 struct extent_map *em, *n; 3477 struct list_head extents; 3478 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3479 u64 test_gen; 3480 int ret = 0; 3481 int num = 0; 3482 3483 INIT_LIST_HEAD(&extents); 3484 3485 write_lock(&tree->lock); 3486 test_gen = root->fs_info->last_trans_committed; 3487 3488 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3489 list_del_init(&em->list); 3490 3491 /* 3492 * Just an arbitrary number, this can be really CPU intensive 3493 * once we start getting a lot of extents, and really once we 3494 * have a bunch of extents we just want to commit since it will 3495 * be faster. 3496 */ 3497 if (++num > 32768) { 3498 list_del_init(&tree->modified_extents); 3499 ret = -EFBIG; 3500 goto process; 3501 } 3502 3503 if (em->generation <= test_gen) 3504 continue; 3505 /* Need a ref to keep it from getting evicted from cache */ 3506 atomic_inc(&em->refs); 3507 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3508 list_add_tail(&em->list, &extents); 3509 num++; 3510 } 3511 3512 list_sort(NULL, &extents, extent_cmp); 3513 3514 process: 3515 while (!list_empty(&extents)) { 3516 em = list_entry(extents.next, struct extent_map, list); 3517 3518 list_del_init(&em->list); 3519 3520 /* 3521 * If we had an error we just need to delete everybody from our 3522 * private list. 3523 */ 3524 if (ret) { 3525 clear_em_logging(tree, em); 3526 free_extent_map(em); 3527 continue; 3528 } 3529 3530 write_unlock(&tree->lock); 3531 3532 ret = log_one_extent(trans, inode, root, em, path); 3533 write_lock(&tree->lock); 3534 clear_em_logging(tree, em); 3535 free_extent_map(em); 3536 } 3537 WARN_ON(!list_empty(&extents)); 3538 write_unlock(&tree->lock); 3539 3540 btrfs_release_path(path); 3541 return ret; 3542 } 3543 3544 /* log a single inode in the tree log. 3545 * At least one parent directory for this inode must exist in the tree 3546 * or be logged already. 3547 * 3548 * Any items from this inode changed by the current transaction are copied 3549 * to the log tree. An extra reference is taken on any extents in this 3550 * file, allowing us to avoid a whole pile of corner cases around logging 3551 * blocks that have been removed from the tree. 3552 * 3553 * See LOG_INODE_ALL and related defines for a description of what inode_only 3554 * does. 3555 * 3556 * This handles both files and directories. 3557 */ 3558 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3559 struct btrfs_root *root, struct inode *inode, 3560 int inode_only) 3561 { 3562 struct btrfs_path *path; 3563 struct btrfs_path *dst_path; 3564 struct btrfs_key min_key; 3565 struct btrfs_key max_key; 3566 struct btrfs_root *log = root->log_root; 3567 struct extent_buffer *src = NULL; 3568 int err = 0; 3569 int ret; 3570 int nritems; 3571 int ins_start_slot = 0; 3572 int ins_nr; 3573 bool fast_search = false; 3574 u64 ino = btrfs_ino(inode); 3575 3576 path = btrfs_alloc_path(); 3577 if (!path) 3578 return -ENOMEM; 3579 dst_path = btrfs_alloc_path(); 3580 if (!dst_path) { 3581 btrfs_free_path(path); 3582 return -ENOMEM; 3583 } 3584 3585 min_key.objectid = ino; 3586 min_key.type = BTRFS_INODE_ITEM_KEY; 3587 min_key.offset = 0; 3588 3589 max_key.objectid = ino; 3590 3591 3592 /* today the code can only do partial logging of directories */ 3593 if (S_ISDIR(inode->i_mode) || 3594 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3595 &BTRFS_I(inode)->runtime_flags) && 3596 inode_only == LOG_INODE_EXISTS)) 3597 max_key.type = BTRFS_XATTR_ITEM_KEY; 3598 else 3599 max_key.type = (u8)-1; 3600 max_key.offset = (u64)-1; 3601 3602 /* Only run delayed items if we are a dir or a new file */ 3603 if (S_ISDIR(inode->i_mode) || 3604 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { 3605 ret = btrfs_commit_inode_delayed_items(trans, inode); 3606 if (ret) { 3607 btrfs_free_path(path); 3608 btrfs_free_path(dst_path); 3609 return ret; 3610 } 3611 } 3612 3613 mutex_lock(&BTRFS_I(inode)->log_mutex); 3614 3615 btrfs_get_logged_extents(log, inode); 3616 3617 /* 3618 * a brute force approach to making sure we get the most uptodate 3619 * copies of everything. 3620 */ 3621 if (S_ISDIR(inode->i_mode)) { 3622 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 3623 3624 if (inode_only == LOG_INODE_EXISTS) 3625 max_key_type = BTRFS_XATTR_ITEM_KEY; 3626 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3627 } else { 3628 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3629 &BTRFS_I(inode)->runtime_flags)) { 3630 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3631 &BTRFS_I(inode)->runtime_flags); 3632 ret = btrfs_truncate_inode_items(trans, log, 3633 inode, 0, 0); 3634 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3635 &BTRFS_I(inode)->runtime_flags)) { 3636 if (inode_only == LOG_INODE_ALL) 3637 fast_search = true; 3638 max_key.type = BTRFS_XATTR_ITEM_KEY; 3639 ret = drop_objectid_items(trans, log, path, ino, 3640 max_key.type); 3641 } else { 3642 if (inode_only == LOG_INODE_ALL) 3643 fast_search = true; 3644 ret = log_inode_item(trans, log, dst_path, inode); 3645 if (ret) { 3646 err = ret; 3647 goto out_unlock; 3648 } 3649 goto log_extents; 3650 } 3651 3652 } 3653 if (ret) { 3654 err = ret; 3655 goto out_unlock; 3656 } 3657 path->keep_locks = 1; 3658 3659 while (1) { 3660 ins_nr = 0; 3661 ret = btrfs_search_forward(root, &min_key, &max_key, 3662 path, trans->transid); 3663 if (ret != 0) 3664 break; 3665 again: 3666 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 3667 if (min_key.objectid != ino) 3668 break; 3669 if (min_key.type > max_key.type) 3670 break; 3671 3672 src = path->nodes[0]; 3673 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 3674 ins_nr++; 3675 goto next_slot; 3676 } else if (!ins_nr) { 3677 ins_start_slot = path->slots[0]; 3678 ins_nr = 1; 3679 goto next_slot; 3680 } 3681 3682 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3683 ins_nr, inode_only); 3684 if (ret) { 3685 err = ret; 3686 goto out_unlock; 3687 } 3688 ins_nr = 1; 3689 ins_start_slot = path->slots[0]; 3690 next_slot: 3691 3692 nritems = btrfs_header_nritems(path->nodes[0]); 3693 path->slots[0]++; 3694 if (path->slots[0] < nritems) { 3695 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 3696 path->slots[0]); 3697 goto again; 3698 } 3699 if (ins_nr) { 3700 ret = copy_items(trans, inode, dst_path, src, 3701 ins_start_slot, 3702 ins_nr, inode_only); 3703 if (ret) { 3704 err = ret; 3705 goto out_unlock; 3706 } 3707 ins_nr = 0; 3708 } 3709 btrfs_release_path(path); 3710 3711 if (min_key.offset < (u64)-1) 3712 min_key.offset++; 3713 else if (min_key.type < (u8)-1) 3714 min_key.type++; 3715 else if (min_key.objectid < (u64)-1) 3716 min_key.objectid++; 3717 else 3718 break; 3719 } 3720 if (ins_nr) { 3721 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3722 ins_nr, inode_only); 3723 if (ret) { 3724 err = ret; 3725 goto out_unlock; 3726 } 3727 ins_nr = 0; 3728 } 3729 3730 log_extents: 3731 if (fast_search) { 3732 btrfs_release_path(dst_path); 3733 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 3734 if (ret) { 3735 err = ret; 3736 goto out_unlock; 3737 } 3738 } else { 3739 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3740 struct extent_map *em, *n; 3741 3742 write_lock(&tree->lock); 3743 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3744 list_del_init(&em->list); 3745 write_unlock(&tree->lock); 3746 } 3747 3748 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3749 btrfs_release_path(path); 3750 btrfs_release_path(dst_path); 3751 ret = log_directory_changes(trans, root, inode, path, dst_path); 3752 if (ret) { 3753 err = ret; 3754 goto out_unlock; 3755 } 3756 } 3757 BTRFS_I(inode)->logged_trans = trans->transid; 3758 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 3759 out_unlock: 3760 if (err) 3761 btrfs_free_logged_extents(log, log->log_transid); 3762 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3763 3764 btrfs_free_path(path); 3765 btrfs_free_path(dst_path); 3766 return err; 3767 } 3768 3769 /* 3770 * follow the dentry parent pointers up the chain and see if any 3771 * of the directories in it require a full commit before they can 3772 * be logged. Returns zero if nothing special needs to be done or 1 if 3773 * a full commit is required. 3774 */ 3775 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 3776 struct inode *inode, 3777 struct dentry *parent, 3778 struct super_block *sb, 3779 u64 last_committed) 3780 { 3781 int ret = 0; 3782 struct btrfs_root *root; 3783 struct dentry *old_parent = NULL; 3784 3785 /* 3786 * for regular files, if its inode is already on disk, we don't 3787 * have to worry about the parents at all. This is because 3788 * we can use the last_unlink_trans field to record renames 3789 * and other fun in this file. 3790 */ 3791 if (S_ISREG(inode->i_mode) && 3792 BTRFS_I(inode)->generation <= last_committed && 3793 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3794 goto out; 3795 3796 if (!S_ISDIR(inode->i_mode)) { 3797 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3798 goto out; 3799 inode = parent->d_inode; 3800 } 3801 3802 while (1) { 3803 BTRFS_I(inode)->logged_trans = trans->transid; 3804 smp_mb(); 3805 3806 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 3807 root = BTRFS_I(inode)->root; 3808 3809 /* 3810 * make sure any commits to the log are forced 3811 * to be full commits 3812 */ 3813 root->fs_info->last_trans_log_full_commit = 3814 trans->transid; 3815 ret = 1; 3816 break; 3817 } 3818 3819 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3820 break; 3821 3822 if (IS_ROOT(parent)) 3823 break; 3824 3825 parent = dget_parent(parent); 3826 dput(old_parent); 3827 old_parent = parent; 3828 inode = parent->d_inode; 3829 3830 } 3831 dput(old_parent); 3832 out: 3833 return ret; 3834 } 3835 3836 /* 3837 * helper function around btrfs_log_inode to make sure newly created 3838 * parent directories also end up in the log. A minimal inode and backref 3839 * only logging is done of any parent directories that are older than 3840 * the last committed transaction 3841 */ 3842 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 3843 struct btrfs_root *root, struct inode *inode, 3844 struct dentry *parent, int exists_only) 3845 { 3846 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 3847 struct super_block *sb; 3848 struct dentry *old_parent = NULL; 3849 int ret = 0; 3850 u64 last_committed = root->fs_info->last_trans_committed; 3851 3852 sb = inode->i_sb; 3853 3854 if (btrfs_test_opt(root, NOTREELOG)) { 3855 ret = 1; 3856 goto end_no_trans; 3857 } 3858 3859 if (root->fs_info->last_trans_log_full_commit > 3860 root->fs_info->last_trans_committed) { 3861 ret = 1; 3862 goto end_no_trans; 3863 } 3864 3865 if (root != BTRFS_I(inode)->root || 3866 btrfs_root_refs(&root->root_item) == 0) { 3867 ret = 1; 3868 goto end_no_trans; 3869 } 3870 3871 ret = check_parent_dirs_for_sync(trans, inode, parent, 3872 sb, last_committed); 3873 if (ret) 3874 goto end_no_trans; 3875 3876 if (btrfs_inode_in_log(inode, trans->transid)) { 3877 ret = BTRFS_NO_LOG_SYNC; 3878 goto end_no_trans; 3879 } 3880 3881 ret = start_log_trans(trans, root); 3882 if (ret) 3883 goto end_trans; 3884 3885 ret = btrfs_log_inode(trans, root, inode, inode_only); 3886 if (ret) 3887 goto end_trans; 3888 3889 /* 3890 * for regular files, if its inode is already on disk, we don't 3891 * have to worry about the parents at all. This is because 3892 * we can use the last_unlink_trans field to record renames 3893 * and other fun in this file. 3894 */ 3895 if (S_ISREG(inode->i_mode) && 3896 BTRFS_I(inode)->generation <= last_committed && 3897 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 3898 ret = 0; 3899 goto end_trans; 3900 } 3901 3902 inode_only = LOG_INODE_EXISTS; 3903 while (1) { 3904 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3905 break; 3906 3907 inode = parent->d_inode; 3908 if (root != BTRFS_I(inode)->root) 3909 break; 3910 3911 if (BTRFS_I(inode)->generation > 3912 root->fs_info->last_trans_committed) { 3913 ret = btrfs_log_inode(trans, root, inode, inode_only); 3914 if (ret) 3915 goto end_trans; 3916 } 3917 if (IS_ROOT(parent)) 3918 break; 3919 3920 parent = dget_parent(parent); 3921 dput(old_parent); 3922 old_parent = parent; 3923 } 3924 ret = 0; 3925 end_trans: 3926 dput(old_parent); 3927 if (ret < 0) { 3928 root->fs_info->last_trans_log_full_commit = trans->transid; 3929 ret = 1; 3930 } 3931 btrfs_end_log_trans(root); 3932 end_no_trans: 3933 return ret; 3934 } 3935 3936 /* 3937 * it is not safe to log dentry if the chunk root has added new 3938 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 3939 * If this returns 1, you must commit the transaction to safely get your 3940 * data on disk. 3941 */ 3942 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3943 struct btrfs_root *root, struct dentry *dentry) 3944 { 3945 struct dentry *parent = dget_parent(dentry); 3946 int ret; 3947 3948 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 3949 dput(parent); 3950 3951 return ret; 3952 } 3953 3954 /* 3955 * should be called during mount to recover any replay any log trees 3956 * from the FS 3957 */ 3958 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 3959 { 3960 int ret; 3961 struct btrfs_path *path; 3962 struct btrfs_trans_handle *trans; 3963 struct btrfs_key key; 3964 struct btrfs_key found_key; 3965 struct btrfs_key tmp_key; 3966 struct btrfs_root *log; 3967 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 3968 struct walk_control wc = { 3969 .process_func = process_one_buffer, 3970 .stage = 0, 3971 }; 3972 3973 path = btrfs_alloc_path(); 3974 if (!path) 3975 return -ENOMEM; 3976 3977 fs_info->log_root_recovering = 1; 3978 3979 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3980 if (IS_ERR(trans)) { 3981 ret = PTR_ERR(trans); 3982 goto error; 3983 } 3984 3985 wc.trans = trans; 3986 wc.pin = 1; 3987 3988 ret = walk_log_tree(trans, log_root_tree, &wc); 3989 if (ret) { 3990 btrfs_error(fs_info, ret, "Failed to pin buffers while " 3991 "recovering log root tree."); 3992 goto error; 3993 } 3994 3995 again: 3996 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3997 key.offset = (u64)-1; 3998 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 3999 4000 while (1) { 4001 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 4002 4003 if (ret < 0) { 4004 btrfs_error(fs_info, ret, 4005 "Couldn't find tree log root."); 4006 goto error; 4007 } 4008 if (ret > 0) { 4009 if (path->slots[0] == 0) 4010 break; 4011 path->slots[0]--; 4012 } 4013 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4014 path->slots[0]); 4015 btrfs_release_path(path); 4016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4017 break; 4018 4019 log = btrfs_read_fs_root_no_radix(log_root_tree, 4020 &found_key); 4021 if (IS_ERR(log)) { 4022 ret = PTR_ERR(log); 4023 btrfs_error(fs_info, ret, 4024 "Couldn't read tree log root."); 4025 goto error; 4026 } 4027 4028 tmp_key.objectid = found_key.offset; 4029 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 4030 tmp_key.offset = (u64)-1; 4031 4032 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 4033 if (IS_ERR(wc.replay_dest)) { 4034 ret = PTR_ERR(wc.replay_dest); 4035 free_extent_buffer(log->node); 4036 free_extent_buffer(log->commit_root); 4037 kfree(log); 4038 btrfs_error(fs_info, ret, "Couldn't read target root " 4039 "for tree log recovery."); 4040 goto error; 4041 } 4042 4043 wc.replay_dest->log_root = log; 4044 btrfs_record_root_in_trans(trans, wc.replay_dest); 4045 ret = walk_log_tree(trans, log, &wc); 4046 4047 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 4048 ret = fixup_inode_link_counts(trans, wc.replay_dest, 4049 path); 4050 } 4051 4052 key.offset = found_key.offset - 1; 4053 wc.replay_dest->log_root = NULL; 4054 free_extent_buffer(log->node); 4055 free_extent_buffer(log->commit_root); 4056 kfree(log); 4057 4058 if (ret) 4059 goto error; 4060 4061 if (found_key.offset == 0) 4062 break; 4063 } 4064 btrfs_release_path(path); 4065 4066 /* step one is to pin it all, step two is to replay just inodes */ 4067 if (wc.pin) { 4068 wc.pin = 0; 4069 wc.process_func = replay_one_buffer; 4070 wc.stage = LOG_WALK_REPLAY_INODES; 4071 goto again; 4072 } 4073 /* step three is to replay everything */ 4074 if (wc.stage < LOG_WALK_REPLAY_ALL) { 4075 wc.stage++; 4076 goto again; 4077 } 4078 4079 btrfs_free_path(path); 4080 4081 /* step 4: commit the transaction, which also unpins the blocks */ 4082 ret = btrfs_commit_transaction(trans, fs_info->tree_root); 4083 if (ret) 4084 return ret; 4085 4086 free_extent_buffer(log_root_tree->node); 4087 log_root_tree->log_root = NULL; 4088 fs_info->log_root_recovering = 0; 4089 kfree(log_root_tree); 4090 4091 return 0; 4092 error: 4093 if (wc.trans) 4094 btrfs_end_transaction(wc.trans, fs_info->tree_root); 4095 btrfs_free_path(path); 4096 return ret; 4097 } 4098 4099 /* 4100 * there are some corner cases where we want to force a full 4101 * commit instead of allowing a directory to be logged. 4102 * 4103 * They revolve around files there were unlinked from the directory, and 4104 * this function updates the parent directory so that a full commit is 4105 * properly done if it is fsync'd later after the unlinks are done. 4106 */ 4107 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 4108 struct inode *dir, struct inode *inode, 4109 int for_rename) 4110 { 4111 /* 4112 * when we're logging a file, if it hasn't been renamed 4113 * or unlinked, and its inode is fully committed on disk, 4114 * we don't have to worry about walking up the directory chain 4115 * to log its parents. 4116 * 4117 * So, we use the last_unlink_trans field to put this transid 4118 * into the file. When the file is logged we check it and 4119 * don't log the parents if the file is fully on disk. 4120 */ 4121 if (S_ISREG(inode->i_mode)) 4122 BTRFS_I(inode)->last_unlink_trans = trans->transid; 4123 4124 /* 4125 * if this directory was already logged any new 4126 * names for this file/dir will get recorded 4127 */ 4128 smp_mb(); 4129 if (BTRFS_I(dir)->logged_trans == trans->transid) 4130 return; 4131 4132 /* 4133 * if the inode we're about to unlink was logged, 4134 * the log will be properly updated for any new names 4135 */ 4136 if (BTRFS_I(inode)->logged_trans == trans->transid) 4137 return; 4138 4139 /* 4140 * when renaming files across directories, if the directory 4141 * there we're unlinking from gets fsync'd later on, there's 4142 * no way to find the destination directory later and fsync it 4143 * properly. So, we have to be conservative and force commits 4144 * so the new name gets discovered. 4145 */ 4146 if (for_rename) 4147 goto record; 4148 4149 /* we can safely do the unlink without any special recording */ 4150 return; 4151 4152 record: 4153 BTRFS_I(dir)->last_unlink_trans = trans->transid; 4154 } 4155 4156 /* 4157 * Call this after adding a new name for a file and it will properly 4158 * update the log to reflect the new name. 4159 * 4160 * It will return zero if all goes well, and it will return 1 if a 4161 * full transaction commit is required. 4162 */ 4163 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 4164 struct inode *inode, struct inode *old_dir, 4165 struct dentry *parent) 4166 { 4167 struct btrfs_root * root = BTRFS_I(inode)->root; 4168 4169 /* 4170 * this will force the logging code to walk the dentry chain 4171 * up for the file 4172 */ 4173 if (S_ISREG(inode->i_mode)) 4174 BTRFS_I(inode)->last_unlink_trans = trans->transid; 4175 4176 /* 4177 * if this inode hasn't been logged and directory we're renaming it 4178 * from hasn't been logged, we don't need to log it 4179 */ 4180 if (BTRFS_I(inode)->logged_trans <= 4181 root->fs_info->last_trans_committed && 4182 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 4183 root->fs_info->last_trans_committed)) 4184 return 0; 4185 4186 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 4187 } 4188 4189