/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compat.h"
#include "tree-log.h"
#include "hash.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
#define LOG_INODE_ALL 0
#define LOG_INODE_EXISTS 1

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */
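
/*
 * Case 1 above is handled by remembering, per directory, when the last
 * unlink or rename happened.  Illustrative sketch only; the real helper,
 * btrfs_record_unlink_dir(), appears later in this file:
 *
 *	mutex_lock(&BTRFS_I(dir)->log_mutex);
 *	BTRFS_I(dir)->last_unlink_trans = trans->transid;
 *	mutex_unlock(&BTRFS_I(dir)->log_mutex);
 *
 * A later fsync in that directory compares the recorded transid with the
 * last committed transaction and forces a full commit when the log alone
 * cannot be trusted.
 */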

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find,
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume,
 * and the third stage (2) replays directory index items.
 *
 * The last stage (3) is to deal with directories and links and extents
 * and all the other fun semantics
 */
#define LOG_WALK_PIN_ONLY	0
#define LOG_WALK_REPLAY_INODES	1
#define LOG_WALK_REPLAY_DIR_INDEX	2
#define LOG_WALK_REPLAY_ALL	3

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree, and once
 * to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root)
{
	int ret;
	int err = 0;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		if (!root->log_start_pid) {
			root->log_start_pid = current->pid;
			root->log_multiple_pids = false;
		} else if (root->log_start_pid != current->pid) {
			root->log_multiple_pids = true;
		}

		atomic_inc(&root->log_batch);
		atomic_inc(&root->log_writers);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	root->log_multiple_pids = false;
	root->log_start_pid = current->pid;
	mutex_lock(&root->fs_info->tree_log_mutex);
	if (!root->fs_info->log_root_tree) {
		ret = btrfs_init_log_root_tree(trans, root->fs_info);
		if (ret)
			err = ret;
	}
	if (err == 0 && !root->log_root) {
		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			err = ret;
	}
	mutex_unlock(&root->fs_info->tree_log_mutex);
	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return err;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there are no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	smp_mb();
	if (!root->log_root)
		return -ENOENT;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
int btrfs_pin_log_trans(struct btrfs_root *root)
{
	mutex_lock(&root->log_mutex);
	atomic_inc(&root->log_writers);
	mutex_unlock(&root->log_mutex);
	return 0;
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		smp_mb();
		if (waitqueue_active(&root->log_writer_wait))
			wake_up(&root->log_writer_wait);
	}
}
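
/*
 * Illustrative pairing only, modeled on the fsync path later in this
 * file (btrfs_log_inode_parent()): every successful start_log_trans()
 * must be matched by a btrfs_end_log_trans() so that anyone syncing the
 * log can stop waiting on us:
 *
 *	ret = start_log_trans(trans, root);
 *	if (ret)
 *		return ret;
 *	ret = btrfs_log_inode(trans, root, inode, inode_only);
 *	btrfs_end_log_trans(root);
 */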

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen)
{
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
						      eb->start, eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(log, eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}
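
/*
 * Illustrative initializers only, mirroring how later code in this file
 * drives the walk.  Freeing a log tree at commit time uses roughly:
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *
 * while log recovery begins with a pin-only pass:
 *
 *	struct walk_control wc = {
 *		.pin = 1,
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY
 *	};
 */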

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(root, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(root, path,
					  item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0)
			goto no_copy;

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
	if (IS_ERR(inode)) {
		inode = NULL;
	} else if (is_bad_inode(inode)) {
		iput(inode);
		inode = NULL;
	}
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_inline_len(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size, root->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
				       start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		if (ins.objectid > 0) {
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);
			/*
			 * is this extent already allocated in the extent
			 * allocation tree?
			 * If so, just add a reference
			 */
			ret = btrfs_lookup_extent(root, ins.objectid,
						  ins.offset);
			if (ret == 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						ins.objectid, ins.offset,
						0, root->root_key.objectid,
						key->objectid, offset, 0);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root, root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						root->fs_info->csum_root,
						sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	inode_add_bytes(inode, nbytes);
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}
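
/*
 * Worked example (illustrative): replay_one_extent() above copies the
 * checksums that describe the on-disk bytes.  For a compressed extent
 * the csums cover the whole compressed range,
 *
 *	[ins.objectid, ins.objectid + ins.offset)
 *
 * while for an uncompressed extent only the portion the file actually
 * references is needed:
 *
 *	[ins.objectid + file_extent_offset,
 *	 ins.objectid + file_extent_offset + num_bytes)
 */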

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans, root);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * helper function to see if a given name and sequence number found
 * in an inode back reference are already in a directory and correctly
 * point to this inode
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int match = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	btrfs_release_path(path);

	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else
		goto out;
	match = 1;
out:
	btrfs_release_path(path);
	return match;
}
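
/*
 * Layout reminder (illustrative): a single INODE_REF item can pack
 * several back references end to end,
 *
 *	[ btrfs_inode_ref | name ][ btrfs_inode_ref | name ] ...
 *
 * which is why backref_in_log() below walks from ptr to ptr_end one
 * entry at a time instead of assuming one ref per item.
 */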

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   char *name, int namelen)
{
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	unsigned long ptr;
	unsigned long ptr_end;
	unsigned long name_ptr;
	int found_name_len;
	int item_size;
	int ret;
	int match = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret != 0)
		goto out;

	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
						   name, namelen, NULL))
			match = 1;

		goto out;
	}

	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		ref = (struct btrfs_inode_ref *)ptr;
		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
		if (found_name_len == namelen) {
			name_ptr = (unsigned long)(ref + 1);
			ret = memcmp_extent_buffer(path->nodes[0], name,
						   name_ptr, namelen);
			if (ret == 0) {
				match = 1;
				goto out;
			}
		}
		ptr = (unsigned long)(ref + 1) + found_name_len;
	}
out:
	btrfs_free_path(path);
	return match;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct inode *dir, struct inode *inode,
				  struct extent_buffer *eb,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.
		 * if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			if (!backref_in_log(log_root, &search_key,
					    parent_objectid,
					    victim_name,
					    victim_name_len)) {
				btrfs_inc_nlink(inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir,
							 inode, victim_name,
							 victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans, root);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = 0;
			if (!backref_in_log(log_root, &search_key,
					    parent_objectid, victim_name,
					    victim_name_len)) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					btrfs_inc_nlink(inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
								 victim_parent,
								 inode,
								 victim_name,
								 victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans, root);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
			if (ret)
				return ret;
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (di && !IS_ERR(di)) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}
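
/*
 * Layout reminder (illustrative): extended refs are packed the same way,
 *
 *	[ btrfs_inode_extref { parent, index, name_len } | name ] ...
 *
 * but the item key offset is btrfs_extref_hash(parent, name, name_len),
 * so an extended ref can only be looked up when both the parent objectid
 * and the name are known.
 */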

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir;
	struct inode *inode;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir)
		return -ENOENT;

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		iput(dir);
		return -EIO;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir)
				return -ENOENT;
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			return ret;

		/* if we already have a perfect match, we're done */
		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
				  ref_index, name, namelen)) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      dir, inode, eb,
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret == 1) {
					ret = 0;
					goto out;
				}
				if (ret)
					goto out;
			}

			/* insert our name */
			ret = btrfs_add_link(trans, dir, inode, name, namelen,
					     0, ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 offset)
{
	int ret;
	ret = btrfs_find_orphan_item(root, offset);
	if (ret > 0)
		ret = btrfs_insert_orphan_item(trans, root, offset);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, inode, path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, inode, path);
	if (ret == -ENOENT)
		ret = 0;

	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			goto out;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode)
			return -EIO;

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			goto out;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
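
/*
 * Illustrative: the fixup records handled above are keyed as
 *
 *	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
 *	key.type     = BTRFS_ORPHAN_ITEM_KEY;
 *	key.offset   = <objectid of the inode to check>;
 *
 * which lets fixup_inode_link_counts() sweep them all with a single
 * descending search from offset (u64)-1, as link_to_fixup_dir() below
 * inserts them.
 */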

/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			btrfs_inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, inode);
	} else if (ret == -EEXIST) {
		ret = 0;
	} else {
		BUG(); /* Logic Error */
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    u64 dirid, u64 index,
				    char *name, int name_len, u8 type,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	int exists;
	int ret = 0;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	if (exists == 0)
		exists = 1;
	else
		exists = 0;
	btrfs_release_path(path);

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}
	if (IS_ERR_OR_NULL(dst_di)) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, dir);
	}
	kfree(name);
	iput(dir);
	return ret;

insert:
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
			      name, name_len, log_type, &log_key);
	if (ret && ret != -ENOENT)
		goto out;
	update_size = false;
	ret = 0;
	goto out;
}
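
/*
 * Layout reminder (illustrative): a DIR_ITEM item is keyed by name hash
 * and can hold several entries when hashes collide,
 *
 *	[ btrfs_dir_item | name | data ][ btrfs_dir_item | name | data ] ...
 *
 * while a DIR_INDEX item always holds exactly one entry.  The cursor walk
 * in replay_one_dir_item() below copes with both.
 */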

/*
 * find all the names in a directory item and reconcile them into
 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
 * one name in a directory item, but the same code gets used for
 * both directory index types
 */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	u32 item_size = btrfs_item_size_nr(eb, slot);
	struct btrfs_dir_item *di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;

	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di))
			return -EIO;
		name_len = btrfs_dir_name_len(eb, di);
		ret = replay_one_name(trans, root, path, eb, di, key);
		if (ret)
			return ret;
		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	return 0;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
 */
static noinline int find_dir_range(struct btrfs_root *root,
				   struct btrfs_path *path,
				   u64 dirid, int key_type,
				   u64 *start_ret, u64 *end_ret)
{
	struct btrfs_key key;
	u64 found_end;
	struct btrfs_dir_log_item *item;
	int ret;
	int nritems;

	if (*start_ret == (u64)-1)
		return 1;

	key.objectid = dirid;
	key.type = key_type;
	key.offset = *start_ret;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}
	if (ret != 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto next;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);

	if (*start_ret >= key.offset && *start_ret <= found_end) {
		ret = 0;
		*start_ret = key.offset;
		*end_ret = found_end;
		goto out;
	}
	ret = 1;
next:
	/* check the next slot in the tree to see if it is a valid item */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (path->slots[0] >= nritems) {
		ret = btrfs_next_leaf(root, path);
		if (ret)
			goto out;
	} else {
		path->slots[0]++;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != key_type || key.objectid != dirid) {
		ret = 1;
		goto out;
	}
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_dir_log_item);
	found_end = btrfs_dir_log_end(path->nodes[0], item);
	*start_ret = key.offset;
	*end_ret = found_end;
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}
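
/*
 * Worked example (illustrative): a range item with key
 *
 *	(dirid, BTRFS_DIR_LOG_INDEX_KEY, 100)
 *
 * and btrfs_dir_log_end() == 200 says the log is authoritative for index
 * offsets [100, 200] of that directory: any entry in the subvolume in
 * that range that is missing from the log was deleted before the fsync
 * and must be removed during replay.
 */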

/*
 * this looks for a given directory item in the log.  If the directory
 * item is not in the log, the item is removed and the inode it points
 * to is unlinked
 */
static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_root *log,
				      struct btrfs_path *path,
				      struct btrfs_path *log_path,
				      struct inode *dir,
				      struct btrfs_key *dir_key)
{
	int ret;
	struct extent_buffer *eb;
	int slot;
	u32 item_size;
	struct btrfs_dir_item *di;
	struct btrfs_dir_item *log_di;
	int name_len;
	unsigned long ptr;
	unsigned long ptr_end;
	char *name;
	struct inode *inode;
	struct btrfs_key location;

again:
	eb = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size_nr(eb, slot);
	ptr = btrfs_item_ptr_offset(eb, slot);
	ptr_end = ptr + item_size;
	while (ptr < ptr_end) {
		di = (struct btrfs_dir_item *)ptr;
		if (verify_dir_item(root, eb, di)) {
			ret = -EIO;
			goto out;
		}

		name_len = btrfs_dir_name_len(eb, di);
		name = kmalloc(name_len, GFP_NOFS);
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		read_extent_buffer(eb, name, (unsigned long)(di + 1),
				   name_len);
		log_di = NULL;
		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
			log_di = btrfs_lookup_dir_item(trans, log, log_path,
						       dir_key->objectid,
						       name, name_len, 0);
		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
			log_di = btrfs_lookup_dir_index_item(trans, log,
						     log_path,
						     dir_key->objectid,
						     dir_key->offset,
						     name, name_len, 0);
		}
		if (IS_ERR_OR_NULL(log_di)) {
			btrfs_dir_item_key_to_cpu(eb, di, &location);
			btrfs_release_path(path);
			btrfs_release_path(log_path);
			inode = read_one_inode(root, location.objectid);
			if (!inode) {
				kfree(name);
				return -EIO;
			}

			ret = link_to_fixup_dir(trans, root,
						path, location.objectid);
			if (ret) {
				kfree(name);
				iput(inode);
				goto out;
			}

			btrfs_inc_nlink(inode);
			ret = btrfs_unlink_inode(trans, root, dir, inode,
						 name, name_len);
			if (!ret)
				ret = btrfs_run_delayed_items(trans, root);
			kfree(name);
			iput(inode);
			if (ret)
				goto out;

			/* there might still be more names under this key
			 * check and repeat if required
			 */
			ret = btrfs_search_slot(NULL, root, dir_key, path,
						0, 0);
			if (ret == 0)
				goto again;
			ret = 0;
			goto out;
		}
		btrfs_release_path(log_path);
		kfree(name);

		ptr = (unsigned long)(di + 1);
		ptr += name_len;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	btrfs_release_path(log_path);
	return ret;
}

/*
 * deletion replay happens before we copy any new directory items
 * out of the log or out of backreferences from inodes.  It
 * scans the log to find ranges of keys that the log is authoritative for,
 * and then scans the directory to find items in those ranges that are
 * not present in the log.
 *
 * Anything we don't find in the log is unlinked and removed from the
 * directory.
 */
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all)
{
	u64 range_start;
	u64 range_end;
	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
	int ret = 0;
	struct btrfs_key dir_key;
	struct btrfs_key found_key;
	struct btrfs_path *log_path;
	struct inode *dir;

	dir_key.objectid = dirid;
	dir_key.type = BTRFS_DIR_ITEM_KEY;
	log_path = btrfs_alloc_path();
	if (!log_path)
		return -ENOMEM;

	dir = read_one_inode(root, dirid);
	/* it isn't an error if the inode isn't there, that can happen
	 * because we replay the deletes before we copy in the inode item
	 * from the log
	 */
	if (!dir) {
		btrfs_free_path(log_path);
		return 0;
	}
again:
	range_start = 0;
	range_end = 0;
	while (1) {
		if (del_all)
			range_end = (u64)-1;
		else {
			ret = find_dir_range(log, path, dirid, key_type,
					     &range_start, &range_end);
			if (ret != 0)
				break;
		}

		dir_key.offset = range_start;
		while (1) {
			int nritems;
			ret = btrfs_search_slot(NULL, root, &dir_key, path,
						0, 0);
			if (ret < 0)
				goto out;

			nritems = btrfs_header_nritems(path->nodes[0]);
			if (path->slots[0] >= nritems) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
			if (found_key.objectid != dirid ||
			    found_key.type != dir_key.type)
				goto next_type;

			if (found_key.offset > range_end)
				break;

			ret = check_item_in_log(trans, root, log, path,
						log_path, dir,
						&found_key);
			if (ret)
				goto out;
			if (found_key.offset == (u64)-1)
				break;
			dir_key.offset = found_key.offset + 1;
		}
		btrfs_release_path(path);
		if (range_end == (u64)-1)
			break;
		range_start = range_end + 1;
	}

next_type:
	ret = 0;
	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
		key_type = BTRFS_DIR_LOG_INDEX_KEY;
		dir_key.type = BTRFS_DIR_INDEX_KEY;
		btrfs_release_path(path);
		goto again;
	}
out:
	btrfs_release_path(path);
	btrfs_free_path(log_path);
	iput(dir);
	return ret;
}

/*
 * the process_func used to replay items from the log tree.  This
 * gets called in multiple stages.  The first replay stage just looks
 * for inodes and makes sure they are all copied into the subvolume.
 *
 * The later stages copy the directory indexes and all the other item
 * types from the log into the subvolume.  The multi-stage approach is
 * slower, but gets rid of lots of complexity around inodes referencing
 * other inodes that exist only in the log (references come from either
 * directory items or inode back refs).
 */
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
			     struct walk_control *wc, u64 gen)
{
	int nritems;
	struct btrfs_path *path;
	struct btrfs_root *root = wc->replay_dest;
	struct btrfs_key key;
	int level;
	int i;
	int ret;

	ret = btrfs_read_buffer(eb, gen);
	if (ret)
		return ret;

	level = btrfs_header_level(eb);

	if (level != 0)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	nritems = btrfs_header_nritems(eb);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		/* inode keys are done during the first stage */
		if (key.type == BTRFS_INODE_ITEM_KEY &&
		    wc->stage == LOG_WALK_REPLAY_INODES) {
			struct btrfs_inode_item *inode_item;
			u32 mode;

			inode_item = btrfs_item_ptr(eb, i,
					struct btrfs_inode_item);
			mode = btrfs_inode_mode(eb, inode_item);
			if (S_ISDIR(mode)) {
				ret = replay_dir_deletes(wc->trans,
					 root, log, path, key.objectid, 0);
				if (ret)
					break;
			}
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;

			/* for regular files, make sure the corresponding
			 * orphan item exists. extents past the new EOF
			 * will be truncated later by orphan cleanup.
			 */
			if (S_ISREG(mode)) {
				ret = insert_orphan_item(wc->trans, root,
							 key.objectid);
				if (ret)
					break;
			}

			ret = link_to_fixup_dir(wc->trans, root,
						path, key.objectid);
			if (ret)
				break;
		}

		if (key.type == BTRFS_DIR_INDEX_KEY &&
		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}

		if (wc->stage < LOG_WALK_REPLAY_ALL)
			continue;

		/* these keys are simply copied */
		if (key.type == BTRFS_XATTR_ITEM_KEY) {
			ret = overwrite_item(wc->trans, root, path,
					     eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_INODE_REF_KEY ||
			   key.type == BTRFS_INODE_EXTREF_KEY) {
			ret = add_inode_ref(wc->trans, root, log, path,
					    eb, i, &key);
			if (ret && ret != -ENOENT)
				break;
			ret = 0;
		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
			ret = replay_one_extent(wc->trans, root, path,
						eb, i, &key);
			if (ret)
				break;
		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
			ret = replay_one_dir_item(wc->trans, root, path,
						  eb, i, &key);
			if (ret)
				break;
		}
	}
	btrfs_free_path(path);
	return ret;
}

static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_path *path, int *level,
				       struct walk_control *wc)
{
	u64 root_owner;
	u64 bytenr;
	u64 ptr_gen;
	struct extent_buffer *next;
	struct extent_buffer *cur;
	struct extent_buffer *parent;
	u32 blocksize;
	int ret = 0;

	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	while (*level > 0) {
		WARN_ON(*level < 0);
		WARN_ON(*level >= BTRFS_MAX_LEVEL);
		cur = path->nodes[*level];

		if (btrfs_header_level(cur) != *level)
			WARN_ON(1);

		if (path->slots[*level] >=
		    btrfs_header_nritems(cur))
			break;

		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
		blocksize = btrfs_level_size(root, *level - 1);

		parent = path->nodes[*level];
		root_owner = btrfs_header_owner(parent);

		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
		if (!next)
			return -ENOMEM;

		if (*level == 1) {
			ret = wc->process_func(root, next, wc, ptr_gen);
			if (ret) {
				free_extent_buffer(next);
				return ret;
			}

			path->slots[*level]++;
			if (wc->free) {
				ret = btrfs_read_buffer(next, ptr_gen);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}

				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				WARN_ON(root_owner !=
					BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
							bytenr, blocksize);
				if (ret) {
					free_extent_buffer(next);
					return ret;
				}
			}
			free_extent_buffer(next);
			continue;
		}
		ret = btrfs_read_buffer(next, ptr_gen);
		if (ret) {
			free_extent_buffer(next);
			return ret;
		}

		WARN_ON(*level <= 0);
		if (path->nodes[*level-1])
			free_extent_buffer(path->nodes[*level-1]);
		path->nodes[*level-1] = next;
		*level = btrfs_header_level(next);
		path->slots[*level] = 0;
		cond_resched();
	}
	WARN_ON(*level < 0);
	WARN_ON(*level >= BTRFS_MAX_LEVEL);

	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);

	cond_resched();
	return 0;
}

static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int *level,
				     struct walk_control *wc)
{
	u64 root_owner;
	int i;
	int slot;
	int ret;

	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
		slot = path->slots[i];
		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
			path->slots[i]++;
			*level = i;
			WARN_ON(*level == 0);
			return 0;
		} else {
			struct extent_buffer *parent;
			if (path->nodes[*level] == root->node)
				parent = path->nodes[*level];
			else
				parent = path->nodes[*level + 1];

			root_owner = btrfs_header_owner(parent);
			ret = wc->process_func(root, path->nodes[*level], wc,
				 btrfs_header_generation(path->nodes[*level]));
			if (ret)
				return ret;

			if (wc->free) {
				struct extent_buffer *next;

				next = path->nodes[*level];

				btrfs_tree_lock(next);
				btrfs_set_lock_blocking(next);
				clean_tree_block(trans, root, next);
				btrfs_wait_tree_block_writeback(next);
				btrfs_tree_unlock(next);

				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				ret = btrfs_free_and_pin_reserved_extent(root,
						path->nodes[*level]->start,
						path->nodes[*level]->len);
				if (ret)
					return ret;
			}
			free_extent_buffer(path->nodes[*level]);
			path->nodes[*level] = NULL;
			*level = i + 1;
		}
	}
	return 1;
}

/*
 * drop the reference count on the tree rooted at 'log'. This traverses
 * the tree freeing any blocks that have a ref count of zero after being
 * decremented.
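 *
 * (Illustrative note: walk_down_log_tree processes the leaves below each
 * level-1 node, walk_up_log_tree processes the interior nodes on the way
 * back up, and walk_log_tree handles the root itself at the end, so each
 * block is visited once.)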
2255 */ 2256 static int walk_log_tree(struct btrfs_trans_handle *trans, 2257 struct btrfs_root *log, struct walk_control *wc) 2258 { 2259 int ret = 0; 2260 int wret; 2261 int level; 2262 struct btrfs_path *path; 2263 int orig_level; 2264 2265 path = btrfs_alloc_path(); 2266 if (!path) 2267 return -ENOMEM; 2268 2269 level = btrfs_header_level(log->node); 2270 orig_level = level; 2271 path->nodes[level] = log->node; 2272 extent_buffer_get(log->node); 2273 path->slots[level] = 0; 2274 2275 while (1) { 2276 wret = walk_down_log_tree(trans, log, path, &level, wc); 2277 if (wret > 0) 2278 break; 2279 if (wret < 0) { 2280 ret = wret; 2281 goto out; 2282 } 2283 2284 wret = walk_up_log_tree(trans, log, path, &level, wc); 2285 if (wret > 0) 2286 break; 2287 if (wret < 0) { 2288 ret = wret; 2289 goto out; 2290 } 2291 } 2292 2293 /* was the root node processed? if not, catch it here */ 2294 if (path->nodes[orig_level]) { 2295 ret = wc->process_func(log, path->nodes[orig_level], wc, 2296 btrfs_header_generation(path->nodes[orig_level])); 2297 if (ret) 2298 goto out; 2299 if (wc->free) { 2300 struct extent_buffer *next; 2301 2302 next = path->nodes[orig_level]; 2303 2304 btrfs_tree_lock(next); 2305 btrfs_set_lock_blocking(next); 2306 clean_tree_block(trans, log, next); 2307 btrfs_wait_tree_block_writeback(next); 2308 btrfs_tree_unlock(next); 2309 2310 WARN_ON(log->root_key.objectid != 2311 BTRFS_TREE_LOG_OBJECTID); 2312 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2313 next->len); 2314 if (ret) 2315 goto out; 2316 } 2317 } 2318 2319 out: 2320 btrfs_free_path(path); 2321 return ret; 2322 } 2323 2324 /* 2325 * helper function to update the item for a given subvolumes log root 2326 * in the tree of log roots 2327 */ 2328 static int update_log_root(struct btrfs_trans_handle *trans, 2329 struct btrfs_root *log) 2330 { 2331 int ret; 2332 2333 if (log->log_transid == 1) { 2334 /* insert root item on the first sync */ 2335 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2336 &log->root_key, &log->root_item); 2337 } else { 2338 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2339 &log->root_key, &log->root_item); 2340 } 2341 return ret; 2342 } 2343 2344 static int wait_log_commit(struct btrfs_trans_handle *trans, 2345 struct btrfs_root *root, unsigned long transid) 2346 { 2347 DEFINE_WAIT(wait); 2348 int index = transid % 2; 2349 2350 /* 2351 * we only allow two pending log transactions at a time, 2352 * so we know that if ours is more than 2 older than the 2353 * current transaction, we're done 2354 */ 2355 do { 2356 prepare_to_wait(&root->log_commit_wait[index], 2357 &wait, TASK_UNINTERRUPTIBLE); 2358 mutex_unlock(&root->log_mutex); 2359 2360 if (root->fs_info->last_trans_log_full_commit != 2361 trans->transid && root->log_transid < transid + 2 && 2362 atomic_read(&root->log_commit[index])) 2363 schedule(); 2364 2365 finish_wait(&root->log_commit_wait[index], &wait); 2366 mutex_lock(&root->log_mutex); 2367 } while (root->fs_info->last_trans_log_full_commit != 2368 trans->transid && root->log_transid < transid + 2 && 2369 atomic_read(&root->log_commit[index])); 2370 return 0; 2371 } 2372 2373 static void wait_for_writer(struct btrfs_trans_handle *trans, 2374 struct btrfs_root *root) 2375 { 2376 DEFINE_WAIT(wait); 2377 while (root->fs_info->last_trans_log_full_commit != 2378 trans->transid && atomic_read(&root->log_writers)) { 2379 prepare_to_wait(&root->log_writer_wait, 2380 &wait, TASK_UNINTERRUPTIBLE); 2381 mutex_unlock(&root->log_mutex); 2382 if 
(root->fs_info->last_trans_log_full_commit !=
		    trans->transid && atomic_read(&root->log_writers))
			schedule();
		mutex_lock(&root->log_mutex);
		finish_wait(&root->log_writer_wait, &wait);
	}
}
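/*
 * Illustrative sketch (not from the original source): the typical fsync
 * caller sequence is roughly
 *
 *	trans = btrfs_start_transaction(root, 0);
 *	ret = btrfs_log_dentry_safe(trans, root, dentry);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root);
 *	if (ret)
 *		ret = btrfs_commit_transaction(trans, root);
 *	else
 *		ret = btrfs_end_transaction(trans, root);
 *
 * i.e. any non-zero return from the log code is expected to fall back to
 * a full transaction commit, which is always the safe choice.
 */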
/*
 * btrfs_sync_log sends a given tree log down to the disk and
 * updates the super blocks to record it. When this call returns 0,
 * you know that any inodes previously logged are safely on disk.
 *
 * Any other return value means you need to call btrfs_commit_transaction.
 * Some of the edge cases for fsyncing directories that have had unlinks
 * or renames done in the past mean that sometimes the only safe
 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
 * that has happened.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root)
{
	int index1;
	int index2;
	int mark;
	int ret;
	struct btrfs_root *log = root->log_root;
	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
	unsigned long log_transid = 0;
	struct blk_plug plug;

	mutex_lock(&root->log_mutex);
	log_transid = root->log_transid;
	index1 = root->log_transid % 2;
	if (atomic_read(&root->log_commit[index1])) {
		wait_log_commit(trans, root, root->log_transid);
		mutex_unlock(&root->log_mutex);
		return 0;
	}
	atomic_set(&root->log_commit[index1], 1);

	/* wait for previous tree log sync to complete */
	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
		wait_log_commit(trans, root, root->log_transid - 1);
	while (1) {
		int batch = atomic_read(&root->log_batch);
		/* when we're on an ssd, just kick the log commit out */
		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
			mutex_unlock(&root->log_mutex);
			schedule_timeout_uninterruptible(1);
			mutex_lock(&root->log_mutex);
		}
		wait_for_writer(trans, root);
		if (batch == atomic_read(&root->log_batch))
			break;
	}

	/* bail out if we need to do a full commit */
	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		ret = -EAGAIN;
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	if (log_transid % 2 == 0)
		mark = EXTENT_DIRTY;
	else
		mark = EXTENT_NEW;

	/* we start IO on all the marked extents here, but we don't actually
	 * wait for them until later.
	 */
	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
	if (ret) {
		blk_finish_plug(&plug);
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&root->log_mutex);
		goto out;
	}

	btrfs_set_root_node(&log->root_item, log->node);

	root->log_transid++;
	log->log_transid = root->log_transid;
	root->log_start_pid = 0;
	smp_mb();
	/*
	 * IO has been started, blocks of the log tree have WRITTEN flag set
	 * in their headers. New modifications of the log will be written to
	 * new positions, so it's safe to allow log writers to go in.
	 */
	mutex_unlock(&root->log_mutex);

	mutex_lock(&log_root_tree->log_mutex);
	atomic_inc(&log_root_tree->log_batch);
	atomic_inc(&log_root_tree->log_writers);
	mutex_unlock(&log_root_tree->log_mutex);

	ret = update_log_root(trans, log);

	mutex_lock(&log_root_tree->log_mutex);
	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
		smp_mb();
		if (waitqueue_active(&log_root_tree->log_writer_wait))
			wake_up(&log_root_tree->log_writer_wait);
	}

	if (ret) {
		blk_finish_plug(&plug);
		if (ret != -ENOSPC) {
			btrfs_abort_transaction(trans, root, ret);
			mutex_unlock(&log_root_tree->log_mutex);
			goto out;
		}
		root->fs_info->last_trans_log_full_commit = trans->transid;
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out;
	}

	index2 = log_root_tree->log_transid % 2;
	if (atomic_read(&log_root_tree->log_commit[index2])) {
		blk_finish_plug(&plug);
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		wait_log_commit(trans, log_root_tree,
				log_root_tree->log_transid);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = 0;
		goto out;
	}
	atomic_set(&log_root_tree->log_commit[index2], 1);

	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
		wait_log_commit(trans, log_root_tree,
				log_root_tree->log_transid - 1);
	}

	wait_for_writer(trans, log_root_tree);

	/*
	 * now that we've moved on to the tree of log tree roots,
	 * check the full commit flag again
	 */
	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		blk_finish_plug(&plug);
		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		ret = -EAGAIN;
		goto out_wake_log_root;
	}

	ret = btrfs_write_marked_extents(log_root_tree,
					 &log_root_tree->dirty_log_pages,
					 EXTENT_DIRTY | EXTENT_NEW);
	blk_finish_plug(&plug);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		btrfs_free_logged_extents(log, log_transid);
		mutex_unlock(&log_root_tree->log_mutex);
		goto out_wake_log_root;
	}
	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
	btrfs_wait_marked_extents(log_root_tree,
				  &log_root_tree->dirty_log_pages,
				  EXTENT_NEW | EXTENT_DIRTY);
	btrfs_wait_logged_extents(log, log_transid);

	btrfs_set_super_log_root(root->fs_info->super_for_commit,
				 log_root_tree->node->start);
	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
				       btrfs_header_level(log_root_tree->node));

	log_root_tree->log_transid++;
	smp_mb();

	mutex_unlock(&log_root_tree->log_mutex);

	/*
	 * nobody else is going to jump in and write the ctree
	 * super here because the log_commit atomic below is protecting
	 * us. We must be called with a transaction handle pinning
	 * the running transaction open, so a full commit can't hop
	 * in and cause problems either.
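	 *
	 * (Illustrative note: the super block only records the bytenr and
	 * level of the log root tree, via the btrfs_set_super_log_root()
	 * calls above; on the next mount, btrfs_recover_log_trees() reads
	 * the log root tree back from that pointer and replays whatever it
	 * finds.)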
	 */
	btrfs_scrub_pause_super(root);
	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
	btrfs_scrub_continue_super(root);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out_wake_log_root;
	}

	mutex_lock(&root->log_mutex);
	if (root->last_log_commit < log_transid)
		root->last_log_commit = log_transid;
	mutex_unlock(&root->log_mutex);

out_wake_log_root:
	atomic_set(&log_root_tree->log_commit[index2], 0);
	smp_mb();
	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
	atomic_set(&root->log_commit[index1], 0);
	smp_mb();
	if (waitqueue_active(&root->log_commit_wait[index1]))
		wake_up(&root->log_commit_wait[index1]);
	return ret;
}

static void free_log_tree(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log)
{
	int ret;
	u64 start;
	u64 end;
	struct walk_control wc = {
		.free = 1,
		.process_func = process_one_buffer
	};

	if (trans) {
		ret = walk_log_tree(trans, log, &wc);

		/* I don't think this can happen but just in case */
		if (ret)
			btrfs_abort_transaction(trans, log, ret);
	}

	while (1) {
		ret = find_first_extent_bit(&log->dirty_log_pages,
				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
				NULL);
		if (ret)
			break;

		clear_extent_bits(&log->dirty_log_pages, start, end,
				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
	}

	/*
	 * We may have short-circuited the log tree with the full commit logic
	 * and left ordered extents on our list, so clear these out to keep us
	 * from leaking inodes and memory.
	 */
	btrfs_free_logged_extents(log, 0);
	btrfs_free_logged_extents(log, 1);

	free_extent_buffer(log->node);
	kfree(log);
}

/*
 * free all the extents used by the tree log. This should be called
 * at commit time of the full transaction
 */
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
{
	if (root->log_root) {
		free_log_tree(trans, root->log_root);
		root->log_root = NULL;
	}
	return 0;
}

int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	if (fs_info->log_root_tree) {
		free_log_tree(trans, fs_info->log_root_tree);
		fs_info->log_root_tree = NULL;
	}
	return 0;
}

/*
 * If both a file and directory are logged, and unlinks or renames are
 * mixed in, we have a few interesting corners:
 *
 * create file X in dir Y
 * link file X to X.link in dir Y
 * fsync file X
 * unlink file X but leave X.link
 * fsync dir Y
 *
 * After a crash we would expect only X.link to exist. But file X
 * didn't get fsync'd again so the log has back refs for X and X.link.
 *
 * We solve this by removing directory entries and inode backrefs from the
 * log when a file that was logged in the current transaction is
 * unlinked. Any later fsync will include the updated log entries, and
 * we'll be able to reconstruct the proper directory items from backrefs.
 *
 * This optimization allows us to avoid relogging the entire inode
 * or the entire directory.
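 *
 * (Illustrative note: in the sequence above it is
 * btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log() below
 * that scrub the stale "X" name out of the log at unlink time, so that
 * replay only ever sees X.link.)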
2685 */ 2686 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2687 struct btrfs_root *root, 2688 const char *name, int name_len, 2689 struct inode *dir, u64 index) 2690 { 2691 struct btrfs_root *log; 2692 struct btrfs_dir_item *di; 2693 struct btrfs_path *path; 2694 int ret; 2695 int err = 0; 2696 int bytes_del = 0; 2697 u64 dir_ino = btrfs_ino(dir); 2698 2699 if (BTRFS_I(dir)->logged_trans < trans->transid) 2700 return 0; 2701 2702 ret = join_running_log_trans(root); 2703 if (ret) 2704 return 0; 2705 2706 mutex_lock(&BTRFS_I(dir)->log_mutex); 2707 2708 log = root->log_root; 2709 path = btrfs_alloc_path(); 2710 if (!path) { 2711 err = -ENOMEM; 2712 goto out_unlock; 2713 } 2714 2715 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2716 name, name_len, -1); 2717 if (IS_ERR(di)) { 2718 err = PTR_ERR(di); 2719 goto fail; 2720 } 2721 if (di) { 2722 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2723 bytes_del += name_len; 2724 if (ret) { 2725 err = ret; 2726 goto fail; 2727 } 2728 } 2729 btrfs_release_path(path); 2730 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2731 index, name, name_len, -1); 2732 if (IS_ERR(di)) { 2733 err = PTR_ERR(di); 2734 goto fail; 2735 } 2736 if (di) { 2737 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2738 bytes_del += name_len; 2739 if (ret) { 2740 err = ret; 2741 goto fail; 2742 } 2743 } 2744 2745 /* update the directory size in the log to reflect the names 2746 * we have removed 2747 */ 2748 if (bytes_del) { 2749 struct btrfs_key key; 2750 2751 key.objectid = dir_ino; 2752 key.offset = 0; 2753 key.type = BTRFS_INODE_ITEM_KEY; 2754 btrfs_release_path(path); 2755 2756 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2757 if (ret < 0) { 2758 err = ret; 2759 goto fail; 2760 } 2761 if (ret == 0) { 2762 struct btrfs_inode_item *item; 2763 u64 i_size; 2764 2765 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2766 struct btrfs_inode_item); 2767 i_size = btrfs_inode_size(path->nodes[0], item); 2768 if (i_size > bytes_del) 2769 i_size -= bytes_del; 2770 else 2771 i_size = 0; 2772 btrfs_set_inode_size(path->nodes[0], item, i_size); 2773 btrfs_mark_buffer_dirty(path->nodes[0]); 2774 } else 2775 ret = 0; 2776 btrfs_release_path(path); 2777 } 2778 fail: 2779 btrfs_free_path(path); 2780 out_unlock: 2781 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2782 if (ret == -ENOSPC) { 2783 root->fs_info->last_trans_log_full_commit = trans->transid; 2784 ret = 0; 2785 } else if (ret < 0) 2786 btrfs_abort_transaction(trans, root, ret); 2787 2788 btrfs_end_log_trans(root); 2789 2790 return err; 2791 } 2792 2793 /* see comments for btrfs_del_dir_entries_in_log */ 2794 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2795 struct btrfs_root *root, 2796 const char *name, int name_len, 2797 struct inode *inode, u64 dirid) 2798 { 2799 struct btrfs_root *log; 2800 u64 index; 2801 int ret; 2802 2803 if (BTRFS_I(inode)->logged_trans < trans->transid) 2804 return 0; 2805 2806 ret = join_running_log_trans(root); 2807 if (ret) 2808 return 0; 2809 log = root->log_root; 2810 mutex_lock(&BTRFS_I(inode)->log_mutex); 2811 2812 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 2813 dirid, &index); 2814 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2815 if (ret == -ENOSPC) { 2816 root->fs_info->last_trans_log_full_commit = trans->transid; 2817 ret = 0; 2818 } else if (ret < 0 && ret != -ENOENT) 2819 btrfs_abort_transaction(trans, root, ret); 2820 btrfs_end_log_trans(root); 2821 2822 return ret; 2823 } 2824 2825 /* 
2826 * creates a range item in the log for 'dirid'. first_offset and 2827 * last_offset tell us which parts of the key space the log should 2828 * be considered authoritative for. 2829 */ 2830 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2831 struct btrfs_root *log, 2832 struct btrfs_path *path, 2833 int key_type, u64 dirid, 2834 u64 first_offset, u64 last_offset) 2835 { 2836 int ret; 2837 struct btrfs_key key; 2838 struct btrfs_dir_log_item *item; 2839 2840 key.objectid = dirid; 2841 key.offset = first_offset; 2842 if (key_type == BTRFS_DIR_ITEM_KEY) 2843 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2844 else 2845 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2846 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2847 if (ret) 2848 return ret; 2849 2850 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2851 struct btrfs_dir_log_item); 2852 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2853 btrfs_mark_buffer_dirty(path->nodes[0]); 2854 btrfs_release_path(path); 2855 return 0; 2856 } 2857 2858 /* 2859 * log all the items included in the current transaction for a given 2860 * directory. This also creates the range items in the log tree required 2861 * to replay anything deleted before the fsync 2862 */ 2863 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2864 struct btrfs_root *root, struct inode *inode, 2865 struct btrfs_path *path, 2866 struct btrfs_path *dst_path, int key_type, 2867 u64 min_offset, u64 *last_offset_ret) 2868 { 2869 struct btrfs_key min_key; 2870 struct btrfs_key max_key; 2871 struct btrfs_root *log = root->log_root; 2872 struct extent_buffer *src; 2873 int err = 0; 2874 int ret; 2875 int i; 2876 int nritems; 2877 u64 first_offset = min_offset; 2878 u64 last_offset = (u64)-1; 2879 u64 ino = btrfs_ino(inode); 2880 2881 log = root->log_root; 2882 max_key.objectid = ino; 2883 max_key.offset = (u64)-1; 2884 max_key.type = key_type; 2885 2886 min_key.objectid = ino; 2887 min_key.type = key_type; 2888 min_key.offset = min_offset; 2889 2890 path->keep_locks = 1; 2891 2892 ret = btrfs_search_forward(root, &min_key, &max_key, 2893 path, trans->transid); 2894 2895 /* 2896 * we didn't find anything from this transaction, see if there 2897 * is anything at all 2898 */ 2899 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 2900 min_key.objectid = ino; 2901 min_key.type = key_type; 2902 min_key.offset = (u64)-1; 2903 btrfs_release_path(path); 2904 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2905 if (ret < 0) { 2906 btrfs_release_path(path); 2907 return ret; 2908 } 2909 ret = btrfs_previous_item(root, path, ino, key_type); 2910 2911 /* if ret == 0 there are items for this type, 2912 * create a range to tell us the last key of this type. 2913 * otherwise, there are no items in this directory after 2914 * *min_offset, and we create a range to indicate that. 
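		 *
		 * (Illustrative numbers: if the directory's last logged
		 * index key sits at offset 12 and *min_offset is 20, there
		 * is nothing past *min_offset, so the range item that gets
		 * logged starts at max(20, 12) + 1 = 21 and ends at
		 * (u64)-1, marking that whole tail of the key space as
		 * known empty.)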
		 */
		if (ret == 0) {
			struct btrfs_key tmp;
			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
					      path->slots[0]);
			if (key_type == tmp.type)
				first_offset = max(min_offset, tmp.offset) + 1;
		}
		goto done;
	}

	/* go backward to find any previous key */
	ret = btrfs_previous_item(root, path, ino, key_type);
	if (ret == 0) {
		struct btrfs_key tmp;
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (key_type == tmp.type) {
			first_offset = tmp.offset;
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret) {
				err = ret;
				goto done;
			}
		}
	}
	btrfs_release_path(path);

	/* find the first key from this transaction again */
	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
	if (ret != 0) {
		WARN_ON(1);
		goto done;
	}

	/*
	 * we have a block from this transaction, log every item in it
	 * from our directory
	 */
	while (1) {
		struct btrfs_key tmp;
		src = path->nodes[0];
		nritems = btrfs_header_nritems(src);
		for (i = path->slots[0]; i < nritems; i++) {
			btrfs_item_key_to_cpu(src, &min_key, i);

			if (min_key.objectid != ino || min_key.type != key_type)
				goto done;
			ret = overwrite_item(trans, log, dst_path, src, i,
					     &min_key);
			if (ret) {
				err = ret;
				goto done;
			}
		}
		path->slots[0] = nritems;

		/*
		 * look ahead to the next item and see if it is also
		 * from this directory and from this transaction
		 */
		ret = btrfs_next_leaf(root, path);
		if (ret == 1) {
			last_offset = (u64)-1;
			goto done;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
		if (tmp.objectid != ino || tmp.type != key_type) {
			last_offset = (u64)-1;
			goto done;
		}
		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
			ret = overwrite_item(trans, log, dst_path,
					     path->nodes[0], path->slots[0],
					     &tmp);
			if (ret)
				err = ret;
			else
				last_offset = tmp.offset;
			goto done;
		}
	}
done:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);

	if (err == 0) {
		*last_offset_ret = last_offset;
		/*
		 * insert the log range keys to indicate where the log
		 * is valid
		 */
		ret = insert_dir_log_key(trans, log, path, key_type,
					 ino, first_offset, last_offset);
		if (ret)
			err = ret;
	}
	return err;
}

/*
 * logging directories is very similar to logging inodes. We find all the
 * items from the current transaction and write them to the log.
 *
 * The recovery code scans the directory in the subvolume, and if it finds a
 * key in the range logged that is not present in the log tree, then it means
 * that dir entry was unlinked during the transaction.
 *
 * In order for that scan to work, we must include one key smaller than
 * the smallest key logged by this transaction and one key larger than the
 * largest key logged by this transaction.
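 *
 * For example (illustrative numbers): if this transaction only touched
 * index keys 20 through 30 of the directory, the log ends up with a
 * DIR_LOG range item covering roughly [19, 31].  On replay, an index key
 * found in the subvolume inside that range but missing from the log is
 * unlinked, while keys outside the range are left alone.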
3027 */ 3028 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3029 struct btrfs_root *root, struct inode *inode, 3030 struct btrfs_path *path, 3031 struct btrfs_path *dst_path) 3032 { 3033 u64 min_key; 3034 u64 max_key; 3035 int ret; 3036 int key_type = BTRFS_DIR_ITEM_KEY; 3037 3038 again: 3039 min_key = 0; 3040 max_key = 0; 3041 while (1) { 3042 ret = log_dir_items(trans, root, inode, path, 3043 dst_path, key_type, min_key, 3044 &max_key); 3045 if (ret) 3046 return ret; 3047 if (max_key == (u64)-1) 3048 break; 3049 min_key = max_key + 1; 3050 } 3051 3052 if (key_type == BTRFS_DIR_ITEM_KEY) { 3053 key_type = BTRFS_DIR_INDEX_KEY; 3054 goto again; 3055 } 3056 return 0; 3057 } 3058 3059 /* 3060 * a helper function to drop items from the log before we relog an 3061 * inode. max_key_type indicates the highest item type to remove. 3062 * This cannot be run for file data extents because it does not 3063 * free the extents they point to. 3064 */ 3065 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3066 struct btrfs_root *log, 3067 struct btrfs_path *path, 3068 u64 objectid, int max_key_type) 3069 { 3070 int ret; 3071 struct btrfs_key key; 3072 struct btrfs_key found_key; 3073 int start_slot; 3074 3075 key.objectid = objectid; 3076 key.type = max_key_type; 3077 key.offset = (u64)-1; 3078 3079 while (1) { 3080 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3081 BUG_ON(ret == 0); /* Logic error */ 3082 if (ret < 0) 3083 break; 3084 3085 if (path->slots[0] == 0) 3086 break; 3087 3088 path->slots[0]--; 3089 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3090 path->slots[0]); 3091 3092 if (found_key.objectid != objectid) 3093 break; 3094 3095 found_key.offset = 0; 3096 found_key.type = 0; 3097 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3098 &start_slot); 3099 3100 ret = btrfs_del_items(trans, log, path, start_slot, 3101 path->slots[0] - start_slot + 1); 3102 /* 3103 * If start slot isn't 0 then we don't need to re-search, we've 3104 * found the last guy with the objectid in this tree. 
		 */
		if (ret || start_slot != 0)
			break;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);
	if (ret > 0)
		ret = 0;
	return ret;
}

static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	if (log_inode_only) {
		/* set the generation to zero so the recovery code
		 * can tell the difference between a logging done
		 * just to say 'this inode exists' and a logging done
		 * to say 'update this inode with these values'
		 */
		btrfs_set_token_inode_generation(leaf, item, 0, &token);
		btrfs_set_token_inode_size(leaf, item, 0, &token);
	} else {
		btrfs_set_token_inode_generation(leaf, item,
						 BTRFS_I(inode)->generation,
						 &token);
		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
	}

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);

	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}

static int log_inode_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *log, struct btrfs_path *path,
			  struct inode *inode)
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_key key;
	int ret;

	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
	ret = btrfs_insert_empty_item(trans, log, path, &key,
				      sizeof(*inode_item));
	if (ret && ret != -EEXIST)
		return ret;
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_inode_item);
	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
	btrfs_release_path(path);
	return 0;
}

static noinline int copy_items(struct btrfs_trans_handle *trans,
			       struct inode *inode,
			       struct btrfs_path *dst_path,
			       struct extent_buffer *src,
			       int start_slot, int nr, int inode_only)
{
	unsigned long src_offset;
	unsigned long dst_offset;
	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
	struct btrfs_file_extent_item *extent;
	struct btrfs_inode_item *inode_item;
	int ret;
	struct btrfs_key *ins_keys;
	u32 *ins_sizes;
	char *ins_data;
int i; 3206 struct list_head ordered_sums; 3207 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3208 3209 INIT_LIST_HEAD(&ordered_sums); 3210 3211 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3212 nr * sizeof(u32), GFP_NOFS); 3213 if (!ins_data) 3214 return -ENOMEM; 3215 3216 ins_sizes = (u32 *)ins_data; 3217 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3218 3219 for (i = 0; i < nr; i++) { 3220 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3221 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3222 } 3223 ret = btrfs_insert_empty_items(trans, log, dst_path, 3224 ins_keys, ins_sizes, nr); 3225 if (ret) { 3226 kfree(ins_data); 3227 return ret; 3228 } 3229 3230 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3231 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3232 dst_path->slots[0]); 3233 3234 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3235 3236 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3237 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3238 dst_path->slots[0], 3239 struct btrfs_inode_item); 3240 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3241 inode, inode_only == LOG_INODE_EXISTS); 3242 } else { 3243 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3244 src_offset, ins_sizes[i]); 3245 } 3246 3247 /* take a reference on file data extents so that truncates 3248 * or deletes of this inode don't have to relog the inode 3249 * again 3250 */ 3251 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && 3252 !skip_csum) { 3253 int found_type; 3254 extent = btrfs_item_ptr(src, start_slot + i, 3255 struct btrfs_file_extent_item); 3256 3257 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3258 continue; 3259 3260 found_type = btrfs_file_extent_type(src, extent); 3261 if (found_type == BTRFS_FILE_EXTENT_REG) { 3262 u64 ds, dl, cs, cl; 3263 ds = btrfs_file_extent_disk_bytenr(src, 3264 extent); 3265 /* ds == 0 is a hole */ 3266 if (ds == 0) 3267 continue; 3268 3269 dl = btrfs_file_extent_disk_num_bytes(src, 3270 extent); 3271 cs = btrfs_file_extent_offset(src, extent); 3272 cl = btrfs_file_extent_num_bytes(src, 3273 extent); 3274 if (btrfs_file_extent_compression(src, 3275 extent)) { 3276 cs = 0; 3277 cl = dl; 3278 } 3279 3280 ret = btrfs_lookup_csums_range( 3281 log->fs_info->csum_root, 3282 ds + cs, ds + cs + cl - 1, 3283 &ordered_sums, 0); 3284 if (ret) { 3285 btrfs_release_path(dst_path); 3286 kfree(ins_data); 3287 return ret; 3288 } 3289 } 3290 } 3291 } 3292 3293 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3294 btrfs_release_path(dst_path); 3295 kfree(ins_data); 3296 3297 /* 3298 * we have to do this after the loop above to avoid changing the 3299 * log tree while trying to change the log tree. 
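 *
 * (In other words: btrfs_csum_file_blocks() inserts csum items into the
 * same log tree we were just copying items into, so the csums collected
 * in ordered_sums are held back until dst_path has been released.)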
3300 */ 3301 ret = 0; 3302 while (!list_empty(&ordered_sums)) { 3303 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3304 struct btrfs_ordered_sum, 3305 list); 3306 if (!ret) 3307 ret = btrfs_csum_file_blocks(trans, log, sums); 3308 list_del(&sums->list); 3309 kfree(sums); 3310 } 3311 return ret; 3312 } 3313 3314 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3315 { 3316 struct extent_map *em1, *em2; 3317 3318 em1 = list_entry(a, struct extent_map, list); 3319 em2 = list_entry(b, struct extent_map, list); 3320 3321 if (em1->start < em2->start) 3322 return -1; 3323 else if (em1->start > em2->start) 3324 return 1; 3325 return 0; 3326 } 3327 3328 static int log_one_extent(struct btrfs_trans_handle *trans, 3329 struct inode *inode, struct btrfs_root *root, 3330 struct extent_map *em, struct btrfs_path *path) 3331 { 3332 struct btrfs_root *log = root->log_root; 3333 struct btrfs_file_extent_item *fi; 3334 struct extent_buffer *leaf; 3335 struct btrfs_ordered_extent *ordered; 3336 struct list_head ordered_sums; 3337 struct btrfs_map_token token; 3338 struct btrfs_key key; 3339 u64 mod_start = em->mod_start; 3340 u64 mod_len = em->mod_len; 3341 u64 csum_offset; 3342 u64 csum_len; 3343 u64 extent_offset = em->start - em->orig_start; 3344 u64 block_len; 3345 int ret; 3346 int index = log->log_transid % 2; 3347 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3348 3349 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 3350 em->start + em->len, NULL, 0); 3351 if (ret) 3352 return ret; 3353 3354 INIT_LIST_HEAD(&ordered_sums); 3355 btrfs_init_map_token(&token); 3356 key.objectid = btrfs_ino(inode); 3357 key.type = BTRFS_EXTENT_DATA_KEY; 3358 key.offset = em->start; 3359 3360 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3361 if (ret) 3362 return ret; 3363 leaf = path->nodes[0]; 3364 fi = btrfs_item_ptr(leaf, path->slots[0], 3365 struct btrfs_file_extent_item); 3366 3367 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3368 &token); 3369 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3370 skip_csum = true; 3371 btrfs_set_token_file_extent_type(leaf, fi, 3372 BTRFS_FILE_EXTENT_PREALLOC, 3373 &token); 3374 } else { 3375 btrfs_set_token_file_extent_type(leaf, fi, 3376 BTRFS_FILE_EXTENT_REG, 3377 &token); 3378 if (em->block_start == 0) 3379 skip_csum = true; 3380 } 3381 3382 block_len = max(em->block_len, em->orig_block_len); 3383 if (em->compress_type != BTRFS_COMPRESS_NONE) { 3384 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3385 em->block_start, 3386 &token); 3387 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3388 &token); 3389 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3390 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3391 em->block_start - 3392 extent_offset, &token); 3393 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3394 &token); 3395 } else { 3396 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3397 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3398 &token); 3399 } 3400 3401 btrfs_set_token_file_extent_offset(leaf, fi, 3402 em->start - em->orig_start, 3403 &token); 3404 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3405 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 3406 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3407 &token); 3408 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3409 
btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3410 btrfs_mark_buffer_dirty(leaf); 3411 3412 btrfs_release_path(path); 3413 if (ret) { 3414 return ret; 3415 } 3416 3417 if (skip_csum) 3418 return 0; 3419 3420 if (em->compress_type) { 3421 csum_offset = 0; 3422 csum_len = block_len; 3423 } 3424 3425 /* 3426 * First check and see if our csums are on our outstanding ordered 3427 * extents. 3428 */ 3429 again: 3430 spin_lock_irq(&log->log_extents_lock[index]); 3431 list_for_each_entry(ordered, &log->logged_list[index], log_list) { 3432 struct btrfs_ordered_sum *sum; 3433 3434 if (!mod_len) 3435 break; 3436 3437 if (ordered->inode != inode) 3438 continue; 3439 3440 if (ordered->file_offset + ordered->len <= mod_start || 3441 mod_start + mod_len <= ordered->file_offset) 3442 continue; 3443 3444 /* 3445 * We are going to copy all the csums on this ordered extent, so 3446 * go ahead and adjust mod_start and mod_len in case this 3447 * ordered extent has already been logged. 3448 */ 3449 if (ordered->file_offset > mod_start) { 3450 if (ordered->file_offset + ordered->len >= 3451 mod_start + mod_len) 3452 mod_len = ordered->file_offset - mod_start; 3453 /* 3454 * If we have this case 3455 * 3456 * |--------- logged extent ---------| 3457 * |----- ordered extent ----| 3458 * 3459 * Just don't mess with mod_start and mod_len, we'll 3460 * just end up logging more csums than we need and it 3461 * will be ok. 3462 */ 3463 } else { 3464 if (ordered->file_offset + ordered->len < 3465 mod_start + mod_len) { 3466 mod_len = (mod_start + mod_len) - 3467 (ordered->file_offset + ordered->len); 3468 mod_start = ordered->file_offset + 3469 ordered->len; 3470 } else { 3471 mod_len = 0; 3472 } 3473 } 3474 3475 /* 3476 * To keep us from looping for the above case of an ordered 3477 * extent that falls inside of the logged extent. 3478 */ 3479 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3480 &ordered->flags)) 3481 continue; 3482 atomic_inc(&ordered->refs); 3483 spin_unlock_irq(&log->log_extents_lock[index]); 3484 /* 3485 * we've dropped the lock, we must either break or 3486 * start over after this. 3487 */ 3488 3489 wait_event(ordered->wait, ordered->csum_bytes_left == 0); 3490 3491 list_for_each_entry(sum, &ordered->list, list) { 3492 ret = btrfs_csum_file_blocks(trans, log, sum); 3493 if (ret) { 3494 btrfs_put_ordered_extent(ordered); 3495 goto unlocked; 3496 } 3497 } 3498 btrfs_put_ordered_extent(ordered); 3499 goto again; 3500 3501 } 3502 spin_unlock_irq(&log->log_extents_lock[index]); 3503 unlocked: 3504 3505 if (!mod_len || ret) 3506 return ret; 3507 3508 csum_offset = mod_start - em->start; 3509 csum_len = mod_len; 3510 3511 /* block start is already adjusted for the file extent offset. 
*/ 3512 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3513 em->block_start + csum_offset, 3514 em->block_start + csum_offset + 3515 csum_len - 1, &ordered_sums, 0); 3516 if (ret) 3517 return ret; 3518 3519 while (!list_empty(&ordered_sums)) { 3520 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3521 struct btrfs_ordered_sum, 3522 list); 3523 if (!ret) 3524 ret = btrfs_csum_file_blocks(trans, log, sums); 3525 list_del(&sums->list); 3526 kfree(sums); 3527 } 3528 3529 return ret; 3530 } 3531 3532 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3533 struct btrfs_root *root, 3534 struct inode *inode, 3535 struct btrfs_path *path) 3536 { 3537 struct extent_map *em, *n; 3538 struct list_head extents; 3539 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3540 u64 test_gen; 3541 int ret = 0; 3542 int num = 0; 3543 3544 INIT_LIST_HEAD(&extents); 3545 3546 write_lock(&tree->lock); 3547 test_gen = root->fs_info->last_trans_committed; 3548 3549 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3550 list_del_init(&em->list); 3551 3552 /* 3553 * Just an arbitrary number, this can be really CPU intensive 3554 * once we start getting a lot of extents, and really once we 3555 * have a bunch of extents we just want to commit since it will 3556 * be faster. 3557 */ 3558 if (++num > 32768) { 3559 list_del_init(&tree->modified_extents); 3560 ret = -EFBIG; 3561 goto process; 3562 } 3563 3564 if (em->generation <= test_gen) 3565 continue; 3566 /* Need a ref to keep it from getting evicted from cache */ 3567 atomic_inc(&em->refs); 3568 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3569 list_add_tail(&em->list, &extents); 3570 num++; 3571 } 3572 3573 list_sort(NULL, &extents, extent_cmp); 3574 3575 process: 3576 while (!list_empty(&extents)) { 3577 em = list_entry(extents.next, struct extent_map, list); 3578 3579 list_del_init(&em->list); 3580 3581 /* 3582 * If we had an error we just need to delete everybody from our 3583 * private list. 3584 */ 3585 if (ret) { 3586 clear_em_logging(tree, em); 3587 free_extent_map(em); 3588 continue; 3589 } 3590 3591 write_unlock(&tree->lock); 3592 3593 ret = log_one_extent(trans, inode, root, em, path); 3594 write_lock(&tree->lock); 3595 clear_em_logging(tree, em); 3596 free_extent_map(em); 3597 } 3598 WARN_ON(!list_empty(&extents)); 3599 write_unlock(&tree->lock); 3600 3601 btrfs_release_path(path); 3602 return ret; 3603 } 3604 3605 /* log a single inode in the tree log. 3606 * At least one parent directory for this inode must exist in the tree 3607 * or be logged already. 3608 * 3609 * Any items from this inode changed by the current transaction are copied 3610 * to the log tree. An extra reference is taken on any extents in this 3611 * file, allowing us to avoid a whole pile of corner cases around logging 3612 * blocks that have been removed from the tree. 3613 * 3614 * See LOG_INODE_ALL and related defines for a description of what inode_only 3615 * does. 3616 * 3617 * This handles both files and directories. 
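 *
 * (Illustrative: with inode_only == LOG_INODE_EXISTS the copy is capped
 * at BTRFS_XATTR_ITEM_KEY and fill_inode_item() logs generation 0, just
 * enough to recreate the inode; LOG_INODE_ALL copies every key type the
 * inode owns.)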
3618 */ 3619 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3620 struct btrfs_root *root, struct inode *inode, 3621 int inode_only) 3622 { 3623 struct btrfs_path *path; 3624 struct btrfs_path *dst_path; 3625 struct btrfs_key min_key; 3626 struct btrfs_key max_key; 3627 struct btrfs_root *log = root->log_root; 3628 struct extent_buffer *src = NULL; 3629 int err = 0; 3630 int ret; 3631 int nritems; 3632 int ins_start_slot = 0; 3633 int ins_nr; 3634 bool fast_search = false; 3635 u64 ino = btrfs_ino(inode); 3636 3637 path = btrfs_alloc_path(); 3638 if (!path) 3639 return -ENOMEM; 3640 dst_path = btrfs_alloc_path(); 3641 if (!dst_path) { 3642 btrfs_free_path(path); 3643 return -ENOMEM; 3644 } 3645 3646 min_key.objectid = ino; 3647 min_key.type = BTRFS_INODE_ITEM_KEY; 3648 min_key.offset = 0; 3649 3650 max_key.objectid = ino; 3651 3652 3653 /* today the code can only do partial logging of directories */ 3654 if (S_ISDIR(inode->i_mode) || 3655 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3656 &BTRFS_I(inode)->runtime_flags) && 3657 inode_only == LOG_INODE_EXISTS)) 3658 max_key.type = BTRFS_XATTR_ITEM_KEY; 3659 else 3660 max_key.type = (u8)-1; 3661 max_key.offset = (u64)-1; 3662 3663 /* Only run delayed items if we are a dir or a new file */ 3664 if (S_ISDIR(inode->i_mode) || 3665 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { 3666 ret = btrfs_commit_inode_delayed_items(trans, inode); 3667 if (ret) { 3668 btrfs_free_path(path); 3669 btrfs_free_path(dst_path); 3670 return ret; 3671 } 3672 } 3673 3674 mutex_lock(&BTRFS_I(inode)->log_mutex); 3675 3676 btrfs_get_logged_extents(log, inode); 3677 3678 /* 3679 * a brute force approach to making sure we get the most uptodate 3680 * copies of everything. 3681 */ 3682 if (S_ISDIR(inode->i_mode)) { 3683 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 3684 3685 if (inode_only == LOG_INODE_EXISTS) 3686 max_key_type = BTRFS_XATTR_ITEM_KEY; 3687 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3688 } else { 3689 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3690 &BTRFS_I(inode)->runtime_flags)) { 3691 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3692 &BTRFS_I(inode)->runtime_flags); 3693 ret = btrfs_truncate_inode_items(trans, log, 3694 inode, 0, 0); 3695 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3696 &BTRFS_I(inode)->runtime_flags)) { 3697 if (inode_only == LOG_INODE_ALL) 3698 fast_search = true; 3699 max_key.type = BTRFS_XATTR_ITEM_KEY; 3700 ret = drop_objectid_items(trans, log, path, ino, 3701 max_key.type); 3702 } else { 3703 if (inode_only == LOG_INODE_ALL) 3704 fast_search = true; 3705 ret = log_inode_item(trans, log, dst_path, inode); 3706 if (ret) { 3707 err = ret; 3708 goto out_unlock; 3709 } 3710 goto log_extents; 3711 } 3712 3713 } 3714 if (ret) { 3715 err = ret; 3716 goto out_unlock; 3717 } 3718 path->keep_locks = 1; 3719 3720 while (1) { 3721 ins_nr = 0; 3722 ret = btrfs_search_forward(root, &min_key, &max_key, 3723 path, trans->transid); 3724 if (ret != 0) 3725 break; 3726 again: 3727 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 3728 if (min_key.objectid != ino) 3729 break; 3730 if (min_key.type > max_key.type) 3731 break; 3732 3733 src = path->nodes[0]; 3734 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 3735 ins_nr++; 3736 goto next_slot; 3737 } else if (!ins_nr) { 3738 ins_start_slot = path->slots[0]; 3739 ins_nr = 1; 3740 goto next_slot; 3741 } 3742 3743 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3744 ins_nr, inode_only); 3745 if (ret) { 
			err = ret;
			goto out_unlock;
		}
		ins_nr = 1;
		ins_start_slot = path->slots[0];
next_slot:

		nritems = btrfs_header_nritems(path->nodes[0]);
		path->slots[0]++;
		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
					      path->slots[0]);
			goto again;
		}
		if (ins_nr) {
			ret = copy_items(trans, inode, dst_path, src,
					 ins_start_slot,
					 ins_nr, inode_only);
			if (ret) {
				err = ret;
				goto out_unlock;
			}
			ins_nr = 0;
		}
		btrfs_release_path(path);

		if (min_key.offset < (u64)-1)
			min_key.offset++;
		else if (min_key.type < (u8)-1)
			min_key.type++;
		else if (min_key.objectid < (u64)-1)
			min_key.objectid++;
		else
			break;
	}
	if (ins_nr) {
		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
				 ins_nr, inode_only);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
		ins_nr = 0;
	}

log_extents:
	btrfs_release_path(path);
	btrfs_release_path(dst_path);
	if (fast_search) {
		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	} else {
		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
		struct extent_map *em, *n;

		write_lock(&tree->lock);
		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
			list_del_init(&em->list);
		write_unlock(&tree->lock);
	}

	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
		ret = log_directory_changes(trans, root, inode, path, dst_path);
		if (ret) {
			err = ret;
			goto out_unlock;
		}
	}
	BTRFS_I(inode)->logged_trans = trans->transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
	if (err)
		btrfs_free_logged_extents(log, log->log_transid);
	mutex_unlock(&BTRFS_I(inode)->log_mutex);

	btrfs_free_path(path);
	btrfs_free_path(dst_path);
	return err;
}

/*
 * follow the dentry parent pointers up the chain and see if any
 * of the directories in it require a full commit before they can
 * be logged. Returns zero if nothing special needs to be done or 1 if
 * a full commit is required.
 */
static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
					       struct inode *inode,
					       struct dentry *parent,
					       struct super_block *sb,
					       u64 last_committed)
{
	int ret = 0;
	struct btrfs_root *root;
	struct dentry *old_parent = NULL;
	struct inode *orig_inode = inode;

	/*
	 * for regular files, if its inode is already on disk, we don't
	 * have to worry about the parents at all. This is because
	 * we can use the last_unlink_trans field to record renames
	 * and other fun in this file.
	 */
	if (S_ISREG(inode->i_mode) &&
	    BTRFS_I(inode)->generation <= last_committed &&
	    BTRFS_I(inode)->last_unlink_trans <= last_committed)
		goto out;

	if (!S_ISDIR(inode->i_mode)) {
		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
			goto out;
		inode = parent->d_inode;
	}

	while (1) {
		/*
		 * If we are logging a directory then we start with our inode,
		 * not our parent's inode, so we need to skip setting the
		 * logged_trans so that further down in the log code we don't
		 * think this inode has already been logged.
3869 */ 3870 if (inode != orig_inode) 3871 BTRFS_I(inode)->logged_trans = trans->transid; 3872 smp_mb(); 3873 3874 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 3875 root = BTRFS_I(inode)->root; 3876 3877 /* 3878 * make sure any commits to the log are forced 3879 * to be full commits 3880 */ 3881 root->fs_info->last_trans_log_full_commit = 3882 trans->transid; 3883 ret = 1; 3884 break; 3885 } 3886 3887 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3888 break; 3889 3890 if (IS_ROOT(parent)) 3891 break; 3892 3893 parent = dget_parent(parent); 3894 dput(old_parent); 3895 old_parent = parent; 3896 inode = parent->d_inode; 3897 3898 } 3899 dput(old_parent); 3900 out: 3901 return ret; 3902 } 3903 3904 /* 3905 * helper function around btrfs_log_inode to make sure newly created 3906 * parent directories also end up in the log. A minimal inode and backref 3907 * only logging is done of any parent directories that are older than 3908 * the last committed transaction 3909 */ 3910 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 3911 struct btrfs_root *root, struct inode *inode, 3912 struct dentry *parent, int exists_only) 3913 { 3914 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 3915 struct super_block *sb; 3916 struct dentry *old_parent = NULL; 3917 int ret = 0; 3918 u64 last_committed = root->fs_info->last_trans_committed; 3919 3920 sb = inode->i_sb; 3921 3922 if (btrfs_test_opt(root, NOTREELOG)) { 3923 ret = 1; 3924 goto end_no_trans; 3925 } 3926 3927 if (root->fs_info->last_trans_log_full_commit > 3928 root->fs_info->last_trans_committed) { 3929 ret = 1; 3930 goto end_no_trans; 3931 } 3932 3933 if (root != BTRFS_I(inode)->root || 3934 btrfs_root_refs(&root->root_item) == 0) { 3935 ret = 1; 3936 goto end_no_trans; 3937 } 3938 3939 ret = check_parent_dirs_for_sync(trans, inode, parent, 3940 sb, last_committed); 3941 if (ret) 3942 goto end_no_trans; 3943 3944 if (btrfs_inode_in_log(inode, trans->transid)) { 3945 ret = BTRFS_NO_LOG_SYNC; 3946 goto end_no_trans; 3947 } 3948 3949 ret = start_log_trans(trans, root); 3950 if (ret) 3951 goto end_trans; 3952 3953 ret = btrfs_log_inode(trans, root, inode, inode_only); 3954 if (ret) 3955 goto end_trans; 3956 3957 /* 3958 * for regular files, if its inode is already on disk, we don't 3959 * have to worry about the parents at all. This is because 3960 * we can use the last_unlink_trans field to record renames 3961 * and other fun in this file. 
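	 *
	 * (Illustrative: an fsync of a regular file whose inode was
	 * committed in an earlier transaction and has not been unlinked or
	 * renamed since can stop right here; the parent directory walk
	 * below is only needed for new inodes and for names that changed
	 * in this transaction.)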

/*
 * it is not safe to log a dentry if the chunk root has added new
 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
 * If this returns 1, you must commit the transaction to safely get your
 * data on disk.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry)
{
	struct dentry *parent = dget_parent(dentry);
	int ret;

	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
	dput(parent);

	return ret;
}
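
/*
 * A minimal sketch of how a caller is expected to consume the return
 * values above, loosely modeled on the fsync path.  example_sync_dentry
 * is hypothetical and compiled out; it exists only to illustrate the
 * contract (details such as waiting on ordered extents are omitted):
 */
#if 0
static int example_sync_dentry(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct dentry *dentry)
{
	int ret = btrfs_log_dentry_safe(trans, root, dentry);

	if (ret == 0) {
		/* logged: syncing the log tree is enough */
		if (btrfs_sync_log(trans, root) == 0)
			return btrfs_end_transaction(trans, root);
		/* the log sync failed, fall back to a full commit */
		return btrfs_commit_transaction(trans, root);
	}
	if (ret == BTRFS_NO_LOG_SYNC)
		/* everything was already in the log for this transid */
		return btrfs_end_transaction(trans, root);
	/* ret == 1: not safe to log, a full commit is required */
	return btrfs_commit_transaction(trans, root);
}
#endif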

/*
 * should be called during mount to replay any log trees
 * from the FS
 */
int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key tmp_key;
	struct btrfs_root *log;
	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
	struct walk_control wc = {
		.process_func = process_one_buffer,
		.stage = 0,
	};

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	fs_info->log_root_recovering = 1;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto error;
	}

	wc.trans = trans;
	wc.pin = 1;

	ret = walk_log_tree(trans, log_root_tree, &wc);
	if (ret) {
		btrfs_error(fs_info, ret, "Failed to pin buffers while "
			    "recovering log root tree.");
		goto error;
	}

again:
	key.objectid = BTRFS_TREE_LOG_OBJECTID;
	key.offset = (u64)-1;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	while (1) {
		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);

		if (ret < 0) {
			btrfs_error(fs_info, ret,
				    "Couldn't find tree log root.");
			goto error;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		btrfs_release_path(path);
		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
			break;

		log = btrfs_read_fs_root(log_root_tree, &found_key);
		if (IS_ERR(log)) {
			ret = PTR_ERR(log);
			btrfs_error(fs_info, ret,
				    "Couldn't read tree log root.");
			goto error;
		}

		/* the log root item's offset names the subvolume it logs */
		tmp_key.objectid = found_key.offset;
		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
		tmp_key.offset = (u64)-1;

		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
		if (IS_ERR(wc.replay_dest)) {
			ret = PTR_ERR(wc.replay_dest);
			free_extent_buffer(log->node);
			free_extent_buffer(log->commit_root);
			kfree(log);
			btrfs_error(fs_info, ret, "Couldn't read target root "
				    "for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		key.offset = found_key.offset - 1;
		wc.replay_dest->log_root = NULL;
		free_extent_buffer(log->node);
		free_extent_buffer(log->commit_root);
		kfree(log);

		if (ret)
			goto error;

		if (found_key.offset == 0)
			break;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
	if (ret)
		return ret;

	free_extent_buffer(log_root_tree->node);
	log_root_tree->log_root = NULL;
	fs_info->log_root_recovering = 0;
	kfree(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans, fs_info->tree_root);
	btrfs_free_path(path);
	return ret;
}
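
/*
 * Note: the expected caller of btrfs_recover_log_trees() is the mount
 * path (open_ctree() in disk-io.c), which invokes it when the
 * superblock still records a log root after an unclean shutdown.
 * Pinning on the first pass keeps the allocator from handing out
 * blocks the log trees still reference while replay is rewriting the
 * subvolume trees.
 */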

/*
 * there are some corner cases where we want to force a full
 * commit instead of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and
 * this function updates the parent directory so that a full commit is
 * properly done if it is fsync'd later after the unlinks are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     int for_rename)
{
	/*
	 * when we're logging a file, if it hasn't been renamed
	 * or unlinked, and its inode is fully committed on disk,
	 * we don't have to worry about walking up the directory chain
	 * to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid
	 * into the file.  When the file is logged we check it and
	 * don't log the parents if the file is fully on disk.
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this directory was already logged any new
	 * names for this file/dir will get recorded
	 */
	smp_mb();
	if (BTRFS_I(dir)->logged_trans == trans->transid)
		return;

	/*
	 * if the inode we're about to unlink was logged,
	 * the log will be properly updated for any new names
	 */
	if (BTRFS_I(inode)->logged_trans == trans->transid)
		return;

	/*
	 * when renaming files across directories, if the directory
	 * we're unlinking from gets fsync'd later on, there's
	 * no way to find the destination directory later and fsync it
	 * properly.  So, we have to be conservative and force commits
	 * so the new name gets discovered.
	 */
	if (for_rename)
		goto record;

	/* we can safely do the unlink without any special recording */
	return;

record:
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
}

/*
 * Call this after adding a new name for a file and it will properly
 * update the log to reflect the new name.
 *
 * It will return zero if all goes well, and it will return 1 if a
 * full transaction commit is required.
 */
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct inode *inode, struct inode *old_dir,
		       struct dentry *parent)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/*
	 * this will force the logging code to walk the dentry chain
	 * up for the file
	 */
	if (S_ISREG(inode->i_mode))
		BTRFS_I(inode)->last_unlink_trans = trans->transid;

	/*
	 * if this inode hasn't been logged and the directory we're
	 * renaming it from hasn't been logged, we don't need to log it
	 */
	if (BTRFS_I(inode)->logged_trans <=
	    root->fs_info->last_trans_committed &&
	    (!old_dir || BTRFS_I(old_dir)->logged_trans <=
		    root->fs_info->last_trans_committed))
		return 0;

	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
}
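
/*
 * A hedged sketch of how the two helpers above cooperate during a
 * rename, loosely modeled on btrfs_rename() in inode.c (simplified;
 * the surrounding locking and error handling are omitted):
 *
 *	btrfs_record_unlink_dir(trans, old_dir, inode, 1);
 *	... remove the old name, insert the new one ...
 *	btrfs_log_new_name(trans, inode, old_dir, new_parent);
 *
 * btrfs_record_unlink_dir() stamps last_unlink_trans so that a later
 * fsync of old_dir is forced into a full commit, while
 * btrfs_log_new_name() makes sure an inode that is already in the log
 * picks up its new name instead of losing it during replay.
 */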