1 /* 2 * Copyright (C) 2008 Oracle. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/sched.h> 20 #include <linux/slab.h> 21 #include <linux/blkdev.h> 22 #include <linux/list_sort.h> 23 #include "ctree.h" 24 #include "transaction.h" 25 #include "disk-io.h" 26 #include "locking.h" 27 #include "print-tree.h" 28 #include "backref.h" 29 #include "tree-log.h" 30 #include "hash.h" 31 32 /* magic values for the inode_only field in btrfs_log_inode: 33 * 34 * LOG_INODE_ALL means to log everything 35 * LOG_INODE_EXISTS means to log just enough to recreate the inode 36 * during log replay 37 */ 38 #define LOG_INODE_ALL 0 39 #define LOG_INODE_EXISTS 1 40 41 /* 42 * directory trouble cases 43 * 44 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 45 * log, we must force a full commit before doing an fsync of the directory 46 * where the unlink was done. 47 * ---> record transid of last unlink/rename per directory 48 * 49 * mkdir foo/some_dir 50 * normal commit 51 * rename foo/some_dir foo2/some_dir 52 * mkdir foo/some_dir 53 * fsync foo/some_dir/some_file 54 * 55 * The fsync above will unlink the original some_dir without recording 56 * it in its new location (foo2). After a crash, some_dir will be gone 57 * unless the fsync of some_file forces a full commit 58 * 59 * 2) we must log any new names for any file or dir that is in the fsync 60 * log. ---> check inode while renaming/linking. 61 * 62 * 2a) we must log any new names for any file or dir during rename 63 * when the directory they are being removed from was logged. 64 * ---> check inode and old parent dir during rename 65 * 66 * 2a is actually the more important variant. With the extra logging 67 * a crash might unlink the old name without recreating the new one 68 * 69 * 3) after a crash, we must go through any directories with a link count 70 * of zero and redo the rm -rf 71 * 72 * mkdir f1/foo 73 * normal commit 74 * rm -rf f1/foo 75 * fsync(f1) 76 * 77 * The directory f1 was fully removed from the FS, but fsync was never 78 * called on f1, only its parent dir. After a crash the rm -rf must 79 * be replayed. This must be able to recurse down the entire 80 * directory tree. The inode link count fixup code takes care of the 81 * ugly details. 82 */ 83 84 /* 85 * stages for the tree walking. The first 86 * stage (0) is to only pin down the blocks we find 87 * the second stage (1) is to make sure that all the inodes 88 * we find in the log are created in the subvolume. 89 * 90 * The last stage is to deal with directories and links and extents 91 * and all the other fun semantics 92 */ 93 #define LOG_WALK_PIN_ONLY 0 94 #define LOG_WALK_REPLAY_INODES 1 95 #define LOG_WALK_REPLAY_DIR_INDEX 2 96 #define LOG_WALK_REPLAY_ALL 3 97 98 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 99 struct btrfs_root *root, struct inode *inode, 100 int inode_only); 101 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 102 struct btrfs_root *root, 103 struct btrfs_path *path, u64 objectid); 104 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 105 struct btrfs_root *root, 106 struct btrfs_root *log, 107 struct btrfs_path *path, 108 u64 dirid, int del_all); 109 110 /* 111 * tree logging is a special write ahead log used to make sure that 112 * fsyncs and O_SYNCs can happen without doing full tree commits. 113 * 114 * Full tree commits are expensive because they require commonly 115 * modified blocks to be recowed, creating many dirty pages in the 116 * extent tree an 4x-6x higher write load than ext3. 117 * 118 * Instead of doing a tree commit on every fsync, we use the 119 * key ranges and transaction ids to find items for a given file or directory 120 * that have changed in this transaction. Those items are copied into 121 * a special tree (one per subvolume root), that tree is written to disk 122 * and then the fsync is considered complete. 123 * 124 * After a crash, items are copied out of the log-tree back into the 125 * subvolume tree. Any file data extents found are recorded in the extent 126 * allocation tree, and the log-tree freed. 127 * 128 * The log tree is read three times, once to pin down all the extents it is 129 * using in ram and once, once to create all the inodes logged in the tree 130 * and once to do all the other items. 131 */ 132 133 /* 134 * start a sub transaction and setup the log tree 135 * this increments the log tree writer count to make the people 136 * syncing the tree wait for us to finish 137 */ 138 static int start_log_trans(struct btrfs_trans_handle *trans, 139 struct btrfs_root *root) 140 { 141 int ret; 142 int err = 0; 143 144 mutex_lock(&root->log_mutex); 145 if (root->log_root) { 146 if (!root->log_start_pid) { 147 root->log_start_pid = current->pid; 148 root->log_multiple_pids = false; 149 } else if (root->log_start_pid != current->pid) { 150 root->log_multiple_pids = true; 151 } 152 153 atomic_inc(&root->log_batch); 154 atomic_inc(&root->log_writers); 155 mutex_unlock(&root->log_mutex); 156 return 0; 157 } 158 root->log_multiple_pids = false; 159 root->log_start_pid = current->pid; 160 mutex_lock(&root->fs_info->tree_log_mutex); 161 if (!root->fs_info->log_root_tree) { 162 ret = btrfs_init_log_root_tree(trans, root->fs_info); 163 if (ret) 164 err = ret; 165 } 166 if (err == 0 && !root->log_root) { 167 ret = btrfs_add_log_tree(trans, root); 168 if (ret) 169 err = ret; 170 } 171 mutex_unlock(&root->fs_info->tree_log_mutex); 172 atomic_inc(&root->log_batch); 173 atomic_inc(&root->log_writers); 174 mutex_unlock(&root->log_mutex); 175 return err; 176 } 177 178 /* 179 * returns 0 if there was a log transaction running and we were able 180 * to join, or returns -ENOENT if there were not transactions 181 * in progress 182 */ 183 static int join_running_log_trans(struct btrfs_root *root) 184 { 185 int ret = -ENOENT; 186 187 smp_mb(); 188 if (!root->log_root) 189 return -ENOENT; 190 191 mutex_lock(&root->log_mutex); 192 if (root->log_root) { 193 ret = 0; 194 atomic_inc(&root->log_writers); 195 } 196 mutex_unlock(&root->log_mutex); 197 return ret; 198 } 199 200 /* 201 * This either makes the current running log transaction wait 202 * until you call btrfs_end_log_trans() or it makes any future 203 * log transactions wait until you call btrfs_end_log_trans() 204 */ 205 int btrfs_pin_log_trans(struct btrfs_root *root) 206 { 207 int ret = -ENOENT; 208 209 mutex_lock(&root->log_mutex); 210 atomic_inc(&root->log_writers); 211 mutex_unlock(&root->log_mutex); 212 return ret; 213 } 214 215 /* 216 * indicate we're done making changes to the log tree 217 * and wake up anyone waiting to do a sync 218 */ 219 void btrfs_end_log_trans(struct btrfs_root *root) 220 { 221 if (atomic_dec_and_test(&root->log_writers)) { 222 smp_mb(); 223 if (waitqueue_active(&root->log_writer_wait)) 224 wake_up(&root->log_writer_wait); 225 } 226 } 227 228 229 /* 230 * the walk control struct is used to pass state down the chain when 231 * processing the log tree. The stage field tells us which part 232 * of the log tree processing we are currently doing. The others 233 * are state fields used for that specific part 234 */ 235 struct walk_control { 236 /* should we free the extent on disk when done? This is used 237 * at transaction commit time while freeing a log tree 238 */ 239 int free; 240 241 /* should we write out the extent buffer? This is used 242 * while flushing the log tree to disk during a sync 243 */ 244 int write; 245 246 /* should we wait for the extent buffer io to finish? Also used 247 * while flushing the log tree to disk for a sync 248 */ 249 int wait; 250 251 /* pin only walk, we record which extents on disk belong to the 252 * log trees 253 */ 254 int pin; 255 256 /* what stage of the replay code we're currently in */ 257 int stage; 258 259 /* the root we are currently replaying */ 260 struct btrfs_root *replay_dest; 261 262 /* the trans handle for the current replay */ 263 struct btrfs_trans_handle *trans; 264 265 /* the function that gets used to process blocks we find in the 266 * tree. Note the extent_buffer might not be up to date when it is 267 * passed in, and it must be checked or read if you need the data 268 * inside it 269 */ 270 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 271 struct walk_control *wc, u64 gen); 272 }; 273 274 /* 275 * process_func used to pin down extents, write them or wait on them 276 */ 277 static int process_one_buffer(struct btrfs_root *log, 278 struct extent_buffer *eb, 279 struct walk_control *wc, u64 gen) 280 { 281 int ret = 0; 282 283 /* 284 * If this fs is mixed then we need to be able to process the leaves to 285 * pin down any logged extents, so we have to read the block. 286 */ 287 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { 288 ret = btrfs_read_buffer(eb, gen); 289 if (ret) 290 return ret; 291 } 292 293 if (wc->pin) 294 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 295 eb->start, eb->len); 296 297 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 298 if (wc->pin && btrfs_header_level(eb) == 0) 299 ret = btrfs_exclude_logged_extents(log, eb); 300 if (wc->write) 301 btrfs_write_tree_block(eb); 302 if (wc->wait) 303 btrfs_wait_tree_block_writeback(eb); 304 } 305 return ret; 306 } 307 308 /* 309 * Item overwrite used by replay and tree logging. eb, slot and key all refer 310 * to the src data we are copying out. 311 * 312 * root is the tree we are copying into, and path is a scratch 313 * path for use in this function (it should be released on entry and 314 * will be released on exit). 315 * 316 * If the key is already in the destination tree the existing item is 317 * overwritten. If the existing item isn't big enough, it is extended. 318 * If it is too large, it is truncated. 319 * 320 * If the key isn't in the destination yet, a new item is inserted. 321 */ 322 static noinline int overwrite_item(struct btrfs_trans_handle *trans, 323 struct btrfs_root *root, 324 struct btrfs_path *path, 325 struct extent_buffer *eb, int slot, 326 struct btrfs_key *key) 327 { 328 int ret; 329 u32 item_size; 330 u64 saved_i_size = 0; 331 int save_old_i_size = 0; 332 unsigned long src_ptr; 333 unsigned long dst_ptr; 334 int overwrite_root = 0; 335 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 336 337 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 338 overwrite_root = 1; 339 340 item_size = btrfs_item_size_nr(eb, slot); 341 src_ptr = btrfs_item_ptr_offset(eb, slot); 342 343 /* look for the key in the destination tree */ 344 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 345 if (ret < 0) 346 return ret; 347 348 if (ret == 0) { 349 char *src_copy; 350 char *dst_copy; 351 u32 dst_size = btrfs_item_size_nr(path->nodes[0], 352 path->slots[0]); 353 if (dst_size != item_size) 354 goto insert; 355 356 if (item_size == 0) { 357 btrfs_release_path(path); 358 return 0; 359 } 360 dst_copy = kmalloc(item_size, GFP_NOFS); 361 src_copy = kmalloc(item_size, GFP_NOFS); 362 if (!dst_copy || !src_copy) { 363 btrfs_release_path(path); 364 kfree(dst_copy); 365 kfree(src_copy); 366 return -ENOMEM; 367 } 368 369 read_extent_buffer(eb, src_copy, src_ptr, item_size); 370 371 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 372 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 373 item_size); 374 ret = memcmp(dst_copy, src_copy, item_size); 375 376 kfree(dst_copy); 377 kfree(src_copy); 378 /* 379 * they have the same contents, just return, this saves 380 * us from cowing blocks in the destination tree and doing 381 * extra writes that may not have been done by a previous 382 * sync 383 */ 384 if (ret == 0) { 385 btrfs_release_path(path); 386 return 0; 387 } 388 389 /* 390 * We need to load the old nbytes into the inode so when we 391 * replay the extents we've logged we get the right nbytes. 392 */ 393 if (inode_item) { 394 struct btrfs_inode_item *item; 395 u64 nbytes; 396 u32 mode; 397 398 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 399 struct btrfs_inode_item); 400 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 401 item = btrfs_item_ptr(eb, slot, 402 struct btrfs_inode_item); 403 btrfs_set_inode_nbytes(eb, item, nbytes); 404 405 /* 406 * If this is a directory we need to reset the i_size to 407 * 0 so that we can set it up properly when replaying 408 * the rest of the items in this log. 409 */ 410 mode = btrfs_inode_mode(eb, item); 411 if (S_ISDIR(mode)) 412 btrfs_set_inode_size(eb, item, 0); 413 } 414 } else if (inode_item) { 415 struct btrfs_inode_item *item; 416 u32 mode; 417 418 /* 419 * New inode, set nbytes to 0 so that the nbytes comes out 420 * properly when we replay the extents. 421 */ 422 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 423 btrfs_set_inode_nbytes(eb, item, 0); 424 425 /* 426 * If this is a directory we need to reset the i_size to 0 so 427 * that we can set it up properly when replaying the rest of 428 * the items in this log. 429 */ 430 mode = btrfs_inode_mode(eb, item); 431 if (S_ISDIR(mode)) 432 btrfs_set_inode_size(eb, item, 0); 433 } 434 insert: 435 btrfs_release_path(path); 436 /* try to insert the key into the destination tree */ 437 ret = btrfs_insert_empty_item(trans, root, path, 438 key, item_size); 439 440 /* make sure any existing item is the correct size */ 441 if (ret == -EEXIST) { 442 u32 found_size; 443 found_size = btrfs_item_size_nr(path->nodes[0], 444 path->slots[0]); 445 if (found_size > item_size) 446 btrfs_truncate_item(root, path, item_size, 1); 447 else if (found_size < item_size) 448 btrfs_extend_item(root, path, 449 item_size - found_size); 450 } else if (ret) { 451 return ret; 452 } 453 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 454 path->slots[0]); 455 456 /* don't overwrite an existing inode if the generation number 457 * was logged as zero. This is done when the tree logging code 458 * is just logging an inode to make sure it exists after recovery. 459 * 460 * Also, don't overwrite i_size on directories during replay. 461 * log replay inserts and removes directory items based on the 462 * state of the tree found in the subvolume, and i_size is modified 463 * as it goes 464 */ 465 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 466 struct btrfs_inode_item *src_item; 467 struct btrfs_inode_item *dst_item; 468 469 src_item = (struct btrfs_inode_item *)src_ptr; 470 dst_item = (struct btrfs_inode_item *)dst_ptr; 471 472 if (btrfs_inode_generation(eb, src_item) == 0) 473 goto no_copy; 474 475 if (overwrite_root && 476 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 477 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 478 save_old_i_size = 1; 479 saved_i_size = btrfs_inode_size(path->nodes[0], 480 dst_item); 481 } 482 } 483 484 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 485 src_ptr, item_size); 486 487 if (save_old_i_size) { 488 struct btrfs_inode_item *dst_item; 489 dst_item = (struct btrfs_inode_item *)dst_ptr; 490 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 491 } 492 493 /* make sure the generation is filled in */ 494 if (key->type == BTRFS_INODE_ITEM_KEY) { 495 struct btrfs_inode_item *dst_item; 496 dst_item = (struct btrfs_inode_item *)dst_ptr; 497 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 498 btrfs_set_inode_generation(path->nodes[0], dst_item, 499 trans->transid); 500 } 501 } 502 no_copy: 503 btrfs_mark_buffer_dirty(path->nodes[0]); 504 btrfs_release_path(path); 505 return 0; 506 } 507 508 /* 509 * simple helper to read an inode off the disk from a given root 510 * This can only be called for subvolume roots and not for the log 511 */ 512 static noinline struct inode *read_one_inode(struct btrfs_root *root, 513 u64 objectid) 514 { 515 struct btrfs_key key; 516 struct inode *inode; 517 518 key.objectid = objectid; 519 key.type = BTRFS_INODE_ITEM_KEY; 520 key.offset = 0; 521 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); 522 if (IS_ERR(inode)) { 523 inode = NULL; 524 } else if (is_bad_inode(inode)) { 525 iput(inode); 526 inode = NULL; 527 } 528 return inode; 529 } 530 531 /* replays a single extent in 'eb' at 'slot' with 'key' into the 532 * subvolume 'root'. path is released on entry and should be released 533 * on exit. 534 * 535 * extents in the log tree have not been allocated out of the extent 536 * tree yet. So, this completes the allocation, taking a reference 537 * as required if the extent already exists or creating a new extent 538 * if it isn't in the extent allocation tree yet. 539 * 540 * The extent is inserted into the file, dropping any existing extents 541 * from the file that overlap the new one. 542 */ 543 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 544 struct btrfs_root *root, 545 struct btrfs_path *path, 546 struct extent_buffer *eb, int slot, 547 struct btrfs_key *key) 548 { 549 int found_type; 550 u64 extent_end; 551 u64 start = key->offset; 552 u64 nbytes = 0; 553 struct btrfs_file_extent_item *item; 554 struct inode *inode = NULL; 555 unsigned long size; 556 int ret = 0; 557 558 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 559 found_type = btrfs_file_extent_type(eb, item); 560 561 if (found_type == BTRFS_FILE_EXTENT_REG || 562 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 563 nbytes = btrfs_file_extent_num_bytes(eb, item); 564 extent_end = start + nbytes; 565 566 /* 567 * We don't add to the inodes nbytes if we are prealloc or a 568 * hole. 569 */ 570 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 571 nbytes = 0; 572 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 573 size = btrfs_file_extent_inline_len(eb, item); 574 nbytes = btrfs_file_extent_ram_bytes(eb, item); 575 extent_end = ALIGN(start + size, root->sectorsize); 576 } else { 577 ret = 0; 578 goto out; 579 } 580 581 inode = read_one_inode(root, key->objectid); 582 if (!inode) { 583 ret = -EIO; 584 goto out; 585 } 586 587 /* 588 * first check to see if we already have this extent in the 589 * file. This must be done before the btrfs_drop_extents run 590 * so we don't try to drop this extent. 591 */ 592 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), 593 start, 0); 594 595 if (ret == 0 && 596 (found_type == BTRFS_FILE_EXTENT_REG || 597 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 598 struct btrfs_file_extent_item cmp1; 599 struct btrfs_file_extent_item cmp2; 600 struct btrfs_file_extent_item *existing; 601 struct extent_buffer *leaf; 602 603 leaf = path->nodes[0]; 604 existing = btrfs_item_ptr(leaf, path->slots[0], 605 struct btrfs_file_extent_item); 606 607 read_extent_buffer(eb, &cmp1, (unsigned long)item, 608 sizeof(cmp1)); 609 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 610 sizeof(cmp2)); 611 612 /* 613 * we already have a pointer to this exact extent, 614 * we don't have to do anything 615 */ 616 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 617 btrfs_release_path(path); 618 goto out; 619 } 620 } 621 btrfs_release_path(path); 622 623 /* drop any overlapping extents */ 624 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); 625 if (ret) 626 goto out; 627 628 if (found_type == BTRFS_FILE_EXTENT_REG || 629 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 630 u64 offset; 631 unsigned long dest_offset; 632 struct btrfs_key ins; 633 634 ret = btrfs_insert_empty_item(trans, root, path, key, 635 sizeof(*item)); 636 if (ret) 637 goto out; 638 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 639 path->slots[0]); 640 copy_extent_buffer(path->nodes[0], eb, dest_offset, 641 (unsigned long)item, sizeof(*item)); 642 643 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 644 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 645 ins.type = BTRFS_EXTENT_ITEM_KEY; 646 offset = key->offset - btrfs_file_extent_offset(eb, item); 647 648 if (ins.objectid > 0) { 649 u64 csum_start; 650 u64 csum_end; 651 LIST_HEAD(ordered_sums); 652 /* 653 * is this extent already allocated in the extent 654 * allocation tree? If so, just add a reference 655 */ 656 ret = btrfs_lookup_extent(root, ins.objectid, 657 ins.offset); 658 if (ret == 0) { 659 ret = btrfs_inc_extent_ref(trans, root, 660 ins.objectid, ins.offset, 661 0, root->root_key.objectid, 662 key->objectid, offset, 0); 663 if (ret) 664 goto out; 665 } else { 666 /* 667 * insert the extent pointer in the extent 668 * allocation tree 669 */ 670 ret = btrfs_alloc_logged_file_extent(trans, 671 root, root->root_key.objectid, 672 key->objectid, offset, &ins); 673 if (ret) 674 goto out; 675 } 676 btrfs_release_path(path); 677 678 if (btrfs_file_extent_compression(eb, item)) { 679 csum_start = ins.objectid; 680 csum_end = csum_start + ins.offset; 681 } else { 682 csum_start = ins.objectid + 683 btrfs_file_extent_offset(eb, item); 684 csum_end = csum_start + 685 btrfs_file_extent_num_bytes(eb, item); 686 } 687 688 ret = btrfs_lookup_csums_range(root->log_root, 689 csum_start, csum_end - 1, 690 &ordered_sums, 0); 691 if (ret) 692 goto out; 693 while (!list_empty(&ordered_sums)) { 694 struct btrfs_ordered_sum *sums; 695 sums = list_entry(ordered_sums.next, 696 struct btrfs_ordered_sum, 697 list); 698 if (!ret) 699 ret = btrfs_csum_file_blocks(trans, 700 root->fs_info->csum_root, 701 sums); 702 list_del(&sums->list); 703 kfree(sums); 704 } 705 if (ret) 706 goto out; 707 } else { 708 btrfs_release_path(path); 709 } 710 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 711 /* inline extents are easy, we just overwrite them */ 712 ret = overwrite_item(trans, root, path, eb, slot, key); 713 if (ret) 714 goto out; 715 } 716 717 inode_add_bytes(inode, nbytes); 718 ret = btrfs_update_inode(trans, root, inode); 719 out: 720 if (inode) 721 iput(inode); 722 return ret; 723 } 724 725 /* 726 * when cleaning up conflicts between the directory names in the 727 * subvolume, directory names in the log and directory names in the 728 * inode back references, we may have to unlink inodes from directories. 729 * 730 * This is a helper function to do the unlink of a specific directory 731 * item 732 */ 733 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 734 struct btrfs_root *root, 735 struct btrfs_path *path, 736 struct inode *dir, 737 struct btrfs_dir_item *di) 738 { 739 struct inode *inode; 740 char *name; 741 int name_len; 742 struct extent_buffer *leaf; 743 struct btrfs_key location; 744 int ret; 745 746 leaf = path->nodes[0]; 747 748 btrfs_dir_item_key_to_cpu(leaf, di, &location); 749 name_len = btrfs_dir_name_len(leaf, di); 750 name = kmalloc(name_len, GFP_NOFS); 751 if (!name) 752 return -ENOMEM; 753 754 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 755 btrfs_release_path(path); 756 757 inode = read_one_inode(root, location.objectid); 758 if (!inode) { 759 ret = -EIO; 760 goto out; 761 } 762 763 ret = link_to_fixup_dir(trans, root, path, location.objectid); 764 if (ret) 765 goto out; 766 767 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); 768 if (ret) 769 goto out; 770 else 771 ret = btrfs_run_delayed_items(trans, root); 772 out: 773 kfree(name); 774 iput(inode); 775 return ret; 776 } 777 778 /* 779 * helper function to see if a given name and sequence number found 780 * in an inode back reference are already in a directory and correctly 781 * point to this inode 782 */ 783 static noinline int inode_in_dir(struct btrfs_root *root, 784 struct btrfs_path *path, 785 u64 dirid, u64 objectid, u64 index, 786 const char *name, int name_len) 787 { 788 struct btrfs_dir_item *di; 789 struct btrfs_key location; 790 int match = 0; 791 792 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 793 index, name, name_len, 0); 794 if (di && !IS_ERR(di)) { 795 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 796 if (location.objectid != objectid) 797 goto out; 798 } else 799 goto out; 800 btrfs_release_path(path); 801 802 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 803 if (di && !IS_ERR(di)) { 804 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 805 if (location.objectid != objectid) 806 goto out; 807 } else 808 goto out; 809 match = 1; 810 out: 811 btrfs_release_path(path); 812 return match; 813 } 814 815 /* 816 * helper function to check a log tree for a named back reference in 817 * an inode. This is used to decide if a back reference that is 818 * found in the subvolume conflicts with what we find in the log. 819 * 820 * inode backreferences may have multiple refs in a single item, 821 * during replay we process one reference at a time, and we don't 822 * want to delete valid links to a file from the subvolume if that 823 * link is also in the log. 824 */ 825 static noinline int backref_in_log(struct btrfs_root *log, 826 struct btrfs_key *key, 827 u64 ref_objectid, 828 char *name, int namelen) 829 { 830 struct btrfs_path *path; 831 struct btrfs_inode_ref *ref; 832 unsigned long ptr; 833 unsigned long ptr_end; 834 unsigned long name_ptr; 835 int found_name_len; 836 int item_size; 837 int ret; 838 int match = 0; 839 840 path = btrfs_alloc_path(); 841 if (!path) 842 return -ENOMEM; 843 844 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 845 if (ret != 0) 846 goto out; 847 848 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 849 850 if (key->type == BTRFS_INODE_EXTREF_KEY) { 851 if (btrfs_find_name_in_ext_backref(path, ref_objectid, 852 name, namelen, NULL)) 853 match = 1; 854 855 goto out; 856 } 857 858 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 859 ptr_end = ptr + item_size; 860 while (ptr < ptr_end) { 861 ref = (struct btrfs_inode_ref *)ptr; 862 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); 863 if (found_name_len == namelen) { 864 name_ptr = (unsigned long)(ref + 1); 865 ret = memcmp_extent_buffer(path->nodes[0], name, 866 name_ptr, namelen); 867 if (ret == 0) { 868 match = 1; 869 goto out; 870 } 871 } 872 ptr = (unsigned long)(ref + 1) + found_name_len; 873 } 874 out: 875 btrfs_free_path(path); 876 return match; 877 } 878 879 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 880 struct btrfs_root *root, 881 struct btrfs_path *path, 882 struct btrfs_root *log_root, 883 struct inode *dir, struct inode *inode, 884 struct extent_buffer *eb, 885 u64 inode_objectid, u64 parent_objectid, 886 u64 ref_index, char *name, int namelen, 887 int *search_done) 888 { 889 int ret; 890 char *victim_name; 891 int victim_name_len; 892 struct extent_buffer *leaf; 893 struct btrfs_dir_item *di; 894 struct btrfs_key search_key; 895 struct btrfs_inode_extref *extref; 896 897 again: 898 /* Search old style refs */ 899 search_key.objectid = inode_objectid; 900 search_key.type = BTRFS_INODE_REF_KEY; 901 search_key.offset = parent_objectid; 902 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 903 if (ret == 0) { 904 struct btrfs_inode_ref *victim_ref; 905 unsigned long ptr; 906 unsigned long ptr_end; 907 908 leaf = path->nodes[0]; 909 910 /* are we trying to overwrite a back ref for the root directory 911 * if so, just jump out, we're done 912 */ 913 if (search_key.objectid == search_key.offset) 914 return 1; 915 916 /* check all the names in this back reference to see 917 * if they are in the log. if so, we allow them to stay 918 * otherwise they must be unlinked as a conflict 919 */ 920 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 921 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); 922 while (ptr < ptr_end) { 923 victim_ref = (struct btrfs_inode_ref *)ptr; 924 victim_name_len = btrfs_inode_ref_name_len(leaf, 925 victim_ref); 926 victim_name = kmalloc(victim_name_len, GFP_NOFS); 927 if (!victim_name) 928 return -ENOMEM; 929 930 read_extent_buffer(leaf, victim_name, 931 (unsigned long)(victim_ref + 1), 932 victim_name_len); 933 934 if (!backref_in_log(log_root, &search_key, 935 parent_objectid, 936 victim_name, 937 victim_name_len)) { 938 inc_nlink(inode); 939 btrfs_release_path(path); 940 941 ret = btrfs_unlink_inode(trans, root, dir, 942 inode, victim_name, 943 victim_name_len); 944 kfree(victim_name); 945 if (ret) 946 return ret; 947 ret = btrfs_run_delayed_items(trans, root); 948 if (ret) 949 return ret; 950 *search_done = 1; 951 goto again; 952 } 953 kfree(victim_name); 954 955 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 956 } 957 958 /* 959 * NOTE: we have searched root tree and checked the 960 * coresponding ref, it does not need to check again. 961 */ 962 *search_done = 1; 963 } 964 btrfs_release_path(path); 965 966 /* Same search but for extended refs */ 967 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 968 inode_objectid, parent_objectid, 0, 969 0); 970 if (!IS_ERR_OR_NULL(extref)) { 971 u32 item_size; 972 u32 cur_offset = 0; 973 unsigned long base; 974 struct inode *victim_parent; 975 976 leaf = path->nodes[0]; 977 978 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 979 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 980 981 while (cur_offset < item_size) { 982 extref = (struct btrfs_inode_extref *)base + cur_offset; 983 984 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 985 986 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 987 goto next; 988 989 victim_name = kmalloc(victim_name_len, GFP_NOFS); 990 if (!victim_name) 991 return -ENOMEM; 992 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 993 victim_name_len); 994 995 search_key.objectid = inode_objectid; 996 search_key.type = BTRFS_INODE_EXTREF_KEY; 997 search_key.offset = btrfs_extref_hash(parent_objectid, 998 victim_name, 999 victim_name_len); 1000 ret = 0; 1001 if (!backref_in_log(log_root, &search_key, 1002 parent_objectid, victim_name, 1003 victim_name_len)) { 1004 ret = -ENOENT; 1005 victim_parent = read_one_inode(root, 1006 parent_objectid); 1007 if (victim_parent) { 1008 inc_nlink(inode); 1009 btrfs_release_path(path); 1010 1011 ret = btrfs_unlink_inode(trans, root, 1012 victim_parent, 1013 inode, 1014 victim_name, 1015 victim_name_len); 1016 if (!ret) 1017 ret = btrfs_run_delayed_items( 1018 trans, root); 1019 } 1020 iput(victim_parent); 1021 kfree(victim_name); 1022 if (ret) 1023 return ret; 1024 *search_done = 1; 1025 goto again; 1026 } 1027 kfree(victim_name); 1028 if (ret) 1029 return ret; 1030 next: 1031 cur_offset += victim_name_len + sizeof(*extref); 1032 } 1033 *search_done = 1; 1034 } 1035 btrfs_release_path(path); 1036 1037 /* look for a conflicting sequence number */ 1038 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1039 ref_index, name, namelen, 0); 1040 if (di && !IS_ERR(di)) { 1041 ret = drop_one_dir_item(trans, root, path, dir, di); 1042 if (ret) 1043 return ret; 1044 } 1045 btrfs_release_path(path); 1046 1047 /* look for a conflicing name */ 1048 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1049 name, namelen, 0); 1050 if (di && !IS_ERR(di)) { 1051 ret = drop_one_dir_item(trans, root, path, dir, di); 1052 if (ret) 1053 return ret; 1054 } 1055 btrfs_release_path(path); 1056 1057 return 0; 1058 } 1059 1060 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1061 u32 *namelen, char **name, u64 *index, 1062 u64 *parent_objectid) 1063 { 1064 struct btrfs_inode_extref *extref; 1065 1066 extref = (struct btrfs_inode_extref *)ref_ptr; 1067 1068 *namelen = btrfs_inode_extref_name_len(eb, extref); 1069 *name = kmalloc(*namelen, GFP_NOFS); 1070 if (*name == NULL) 1071 return -ENOMEM; 1072 1073 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1074 *namelen); 1075 1076 *index = btrfs_inode_extref_index(eb, extref); 1077 if (parent_objectid) 1078 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1079 1080 return 0; 1081 } 1082 1083 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1084 u32 *namelen, char **name, u64 *index) 1085 { 1086 struct btrfs_inode_ref *ref; 1087 1088 ref = (struct btrfs_inode_ref *)ref_ptr; 1089 1090 *namelen = btrfs_inode_ref_name_len(eb, ref); 1091 *name = kmalloc(*namelen, GFP_NOFS); 1092 if (*name == NULL) 1093 return -ENOMEM; 1094 1095 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1096 1097 *index = btrfs_inode_ref_index(eb, ref); 1098 1099 return 0; 1100 } 1101 1102 /* 1103 * replay one inode back reference item found in the log tree. 1104 * eb, slot and key refer to the buffer and key found in the log tree. 1105 * root is the destination we are replaying into, and path is for temp 1106 * use by this function. (it should be released on return). 1107 */ 1108 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1109 struct btrfs_root *root, 1110 struct btrfs_root *log, 1111 struct btrfs_path *path, 1112 struct extent_buffer *eb, int slot, 1113 struct btrfs_key *key) 1114 { 1115 struct inode *dir = NULL; 1116 struct inode *inode = NULL; 1117 unsigned long ref_ptr; 1118 unsigned long ref_end; 1119 char *name = NULL; 1120 int namelen; 1121 int ret; 1122 int search_done = 0; 1123 int log_ref_ver = 0; 1124 u64 parent_objectid; 1125 u64 inode_objectid; 1126 u64 ref_index = 0; 1127 int ref_struct_size; 1128 1129 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1130 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 1131 1132 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1133 struct btrfs_inode_extref *r; 1134 1135 ref_struct_size = sizeof(struct btrfs_inode_extref); 1136 log_ref_ver = 1; 1137 r = (struct btrfs_inode_extref *)ref_ptr; 1138 parent_objectid = btrfs_inode_extref_parent(eb, r); 1139 } else { 1140 ref_struct_size = sizeof(struct btrfs_inode_ref); 1141 parent_objectid = key->offset; 1142 } 1143 inode_objectid = key->objectid; 1144 1145 /* 1146 * it is possible that we didn't log all the parent directories 1147 * for a given inode. If we don't find the dir, just don't 1148 * copy the back ref in. The link count fixup code will take 1149 * care of the rest 1150 */ 1151 dir = read_one_inode(root, parent_objectid); 1152 if (!dir) { 1153 ret = -ENOENT; 1154 goto out; 1155 } 1156 1157 inode = read_one_inode(root, inode_objectid); 1158 if (!inode) { 1159 ret = -EIO; 1160 goto out; 1161 } 1162 1163 while (ref_ptr < ref_end) { 1164 if (log_ref_ver) { 1165 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1166 &ref_index, &parent_objectid); 1167 /* 1168 * parent object can change from one array 1169 * item to another. 1170 */ 1171 if (!dir) 1172 dir = read_one_inode(root, parent_objectid); 1173 if (!dir) { 1174 ret = -ENOENT; 1175 goto out; 1176 } 1177 } else { 1178 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1179 &ref_index); 1180 } 1181 if (ret) 1182 goto out; 1183 1184 /* if we already have a perfect match, we're done */ 1185 if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), 1186 ref_index, name, namelen)) { 1187 /* 1188 * look for a conflicting back reference in the 1189 * metadata. if we find one we have to unlink that name 1190 * of the file before we add our new link. Later on, we 1191 * overwrite any existing back reference, and we don't 1192 * want to create dangling pointers in the directory. 1193 */ 1194 1195 if (!search_done) { 1196 ret = __add_inode_ref(trans, root, path, log, 1197 dir, inode, eb, 1198 inode_objectid, 1199 parent_objectid, 1200 ref_index, name, namelen, 1201 &search_done); 1202 if (ret) { 1203 if (ret == 1) 1204 ret = 0; 1205 goto out; 1206 } 1207 } 1208 1209 /* insert our name */ 1210 ret = btrfs_add_link(trans, dir, inode, name, namelen, 1211 0, ref_index); 1212 if (ret) 1213 goto out; 1214 1215 btrfs_update_inode(trans, root, inode); 1216 } 1217 1218 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1219 kfree(name); 1220 name = NULL; 1221 if (log_ref_ver) { 1222 iput(dir); 1223 dir = NULL; 1224 } 1225 } 1226 1227 /* finally write the back reference in the inode */ 1228 ret = overwrite_item(trans, root, path, eb, slot, key); 1229 out: 1230 btrfs_release_path(path); 1231 kfree(name); 1232 iput(dir); 1233 iput(inode); 1234 return ret; 1235 } 1236 1237 static int insert_orphan_item(struct btrfs_trans_handle *trans, 1238 struct btrfs_root *root, u64 offset) 1239 { 1240 int ret; 1241 ret = btrfs_find_orphan_item(root, offset); 1242 if (ret > 0) 1243 ret = btrfs_insert_orphan_item(trans, root, offset); 1244 return ret; 1245 } 1246 1247 static int count_inode_extrefs(struct btrfs_root *root, 1248 struct inode *inode, struct btrfs_path *path) 1249 { 1250 int ret = 0; 1251 int name_len; 1252 unsigned int nlink = 0; 1253 u32 item_size; 1254 u32 cur_offset = 0; 1255 u64 inode_objectid = btrfs_ino(inode); 1256 u64 offset = 0; 1257 unsigned long ptr; 1258 struct btrfs_inode_extref *extref; 1259 struct extent_buffer *leaf; 1260 1261 while (1) { 1262 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1263 &extref, &offset); 1264 if (ret) 1265 break; 1266 1267 leaf = path->nodes[0]; 1268 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1269 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1270 1271 while (cur_offset < item_size) { 1272 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1273 name_len = btrfs_inode_extref_name_len(leaf, extref); 1274 1275 nlink++; 1276 1277 cur_offset += name_len + sizeof(*extref); 1278 } 1279 1280 offset++; 1281 btrfs_release_path(path); 1282 } 1283 btrfs_release_path(path); 1284 1285 if (ret < 0) 1286 return ret; 1287 return nlink; 1288 } 1289 1290 static int count_inode_refs(struct btrfs_root *root, 1291 struct inode *inode, struct btrfs_path *path) 1292 { 1293 int ret; 1294 struct btrfs_key key; 1295 unsigned int nlink = 0; 1296 unsigned long ptr; 1297 unsigned long ptr_end; 1298 int name_len; 1299 u64 ino = btrfs_ino(inode); 1300 1301 key.objectid = ino; 1302 key.type = BTRFS_INODE_REF_KEY; 1303 key.offset = (u64)-1; 1304 1305 while (1) { 1306 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1307 if (ret < 0) 1308 break; 1309 if (ret > 0) { 1310 if (path->slots[0] == 0) 1311 break; 1312 path->slots[0]--; 1313 } 1314 process_slot: 1315 btrfs_item_key_to_cpu(path->nodes[0], &key, 1316 path->slots[0]); 1317 if (key.objectid != ino || 1318 key.type != BTRFS_INODE_REF_KEY) 1319 break; 1320 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1321 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], 1322 path->slots[0]); 1323 while (ptr < ptr_end) { 1324 struct btrfs_inode_ref *ref; 1325 1326 ref = (struct btrfs_inode_ref *)ptr; 1327 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1328 ref); 1329 ptr = (unsigned long)(ref + 1) + name_len; 1330 nlink++; 1331 } 1332 1333 if (key.offset == 0) 1334 break; 1335 if (path->slots[0] > 0) { 1336 path->slots[0]--; 1337 goto process_slot; 1338 } 1339 key.offset--; 1340 btrfs_release_path(path); 1341 } 1342 btrfs_release_path(path); 1343 1344 return nlink; 1345 } 1346 1347 /* 1348 * There are a few corners where the link count of the file can't 1349 * be properly maintained during replay. So, instead of adding 1350 * lots of complexity to the log code, we just scan the backrefs 1351 * for any file that has been through replay. 1352 * 1353 * The scan will update the link count on the inode to reflect the 1354 * number of back refs found. If it goes down to zero, the iput 1355 * will free the inode. 1356 */ 1357 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1358 struct btrfs_root *root, 1359 struct inode *inode) 1360 { 1361 struct btrfs_path *path; 1362 int ret; 1363 u64 nlink = 0; 1364 u64 ino = btrfs_ino(inode); 1365 1366 path = btrfs_alloc_path(); 1367 if (!path) 1368 return -ENOMEM; 1369 1370 ret = count_inode_refs(root, inode, path); 1371 if (ret < 0) 1372 goto out; 1373 1374 nlink = ret; 1375 1376 ret = count_inode_extrefs(root, inode, path); 1377 if (ret == -ENOENT) 1378 ret = 0; 1379 1380 if (ret < 0) 1381 goto out; 1382 1383 nlink += ret; 1384 1385 ret = 0; 1386 1387 if (nlink != inode->i_nlink) { 1388 set_nlink(inode, nlink); 1389 btrfs_update_inode(trans, root, inode); 1390 } 1391 BTRFS_I(inode)->index_cnt = (u64)-1; 1392 1393 if (inode->i_nlink == 0) { 1394 if (S_ISDIR(inode->i_mode)) { 1395 ret = replay_dir_deletes(trans, root, NULL, path, 1396 ino, 1); 1397 if (ret) 1398 goto out; 1399 } 1400 ret = insert_orphan_item(trans, root, ino); 1401 } 1402 1403 out: 1404 btrfs_free_path(path); 1405 return ret; 1406 } 1407 1408 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1409 struct btrfs_root *root, 1410 struct btrfs_path *path) 1411 { 1412 int ret; 1413 struct btrfs_key key; 1414 struct inode *inode; 1415 1416 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1417 key.type = BTRFS_ORPHAN_ITEM_KEY; 1418 key.offset = (u64)-1; 1419 while (1) { 1420 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1421 if (ret < 0) 1422 break; 1423 1424 if (ret == 1) { 1425 if (path->slots[0] == 0) 1426 break; 1427 path->slots[0]--; 1428 } 1429 1430 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1431 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1432 key.type != BTRFS_ORPHAN_ITEM_KEY) 1433 break; 1434 1435 ret = btrfs_del_item(trans, root, path); 1436 if (ret) 1437 goto out; 1438 1439 btrfs_release_path(path); 1440 inode = read_one_inode(root, key.offset); 1441 if (!inode) 1442 return -EIO; 1443 1444 ret = fixup_inode_link_count(trans, root, inode); 1445 iput(inode); 1446 if (ret) 1447 goto out; 1448 1449 /* 1450 * fixup on a directory may create new entries, 1451 * make sure we always look for the highset possible 1452 * offset 1453 */ 1454 key.offset = (u64)-1; 1455 } 1456 ret = 0; 1457 out: 1458 btrfs_release_path(path); 1459 return ret; 1460 } 1461 1462 1463 /* 1464 * record a given inode in the fixup dir so we can check its link 1465 * count when replay is done. The link count is incremented here 1466 * so the inode won't go away until we check it 1467 */ 1468 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1469 struct btrfs_root *root, 1470 struct btrfs_path *path, 1471 u64 objectid) 1472 { 1473 struct btrfs_key key; 1474 int ret = 0; 1475 struct inode *inode; 1476 1477 inode = read_one_inode(root, objectid); 1478 if (!inode) 1479 return -EIO; 1480 1481 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1482 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1483 key.offset = objectid; 1484 1485 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1486 1487 btrfs_release_path(path); 1488 if (ret == 0) { 1489 if (!inode->i_nlink) 1490 set_nlink(inode, 1); 1491 else 1492 inc_nlink(inode); 1493 ret = btrfs_update_inode(trans, root, inode); 1494 } else if (ret == -EEXIST) { 1495 ret = 0; 1496 } else { 1497 BUG(); /* Logic Error */ 1498 } 1499 iput(inode); 1500 1501 return ret; 1502 } 1503 1504 /* 1505 * when replaying the log for a directory, we only insert names 1506 * for inodes that actually exist. This means an fsync on a directory 1507 * does not implicitly fsync all the new files in it 1508 */ 1509 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1510 struct btrfs_root *root, 1511 struct btrfs_path *path, 1512 u64 dirid, u64 index, 1513 char *name, int name_len, u8 type, 1514 struct btrfs_key *location) 1515 { 1516 struct inode *inode; 1517 struct inode *dir; 1518 int ret; 1519 1520 inode = read_one_inode(root, location->objectid); 1521 if (!inode) 1522 return -ENOENT; 1523 1524 dir = read_one_inode(root, dirid); 1525 if (!dir) { 1526 iput(inode); 1527 return -EIO; 1528 } 1529 1530 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); 1531 1532 /* FIXME, put inode into FIXUP list */ 1533 1534 iput(inode); 1535 iput(dir); 1536 return ret; 1537 } 1538 1539 /* 1540 * take a single entry in a log directory item and replay it into 1541 * the subvolume. 1542 * 1543 * if a conflicting item exists in the subdirectory already, 1544 * the inode it points to is unlinked and put into the link count 1545 * fix up tree. 1546 * 1547 * If a name from the log points to a file or directory that does 1548 * not exist in the FS, it is skipped. fsyncs on directories 1549 * do not force down inodes inside that directory, just changes to the 1550 * names or unlinks in a directory. 1551 */ 1552 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1553 struct btrfs_root *root, 1554 struct btrfs_path *path, 1555 struct extent_buffer *eb, 1556 struct btrfs_dir_item *di, 1557 struct btrfs_key *key) 1558 { 1559 char *name; 1560 int name_len; 1561 struct btrfs_dir_item *dst_di; 1562 struct btrfs_key found_key; 1563 struct btrfs_key log_key; 1564 struct inode *dir; 1565 u8 log_type; 1566 int exists; 1567 int ret = 0; 1568 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); 1569 1570 dir = read_one_inode(root, key->objectid); 1571 if (!dir) 1572 return -EIO; 1573 1574 name_len = btrfs_dir_name_len(eb, di); 1575 name = kmalloc(name_len, GFP_NOFS); 1576 if (!name) { 1577 ret = -ENOMEM; 1578 goto out; 1579 } 1580 1581 log_type = btrfs_dir_type(eb, di); 1582 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1583 name_len); 1584 1585 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1586 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1587 if (exists == 0) 1588 exists = 1; 1589 else 1590 exists = 0; 1591 btrfs_release_path(path); 1592 1593 if (key->type == BTRFS_DIR_ITEM_KEY) { 1594 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1595 name, name_len, 1); 1596 } else if (key->type == BTRFS_DIR_INDEX_KEY) { 1597 dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1598 key->objectid, 1599 key->offset, name, 1600 name_len, 1); 1601 } else { 1602 /* Corruption */ 1603 ret = -EINVAL; 1604 goto out; 1605 } 1606 if (IS_ERR_OR_NULL(dst_di)) { 1607 /* we need a sequence number to insert, so we only 1608 * do inserts for the BTRFS_DIR_INDEX_KEY types 1609 */ 1610 if (key->type != BTRFS_DIR_INDEX_KEY) 1611 goto out; 1612 goto insert; 1613 } 1614 1615 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1616 /* the existing item matches the logged item */ 1617 if (found_key.objectid == log_key.objectid && 1618 found_key.type == log_key.type && 1619 found_key.offset == log_key.offset && 1620 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1621 goto out; 1622 } 1623 1624 /* 1625 * don't drop the conflicting directory entry if the inode 1626 * for the new entry doesn't exist 1627 */ 1628 if (!exists) 1629 goto out; 1630 1631 ret = drop_one_dir_item(trans, root, path, dir, dst_di); 1632 if (ret) 1633 goto out; 1634 1635 if (key->type == BTRFS_DIR_INDEX_KEY) 1636 goto insert; 1637 out: 1638 btrfs_release_path(path); 1639 if (!ret && update_size) { 1640 btrfs_i_size_write(dir, dir->i_size + name_len * 2); 1641 ret = btrfs_update_inode(trans, root, dir); 1642 } 1643 kfree(name); 1644 iput(dir); 1645 return ret; 1646 1647 insert: 1648 btrfs_release_path(path); 1649 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1650 name, name_len, log_type, &log_key); 1651 if (ret && ret != -ENOENT) 1652 goto out; 1653 update_size = false; 1654 ret = 0; 1655 goto out; 1656 } 1657 1658 /* 1659 * find all the names in a directory item and reconcile them into 1660 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 1661 * one name in a directory item, but the same code gets used for 1662 * both directory index types 1663 */ 1664 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1665 struct btrfs_root *root, 1666 struct btrfs_path *path, 1667 struct extent_buffer *eb, int slot, 1668 struct btrfs_key *key) 1669 { 1670 int ret; 1671 u32 item_size = btrfs_item_size_nr(eb, slot); 1672 struct btrfs_dir_item *di; 1673 int name_len; 1674 unsigned long ptr; 1675 unsigned long ptr_end; 1676 1677 ptr = btrfs_item_ptr_offset(eb, slot); 1678 ptr_end = ptr + item_size; 1679 while (ptr < ptr_end) { 1680 di = (struct btrfs_dir_item *)ptr; 1681 if (verify_dir_item(root, eb, di)) 1682 return -EIO; 1683 name_len = btrfs_dir_name_len(eb, di); 1684 ret = replay_one_name(trans, root, path, eb, di, key); 1685 if (ret) 1686 return ret; 1687 ptr = (unsigned long)(di + 1); 1688 ptr += name_len; 1689 } 1690 return 0; 1691 } 1692 1693 /* 1694 * directory replay has two parts. There are the standard directory 1695 * items in the log copied from the subvolume, and range items 1696 * created in the log while the subvolume was logged. 1697 * 1698 * The range items tell us which parts of the key space the log 1699 * is authoritative for. During replay, if a key in the subvolume 1700 * directory is in a logged range item, but not actually in the log 1701 * that means it was deleted from the directory before the fsync 1702 * and should be removed. 1703 */ 1704 static noinline int find_dir_range(struct btrfs_root *root, 1705 struct btrfs_path *path, 1706 u64 dirid, int key_type, 1707 u64 *start_ret, u64 *end_ret) 1708 { 1709 struct btrfs_key key; 1710 u64 found_end; 1711 struct btrfs_dir_log_item *item; 1712 int ret; 1713 int nritems; 1714 1715 if (*start_ret == (u64)-1) 1716 return 1; 1717 1718 key.objectid = dirid; 1719 key.type = key_type; 1720 key.offset = *start_ret; 1721 1722 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1723 if (ret < 0) 1724 goto out; 1725 if (ret > 0) { 1726 if (path->slots[0] == 0) 1727 goto out; 1728 path->slots[0]--; 1729 } 1730 if (ret != 0) 1731 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1732 1733 if (key.type != key_type || key.objectid != dirid) { 1734 ret = 1; 1735 goto next; 1736 } 1737 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1738 struct btrfs_dir_log_item); 1739 found_end = btrfs_dir_log_end(path->nodes[0], item); 1740 1741 if (*start_ret >= key.offset && *start_ret <= found_end) { 1742 ret = 0; 1743 *start_ret = key.offset; 1744 *end_ret = found_end; 1745 goto out; 1746 } 1747 ret = 1; 1748 next: 1749 /* check the next slot in the tree to see if it is a valid item */ 1750 nritems = btrfs_header_nritems(path->nodes[0]); 1751 if (path->slots[0] >= nritems) { 1752 ret = btrfs_next_leaf(root, path); 1753 if (ret) 1754 goto out; 1755 } else { 1756 path->slots[0]++; 1757 } 1758 1759 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1760 1761 if (key.type != key_type || key.objectid != dirid) { 1762 ret = 1; 1763 goto out; 1764 } 1765 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 1766 struct btrfs_dir_log_item); 1767 found_end = btrfs_dir_log_end(path->nodes[0], item); 1768 *start_ret = key.offset; 1769 *end_ret = found_end; 1770 ret = 0; 1771 out: 1772 btrfs_release_path(path); 1773 return ret; 1774 } 1775 1776 /* 1777 * this looks for a given directory item in the log. If the directory 1778 * item is not in the log, the item is removed and the inode it points 1779 * to is unlinked 1780 */ 1781 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 1782 struct btrfs_root *root, 1783 struct btrfs_root *log, 1784 struct btrfs_path *path, 1785 struct btrfs_path *log_path, 1786 struct inode *dir, 1787 struct btrfs_key *dir_key) 1788 { 1789 int ret; 1790 struct extent_buffer *eb; 1791 int slot; 1792 u32 item_size; 1793 struct btrfs_dir_item *di; 1794 struct btrfs_dir_item *log_di; 1795 int name_len; 1796 unsigned long ptr; 1797 unsigned long ptr_end; 1798 char *name; 1799 struct inode *inode; 1800 struct btrfs_key location; 1801 1802 again: 1803 eb = path->nodes[0]; 1804 slot = path->slots[0]; 1805 item_size = btrfs_item_size_nr(eb, slot); 1806 ptr = btrfs_item_ptr_offset(eb, slot); 1807 ptr_end = ptr + item_size; 1808 while (ptr < ptr_end) { 1809 di = (struct btrfs_dir_item *)ptr; 1810 if (verify_dir_item(root, eb, di)) { 1811 ret = -EIO; 1812 goto out; 1813 } 1814 1815 name_len = btrfs_dir_name_len(eb, di); 1816 name = kmalloc(name_len, GFP_NOFS); 1817 if (!name) { 1818 ret = -ENOMEM; 1819 goto out; 1820 } 1821 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1822 name_len); 1823 log_di = NULL; 1824 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 1825 log_di = btrfs_lookup_dir_item(trans, log, log_path, 1826 dir_key->objectid, 1827 name, name_len, 0); 1828 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 1829 log_di = btrfs_lookup_dir_index_item(trans, log, 1830 log_path, 1831 dir_key->objectid, 1832 dir_key->offset, 1833 name, name_len, 0); 1834 } 1835 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) { 1836 btrfs_dir_item_key_to_cpu(eb, di, &location); 1837 btrfs_release_path(path); 1838 btrfs_release_path(log_path); 1839 inode = read_one_inode(root, location.objectid); 1840 if (!inode) { 1841 kfree(name); 1842 return -EIO; 1843 } 1844 1845 ret = link_to_fixup_dir(trans, root, 1846 path, location.objectid); 1847 if (ret) { 1848 kfree(name); 1849 iput(inode); 1850 goto out; 1851 } 1852 1853 inc_nlink(inode); 1854 ret = btrfs_unlink_inode(trans, root, dir, inode, 1855 name, name_len); 1856 if (!ret) 1857 ret = btrfs_run_delayed_items(trans, root); 1858 kfree(name); 1859 iput(inode); 1860 if (ret) 1861 goto out; 1862 1863 /* there might still be more names under this key 1864 * check and repeat if required 1865 */ 1866 ret = btrfs_search_slot(NULL, root, dir_key, path, 1867 0, 0); 1868 if (ret == 0) 1869 goto again; 1870 ret = 0; 1871 goto out; 1872 } else if (IS_ERR(log_di)) { 1873 kfree(name); 1874 return PTR_ERR(log_di); 1875 } 1876 btrfs_release_path(log_path); 1877 kfree(name); 1878 1879 ptr = (unsigned long)(di + 1); 1880 ptr += name_len; 1881 } 1882 ret = 0; 1883 out: 1884 btrfs_release_path(path); 1885 btrfs_release_path(log_path); 1886 return ret; 1887 } 1888 1889 /* 1890 * deletion replay happens before we copy any new directory items 1891 * out of the log or out of backreferences from inodes. It 1892 * scans the log to find ranges of keys that log is authoritative for, 1893 * and then scans the directory to find items in those ranges that are 1894 * not present in the log. 1895 * 1896 * Anything we don't find in the log is unlinked and removed from the 1897 * directory. 1898 */ 1899 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 1900 struct btrfs_root *root, 1901 struct btrfs_root *log, 1902 struct btrfs_path *path, 1903 u64 dirid, int del_all) 1904 { 1905 u64 range_start; 1906 u64 range_end; 1907 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 1908 int ret = 0; 1909 struct btrfs_key dir_key; 1910 struct btrfs_key found_key; 1911 struct btrfs_path *log_path; 1912 struct inode *dir; 1913 1914 dir_key.objectid = dirid; 1915 dir_key.type = BTRFS_DIR_ITEM_KEY; 1916 log_path = btrfs_alloc_path(); 1917 if (!log_path) 1918 return -ENOMEM; 1919 1920 dir = read_one_inode(root, dirid); 1921 /* it isn't an error if the inode isn't there, that can happen 1922 * because we replay the deletes before we copy in the inode item 1923 * from the log 1924 */ 1925 if (!dir) { 1926 btrfs_free_path(log_path); 1927 return 0; 1928 } 1929 again: 1930 range_start = 0; 1931 range_end = 0; 1932 while (1) { 1933 if (del_all) 1934 range_end = (u64)-1; 1935 else { 1936 ret = find_dir_range(log, path, dirid, key_type, 1937 &range_start, &range_end); 1938 if (ret != 0) 1939 break; 1940 } 1941 1942 dir_key.offset = range_start; 1943 while (1) { 1944 int nritems; 1945 ret = btrfs_search_slot(NULL, root, &dir_key, path, 1946 0, 0); 1947 if (ret < 0) 1948 goto out; 1949 1950 nritems = btrfs_header_nritems(path->nodes[0]); 1951 if (path->slots[0] >= nritems) { 1952 ret = btrfs_next_leaf(root, path); 1953 if (ret) 1954 break; 1955 } 1956 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1957 path->slots[0]); 1958 if (found_key.objectid != dirid || 1959 found_key.type != dir_key.type) 1960 goto next_type; 1961 1962 if (found_key.offset > range_end) 1963 break; 1964 1965 ret = check_item_in_log(trans, root, log, path, 1966 log_path, dir, 1967 &found_key); 1968 if (ret) 1969 goto out; 1970 if (found_key.offset == (u64)-1) 1971 break; 1972 dir_key.offset = found_key.offset + 1; 1973 } 1974 btrfs_release_path(path); 1975 if (range_end == (u64)-1) 1976 break; 1977 range_start = range_end + 1; 1978 } 1979 1980 next_type: 1981 ret = 0; 1982 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1983 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1984 dir_key.type = BTRFS_DIR_INDEX_KEY; 1985 btrfs_release_path(path); 1986 goto again; 1987 } 1988 out: 1989 btrfs_release_path(path); 1990 btrfs_free_path(log_path); 1991 iput(dir); 1992 return ret; 1993 } 1994 1995 /* 1996 * the process_func used to replay items from the log tree. This 1997 * gets called in two different stages. The first stage just looks 1998 * for inodes and makes sure they are all copied into the subvolume. 1999 * 2000 * The second stage copies all the other item types from the log into 2001 * the subvolume. The two stage approach is slower, but gets rid of 2002 * lots of complexity around inodes referencing other inodes that exist 2003 * only in the log (references come from either directory items or inode 2004 * back refs). 2005 */ 2006 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2007 struct walk_control *wc, u64 gen) 2008 { 2009 int nritems; 2010 struct btrfs_path *path; 2011 struct btrfs_root *root = wc->replay_dest; 2012 struct btrfs_key key; 2013 int level; 2014 int i; 2015 int ret; 2016 2017 ret = btrfs_read_buffer(eb, gen); 2018 if (ret) 2019 return ret; 2020 2021 level = btrfs_header_level(eb); 2022 2023 if (level != 0) 2024 return 0; 2025 2026 path = btrfs_alloc_path(); 2027 if (!path) 2028 return -ENOMEM; 2029 2030 nritems = btrfs_header_nritems(eb); 2031 for (i = 0; i < nritems; i++) { 2032 btrfs_item_key_to_cpu(eb, &key, i); 2033 2034 /* inode keys are done during the first stage */ 2035 if (key.type == BTRFS_INODE_ITEM_KEY && 2036 wc->stage == LOG_WALK_REPLAY_INODES) { 2037 struct btrfs_inode_item *inode_item; 2038 u32 mode; 2039 2040 inode_item = btrfs_item_ptr(eb, i, 2041 struct btrfs_inode_item); 2042 mode = btrfs_inode_mode(eb, inode_item); 2043 if (S_ISDIR(mode)) { 2044 ret = replay_dir_deletes(wc->trans, 2045 root, log, path, key.objectid, 0); 2046 if (ret) 2047 break; 2048 } 2049 ret = overwrite_item(wc->trans, root, path, 2050 eb, i, &key); 2051 if (ret) 2052 break; 2053 2054 /* for regular files, make sure corresponding 2055 * orhpan item exist. extents past the new EOF 2056 * will be truncated later by orphan cleanup. 2057 */ 2058 if (S_ISREG(mode)) { 2059 ret = insert_orphan_item(wc->trans, root, 2060 key.objectid); 2061 if (ret) 2062 break; 2063 } 2064 2065 ret = link_to_fixup_dir(wc->trans, root, 2066 path, key.objectid); 2067 if (ret) 2068 break; 2069 } 2070 2071 if (key.type == BTRFS_DIR_INDEX_KEY && 2072 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2073 ret = replay_one_dir_item(wc->trans, root, path, 2074 eb, i, &key); 2075 if (ret) 2076 break; 2077 } 2078 2079 if (wc->stage < LOG_WALK_REPLAY_ALL) 2080 continue; 2081 2082 /* these keys are simply copied */ 2083 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2084 ret = overwrite_item(wc->trans, root, path, 2085 eb, i, &key); 2086 if (ret) 2087 break; 2088 } else if (key.type == BTRFS_INODE_REF_KEY || 2089 key.type == BTRFS_INODE_EXTREF_KEY) { 2090 ret = add_inode_ref(wc->trans, root, log, path, 2091 eb, i, &key); 2092 if (ret && ret != -ENOENT) 2093 break; 2094 ret = 0; 2095 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2096 ret = replay_one_extent(wc->trans, root, path, 2097 eb, i, &key); 2098 if (ret) 2099 break; 2100 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2101 ret = replay_one_dir_item(wc->trans, root, path, 2102 eb, i, &key); 2103 if (ret) 2104 break; 2105 } 2106 } 2107 btrfs_free_path(path); 2108 return ret; 2109 } 2110 2111 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2112 struct btrfs_root *root, 2113 struct btrfs_path *path, int *level, 2114 struct walk_control *wc) 2115 { 2116 u64 root_owner; 2117 u64 bytenr; 2118 u64 ptr_gen; 2119 struct extent_buffer *next; 2120 struct extent_buffer *cur; 2121 struct extent_buffer *parent; 2122 u32 blocksize; 2123 int ret = 0; 2124 2125 WARN_ON(*level < 0); 2126 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2127 2128 while (*level > 0) { 2129 WARN_ON(*level < 0); 2130 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2131 cur = path->nodes[*level]; 2132 2133 WARN_ON(btrfs_header_level(cur) != *level); 2134 2135 if (path->slots[*level] >= 2136 btrfs_header_nritems(cur)) 2137 break; 2138 2139 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2140 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2141 blocksize = btrfs_level_size(root, *level - 1); 2142 2143 parent = path->nodes[*level]; 2144 root_owner = btrfs_header_owner(parent); 2145 2146 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 2147 if (!next) 2148 return -ENOMEM; 2149 2150 if (*level == 1) { 2151 ret = wc->process_func(root, next, wc, ptr_gen); 2152 if (ret) { 2153 free_extent_buffer(next); 2154 return ret; 2155 } 2156 2157 path->slots[*level]++; 2158 if (wc->free) { 2159 ret = btrfs_read_buffer(next, ptr_gen); 2160 if (ret) { 2161 free_extent_buffer(next); 2162 return ret; 2163 } 2164 2165 if (trans) { 2166 btrfs_tree_lock(next); 2167 btrfs_set_lock_blocking(next); 2168 clean_tree_block(trans, root, next); 2169 btrfs_wait_tree_block_writeback(next); 2170 btrfs_tree_unlock(next); 2171 } 2172 2173 WARN_ON(root_owner != 2174 BTRFS_TREE_LOG_OBJECTID); 2175 ret = btrfs_free_and_pin_reserved_extent(root, 2176 bytenr, blocksize); 2177 if (ret) { 2178 free_extent_buffer(next); 2179 return ret; 2180 } 2181 } 2182 free_extent_buffer(next); 2183 continue; 2184 } 2185 ret = btrfs_read_buffer(next, ptr_gen); 2186 if (ret) { 2187 free_extent_buffer(next); 2188 return ret; 2189 } 2190 2191 WARN_ON(*level <= 0); 2192 if (path->nodes[*level-1]) 2193 free_extent_buffer(path->nodes[*level-1]); 2194 path->nodes[*level-1] = next; 2195 *level = btrfs_header_level(next); 2196 path->slots[*level] = 0; 2197 cond_resched(); 2198 } 2199 WARN_ON(*level < 0); 2200 WARN_ON(*level >= BTRFS_MAX_LEVEL); 2201 2202 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2203 2204 cond_resched(); 2205 return 0; 2206 } 2207 2208 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2209 struct btrfs_root *root, 2210 struct btrfs_path *path, int *level, 2211 struct walk_control *wc) 2212 { 2213 u64 root_owner; 2214 int i; 2215 int slot; 2216 int ret; 2217 2218 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2219 slot = path->slots[i]; 2220 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2221 path->slots[i]++; 2222 *level = i; 2223 WARN_ON(*level == 0); 2224 return 0; 2225 } else { 2226 struct extent_buffer *parent; 2227 if (path->nodes[*level] == root->node) 2228 parent = path->nodes[*level]; 2229 else 2230 parent = path->nodes[*level + 1]; 2231 2232 root_owner = btrfs_header_owner(parent); 2233 ret = wc->process_func(root, path->nodes[*level], wc, 2234 btrfs_header_generation(path->nodes[*level])); 2235 if (ret) 2236 return ret; 2237 2238 if (wc->free) { 2239 struct extent_buffer *next; 2240 2241 next = path->nodes[*level]; 2242 2243 if (trans) { 2244 btrfs_tree_lock(next); 2245 btrfs_set_lock_blocking(next); 2246 clean_tree_block(trans, root, next); 2247 btrfs_wait_tree_block_writeback(next); 2248 btrfs_tree_unlock(next); 2249 } 2250 2251 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 2252 ret = btrfs_free_and_pin_reserved_extent(root, 2253 path->nodes[*level]->start, 2254 path->nodes[*level]->len); 2255 if (ret) 2256 return ret; 2257 } 2258 free_extent_buffer(path->nodes[*level]); 2259 path->nodes[*level] = NULL; 2260 *level = i + 1; 2261 } 2262 } 2263 return 1; 2264 } 2265 2266 /* 2267 * drop the reference count on the tree rooted at 'snap'. This traverses 2268 * the tree freeing any blocks that have a ref count of zero after being 2269 * decremented. 2270 */ 2271 static int walk_log_tree(struct btrfs_trans_handle *trans, 2272 struct btrfs_root *log, struct walk_control *wc) 2273 { 2274 int ret = 0; 2275 int wret; 2276 int level; 2277 struct btrfs_path *path; 2278 int orig_level; 2279 2280 path = btrfs_alloc_path(); 2281 if (!path) 2282 return -ENOMEM; 2283 2284 level = btrfs_header_level(log->node); 2285 orig_level = level; 2286 path->nodes[level] = log->node; 2287 extent_buffer_get(log->node); 2288 path->slots[level] = 0; 2289 2290 while (1) { 2291 wret = walk_down_log_tree(trans, log, path, &level, wc); 2292 if (wret > 0) 2293 break; 2294 if (wret < 0) { 2295 ret = wret; 2296 goto out; 2297 } 2298 2299 wret = walk_up_log_tree(trans, log, path, &level, wc); 2300 if (wret > 0) 2301 break; 2302 if (wret < 0) { 2303 ret = wret; 2304 goto out; 2305 } 2306 } 2307 2308 /* was the root node processed? if not, catch it here */ 2309 if (path->nodes[orig_level]) { 2310 ret = wc->process_func(log, path->nodes[orig_level], wc, 2311 btrfs_header_generation(path->nodes[orig_level])); 2312 if (ret) 2313 goto out; 2314 if (wc->free) { 2315 struct extent_buffer *next; 2316 2317 next = path->nodes[orig_level]; 2318 2319 if (trans) { 2320 btrfs_tree_lock(next); 2321 btrfs_set_lock_blocking(next); 2322 clean_tree_block(trans, log, next); 2323 btrfs_wait_tree_block_writeback(next); 2324 btrfs_tree_unlock(next); 2325 } 2326 2327 WARN_ON(log->root_key.objectid != 2328 BTRFS_TREE_LOG_OBJECTID); 2329 ret = btrfs_free_and_pin_reserved_extent(log, next->start, 2330 next->len); 2331 if (ret) 2332 goto out; 2333 } 2334 } 2335 2336 out: 2337 btrfs_free_path(path); 2338 return ret; 2339 } 2340 2341 /* 2342 * helper function to update the item for a given subvolumes log root 2343 * in the tree of log roots 2344 */ 2345 static int update_log_root(struct btrfs_trans_handle *trans, 2346 struct btrfs_root *log) 2347 { 2348 int ret; 2349 2350 if (log->log_transid == 1) { 2351 /* insert root item on the first sync */ 2352 ret = btrfs_insert_root(trans, log->fs_info->log_root_tree, 2353 &log->root_key, &log->root_item); 2354 } else { 2355 ret = btrfs_update_root(trans, log->fs_info->log_root_tree, 2356 &log->root_key, &log->root_item); 2357 } 2358 return ret; 2359 } 2360 2361 static int wait_log_commit(struct btrfs_trans_handle *trans, 2362 struct btrfs_root *root, unsigned long transid) 2363 { 2364 DEFINE_WAIT(wait); 2365 int index = transid % 2; 2366 2367 /* 2368 * we only allow two pending log transactions at a time, 2369 * so we know that if ours is more than 2 older than the 2370 * current transaction, we're done 2371 */ 2372 do { 2373 prepare_to_wait(&root->log_commit_wait[index], 2374 &wait, TASK_UNINTERRUPTIBLE); 2375 mutex_unlock(&root->log_mutex); 2376 2377 if (root->fs_info->last_trans_log_full_commit != 2378 trans->transid && root->log_transid < transid + 2 && 2379 atomic_read(&root->log_commit[index])) 2380 schedule(); 2381 2382 finish_wait(&root->log_commit_wait[index], &wait); 2383 mutex_lock(&root->log_mutex); 2384 } while (root->fs_info->last_trans_log_full_commit != 2385 trans->transid && root->log_transid < transid + 2 && 2386 atomic_read(&root->log_commit[index])); 2387 return 0; 2388 } 2389 2390 static void wait_for_writer(struct btrfs_trans_handle *trans, 2391 struct btrfs_root *root) 2392 { 2393 DEFINE_WAIT(wait); 2394 while (root->fs_info->last_trans_log_full_commit != 2395 trans->transid && atomic_read(&root->log_writers)) { 2396 prepare_to_wait(&root->log_writer_wait, 2397 &wait, TASK_UNINTERRUPTIBLE); 2398 mutex_unlock(&root->log_mutex); 2399 if (root->fs_info->last_trans_log_full_commit != 2400 trans->transid && atomic_read(&root->log_writers)) 2401 schedule(); 2402 mutex_lock(&root->log_mutex); 2403 finish_wait(&root->log_writer_wait, &wait); 2404 } 2405 } 2406 2407 /* 2408 * btrfs_sync_log does sends a given tree log down to the disk and 2409 * updates the super blocks to record it. When this call is done, 2410 * you know that any inodes previously logged are safely on disk only 2411 * if it returns 0. 2412 * 2413 * Any other return value means you need to call btrfs_commit_transaction. 2414 * Some of the edge cases for fsyncing directories that have had unlinks 2415 * or renames done in the past mean that sometimes the only safe 2416 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2417 * that has happened. 2418 */ 2419 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2420 struct btrfs_root *root) 2421 { 2422 int index1; 2423 int index2; 2424 int mark; 2425 int ret; 2426 struct btrfs_root *log = root->log_root; 2427 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2428 unsigned long log_transid = 0; 2429 struct blk_plug plug; 2430 2431 mutex_lock(&root->log_mutex); 2432 log_transid = root->log_transid; 2433 index1 = root->log_transid % 2; 2434 if (atomic_read(&root->log_commit[index1])) { 2435 wait_log_commit(trans, root, root->log_transid); 2436 mutex_unlock(&root->log_mutex); 2437 return 0; 2438 } 2439 atomic_set(&root->log_commit[index1], 1); 2440 2441 /* wait for previous tree log sync to complete */ 2442 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2443 wait_log_commit(trans, root, root->log_transid - 1); 2444 while (1) { 2445 int batch = atomic_read(&root->log_batch); 2446 /* when we're on an ssd, just kick the log commit out */ 2447 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { 2448 mutex_unlock(&root->log_mutex); 2449 schedule_timeout_uninterruptible(1); 2450 mutex_lock(&root->log_mutex); 2451 } 2452 wait_for_writer(trans, root); 2453 if (batch == atomic_read(&root->log_batch)) 2454 break; 2455 } 2456 2457 /* bail out if we need to do a full commit */ 2458 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2459 ret = -EAGAIN; 2460 btrfs_free_logged_extents(log, log_transid); 2461 mutex_unlock(&root->log_mutex); 2462 goto out; 2463 } 2464 2465 if (log_transid % 2 == 0) 2466 mark = EXTENT_DIRTY; 2467 else 2468 mark = EXTENT_NEW; 2469 2470 /* we start IO on all the marked extents here, but we don't actually 2471 * wait for them until later. 2472 */ 2473 blk_start_plug(&plug); 2474 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2475 if (ret) { 2476 blk_finish_plug(&plug); 2477 btrfs_abort_transaction(trans, root, ret); 2478 btrfs_free_logged_extents(log, log_transid); 2479 mutex_unlock(&root->log_mutex); 2480 goto out; 2481 } 2482 2483 btrfs_set_root_node(&log->root_item, log->node); 2484 2485 root->log_transid++; 2486 log->log_transid = root->log_transid; 2487 root->log_start_pid = 0; 2488 smp_mb(); 2489 /* 2490 * IO has been started, blocks of the log tree have WRITTEN flag set 2491 * in their headers. new modifications of the log will be written to 2492 * new positions. so it's safe to allow log writers to go in. 2493 */ 2494 mutex_unlock(&root->log_mutex); 2495 2496 mutex_lock(&log_root_tree->log_mutex); 2497 atomic_inc(&log_root_tree->log_batch); 2498 atomic_inc(&log_root_tree->log_writers); 2499 mutex_unlock(&log_root_tree->log_mutex); 2500 2501 ret = update_log_root(trans, log); 2502 2503 mutex_lock(&log_root_tree->log_mutex); 2504 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2505 smp_mb(); 2506 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2507 wake_up(&log_root_tree->log_writer_wait); 2508 } 2509 2510 if (ret) { 2511 blk_finish_plug(&plug); 2512 if (ret != -ENOSPC) { 2513 btrfs_abort_transaction(trans, root, ret); 2514 mutex_unlock(&log_root_tree->log_mutex); 2515 goto out; 2516 } 2517 root->fs_info->last_trans_log_full_commit = trans->transid; 2518 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2519 btrfs_free_logged_extents(log, log_transid); 2520 mutex_unlock(&log_root_tree->log_mutex); 2521 ret = -EAGAIN; 2522 goto out; 2523 } 2524 2525 index2 = log_root_tree->log_transid % 2; 2526 if (atomic_read(&log_root_tree->log_commit[index2])) { 2527 blk_finish_plug(&plug); 2528 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2529 wait_log_commit(trans, log_root_tree, 2530 log_root_tree->log_transid); 2531 btrfs_free_logged_extents(log, log_transid); 2532 mutex_unlock(&log_root_tree->log_mutex); 2533 ret = 0; 2534 goto out; 2535 } 2536 atomic_set(&log_root_tree->log_commit[index2], 1); 2537 2538 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 2539 wait_log_commit(trans, log_root_tree, 2540 log_root_tree->log_transid - 1); 2541 } 2542 2543 wait_for_writer(trans, log_root_tree); 2544 2545 /* 2546 * now that we've moved on to the tree of log tree roots, 2547 * check the full commit flag again 2548 */ 2549 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2550 blk_finish_plug(&plug); 2551 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2552 btrfs_free_logged_extents(log, log_transid); 2553 mutex_unlock(&log_root_tree->log_mutex); 2554 ret = -EAGAIN; 2555 goto out_wake_log_root; 2556 } 2557 2558 ret = btrfs_write_marked_extents(log_root_tree, 2559 &log_root_tree->dirty_log_pages, 2560 EXTENT_DIRTY | EXTENT_NEW); 2561 blk_finish_plug(&plug); 2562 if (ret) { 2563 btrfs_abort_transaction(trans, root, ret); 2564 btrfs_free_logged_extents(log, log_transid); 2565 mutex_unlock(&log_root_tree->log_mutex); 2566 goto out_wake_log_root; 2567 } 2568 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2569 btrfs_wait_marked_extents(log_root_tree, 2570 &log_root_tree->dirty_log_pages, 2571 EXTENT_NEW | EXTENT_DIRTY); 2572 btrfs_wait_logged_extents(log, log_transid); 2573 2574 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2575 log_root_tree->node->start); 2576 btrfs_set_super_log_root_level(root->fs_info->super_for_commit, 2577 btrfs_header_level(log_root_tree->node)); 2578 2579 log_root_tree->log_transid++; 2580 smp_mb(); 2581 2582 mutex_unlock(&log_root_tree->log_mutex); 2583 2584 /* 2585 * nobody else is going to jump in and write the the ctree 2586 * super here because the log_commit atomic below is protecting 2587 * us. We must be called with a transaction handle pinning 2588 * the running transaction open, so a full commit can't hop 2589 * in and cause problems either. 2590 */ 2591 ret = write_ctree_super(trans, root->fs_info->tree_root, 1); 2592 if (ret) { 2593 btrfs_abort_transaction(trans, root, ret); 2594 goto out_wake_log_root; 2595 } 2596 2597 mutex_lock(&root->log_mutex); 2598 if (root->last_log_commit < log_transid) 2599 root->last_log_commit = log_transid; 2600 mutex_unlock(&root->log_mutex); 2601 2602 out_wake_log_root: 2603 atomic_set(&log_root_tree->log_commit[index2], 0); 2604 smp_mb(); 2605 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2606 wake_up(&log_root_tree->log_commit_wait[index2]); 2607 out: 2608 atomic_set(&root->log_commit[index1], 0); 2609 smp_mb(); 2610 if (waitqueue_active(&root->log_commit_wait[index1])) 2611 wake_up(&root->log_commit_wait[index1]); 2612 return ret; 2613 } 2614 2615 static void free_log_tree(struct btrfs_trans_handle *trans, 2616 struct btrfs_root *log) 2617 { 2618 int ret; 2619 u64 start; 2620 u64 end; 2621 struct walk_control wc = { 2622 .free = 1, 2623 .process_func = process_one_buffer 2624 }; 2625 2626 ret = walk_log_tree(trans, log, &wc); 2627 /* I don't think this can happen but just in case */ 2628 if (ret) 2629 btrfs_abort_transaction(trans, log, ret); 2630 2631 while (1) { 2632 ret = find_first_extent_bit(&log->dirty_log_pages, 2633 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, 2634 NULL); 2635 if (ret) 2636 break; 2637 2638 clear_extent_bits(&log->dirty_log_pages, start, end, 2639 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2640 } 2641 2642 /* 2643 * We may have short-circuited the log tree with the full commit logic 2644 * and left ordered extents on our list, so clear these out to keep us 2645 * from leaking inodes and memory. 2646 */ 2647 btrfs_free_logged_extents(log, 0); 2648 btrfs_free_logged_extents(log, 1); 2649 2650 free_extent_buffer(log->node); 2651 kfree(log); 2652 } 2653 2654 /* 2655 * free all the extents used by the tree log. This should be called 2656 * at commit time of the full transaction 2657 */ 2658 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 2659 { 2660 if (root->log_root) { 2661 free_log_tree(trans, root->log_root); 2662 root->log_root = NULL; 2663 } 2664 return 0; 2665 } 2666 2667 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 2668 struct btrfs_fs_info *fs_info) 2669 { 2670 if (fs_info->log_root_tree) { 2671 free_log_tree(trans, fs_info->log_root_tree); 2672 fs_info->log_root_tree = NULL; 2673 } 2674 return 0; 2675 } 2676 2677 /* 2678 * If both a file and directory are logged, and unlinks or renames are 2679 * mixed in, we have a few interesting corners: 2680 * 2681 * create file X in dir Y 2682 * link file X to X.link in dir Y 2683 * fsync file X 2684 * unlink file X but leave X.link 2685 * fsync dir Y 2686 * 2687 * After a crash we would expect only X.link to exist. But file X 2688 * didn't get fsync'd again so the log has back refs for X and X.link. 2689 * 2690 * We solve this by removing directory entries and inode backrefs from the 2691 * log when a file that was logged in the current transaction is 2692 * unlinked. Any later fsync will include the updated log entries, and 2693 * we'll be able to reconstruct the proper directory items from backrefs. 2694 * 2695 * This optimizations allows us to avoid relogging the entire inode 2696 * or the entire directory. 2697 */ 2698 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 2699 struct btrfs_root *root, 2700 const char *name, int name_len, 2701 struct inode *dir, u64 index) 2702 { 2703 struct btrfs_root *log; 2704 struct btrfs_dir_item *di; 2705 struct btrfs_path *path; 2706 int ret; 2707 int err = 0; 2708 int bytes_del = 0; 2709 u64 dir_ino = btrfs_ino(dir); 2710 2711 if (BTRFS_I(dir)->logged_trans < trans->transid) 2712 return 0; 2713 2714 ret = join_running_log_trans(root); 2715 if (ret) 2716 return 0; 2717 2718 mutex_lock(&BTRFS_I(dir)->log_mutex); 2719 2720 log = root->log_root; 2721 path = btrfs_alloc_path(); 2722 if (!path) { 2723 err = -ENOMEM; 2724 goto out_unlock; 2725 } 2726 2727 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 2728 name, name_len, -1); 2729 if (IS_ERR(di)) { 2730 err = PTR_ERR(di); 2731 goto fail; 2732 } 2733 if (di) { 2734 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2735 bytes_del += name_len; 2736 if (ret) { 2737 err = ret; 2738 goto fail; 2739 } 2740 } 2741 btrfs_release_path(path); 2742 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 2743 index, name, name_len, -1); 2744 if (IS_ERR(di)) { 2745 err = PTR_ERR(di); 2746 goto fail; 2747 } 2748 if (di) { 2749 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2750 bytes_del += name_len; 2751 if (ret) { 2752 err = ret; 2753 goto fail; 2754 } 2755 } 2756 2757 /* update the directory size in the log to reflect the names 2758 * we have removed 2759 */ 2760 if (bytes_del) { 2761 struct btrfs_key key; 2762 2763 key.objectid = dir_ino; 2764 key.offset = 0; 2765 key.type = BTRFS_INODE_ITEM_KEY; 2766 btrfs_release_path(path); 2767 2768 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2769 if (ret < 0) { 2770 err = ret; 2771 goto fail; 2772 } 2773 if (ret == 0) { 2774 struct btrfs_inode_item *item; 2775 u64 i_size; 2776 2777 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2778 struct btrfs_inode_item); 2779 i_size = btrfs_inode_size(path->nodes[0], item); 2780 if (i_size > bytes_del) 2781 i_size -= bytes_del; 2782 else 2783 i_size = 0; 2784 btrfs_set_inode_size(path->nodes[0], item, i_size); 2785 btrfs_mark_buffer_dirty(path->nodes[0]); 2786 } else 2787 ret = 0; 2788 btrfs_release_path(path); 2789 } 2790 fail: 2791 btrfs_free_path(path); 2792 out_unlock: 2793 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2794 if (ret == -ENOSPC) { 2795 root->fs_info->last_trans_log_full_commit = trans->transid; 2796 ret = 0; 2797 } else if (ret < 0) 2798 btrfs_abort_transaction(trans, root, ret); 2799 2800 btrfs_end_log_trans(root); 2801 2802 return err; 2803 } 2804 2805 /* see comments for btrfs_del_dir_entries_in_log */ 2806 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 2807 struct btrfs_root *root, 2808 const char *name, int name_len, 2809 struct inode *inode, u64 dirid) 2810 { 2811 struct btrfs_root *log; 2812 u64 index; 2813 int ret; 2814 2815 if (BTRFS_I(inode)->logged_trans < trans->transid) 2816 return 0; 2817 2818 ret = join_running_log_trans(root); 2819 if (ret) 2820 return 0; 2821 log = root->log_root; 2822 mutex_lock(&BTRFS_I(inode)->log_mutex); 2823 2824 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 2825 dirid, &index); 2826 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2827 if (ret == -ENOSPC) { 2828 root->fs_info->last_trans_log_full_commit = trans->transid; 2829 ret = 0; 2830 } else if (ret < 0 && ret != -ENOENT) 2831 btrfs_abort_transaction(trans, root, ret); 2832 btrfs_end_log_trans(root); 2833 2834 return ret; 2835 } 2836 2837 /* 2838 * creates a range item in the log for 'dirid'. first_offset and 2839 * last_offset tell us which parts of the key space the log should 2840 * be considered authoritative for. 2841 */ 2842 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 2843 struct btrfs_root *log, 2844 struct btrfs_path *path, 2845 int key_type, u64 dirid, 2846 u64 first_offset, u64 last_offset) 2847 { 2848 int ret; 2849 struct btrfs_key key; 2850 struct btrfs_dir_log_item *item; 2851 2852 key.objectid = dirid; 2853 key.offset = first_offset; 2854 if (key_type == BTRFS_DIR_ITEM_KEY) 2855 key.type = BTRFS_DIR_LOG_ITEM_KEY; 2856 else 2857 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2858 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2859 if (ret) 2860 return ret; 2861 2862 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2863 struct btrfs_dir_log_item); 2864 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2865 btrfs_mark_buffer_dirty(path->nodes[0]); 2866 btrfs_release_path(path); 2867 return 0; 2868 } 2869 2870 /* 2871 * log all the items included in the current transaction for a given 2872 * directory. This also creates the range items in the log tree required 2873 * to replay anything deleted before the fsync 2874 */ 2875 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 2876 struct btrfs_root *root, struct inode *inode, 2877 struct btrfs_path *path, 2878 struct btrfs_path *dst_path, int key_type, 2879 u64 min_offset, u64 *last_offset_ret) 2880 { 2881 struct btrfs_key min_key; 2882 struct btrfs_root *log = root->log_root; 2883 struct extent_buffer *src; 2884 int err = 0; 2885 int ret; 2886 int i; 2887 int nritems; 2888 u64 first_offset = min_offset; 2889 u64 last_offset = (u64)-1; 2890 u64 ino = btrfs_ino(inode); 2891 2892 log = root->log_root; 2893 2894 min_key.objectid = ino; 2895 min_key.type = key_type; 2896 min_key.offset = min_offset; 2897 2898 path->keep_locks = 1; 2899 2900 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 2901 2902 /* 2903 * we didn't find anything from this transaction, see if there 2904 * is anything at all 2905 */ 2906 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 2907 min_key.objectid = ino; 2908 min_key.type = key_type; 2909 min_key.offset = (u64)-1; 2910 btrfs_release_path(path); 2911 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2912 if (ret < 0) { 2913 btrfs_release_path(path); 2914 return ret; 2915 } 2916 ret = btrfs_previous_item(root, path, ino, key_type); 2917 2918 /* if ret == 0 there are items for this type, 2919 * create a range to tell us the last key of this type. 2920 * otherwise, there are no items in this directory after 2921 * *min_offset, and we create a range to indicate that. 2922 */ 2923 if (ret == 0) { 2924 struct btrfs_key tmp; 2925 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 2926 path->slots[0]); 2927 if (key_type == tmp.type) 2928 first_offset = max(min_offset, tmp.offset) + 1; 2929 } 2930 goto done; 2931 } 2932 2933 /* go backward to find any previous key */ 2934 ret = btrfs_previous_item(root, path, ino, key_type); 2935 if (ret == 0) { 2936 struct btrfs_key tmp; 2937 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2938 if (key_type == tmp.type) { 2939 first_offset = tmp.offset; 2940 ret = overwrite_item(trans, log, dst_path, 2941 path->nodes[0], path->slots[0], 2942 &tmp); 2943 if (ret) { 2944 err = ret; 2945 goto done; 2946 } 2947 } 2948 } 2949 btrfs_release_path(path); 2950 2951 /* find the first key from this transaction again */ 2952 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2953 if (WARN_ON(ret != 0)) 2954 goto done; 2955 2956 /* 2957 * we have a block from this transaction, log every item in it 2958 * from our directory 2959 */ 2960 while (1) { 2961 struct btrfs_key tmp; 2962 src = path->nodes[0]; 2963 nritems = btrfs_header_nritems(src); 2964 for (i = path->slots[0]; i < nritems; i++) { 2965 btrfs_item_key_to_cpu(src, &min_key, i); 2966 2967 if (min_key.objectid != ino || min_key.type != key_type) 2968 goto done; 2969 ret = overwrite_item(trans, log, dst_path, src, i, 2970 &min_key); 2971 if (ret) { 2972 err = ret; 2973 goto done; 2974 } 2975 } 2976 path->slots[0] = nritems; 2977 2978 /* 2979 * look ahead to the next item and see if it is also 2980 * from this directory and from this transaction 2981 */ 2982 ret = btrfs_next_leaf(root, path); 2983 if (ret == 1) { 2984 last_offset = (u64)-1; 2985 goto done; 2986 } 2987 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2988 if (tmp.objectid != ino || tmp.type != key_type) { 2989 last_offset = (u64)-1; 2990 goto done; 2991 } 2992 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 2993 ret = overwrite_item(trans, log, dst_path, 2994 path->nodes[0], path->slots[0], 2995 &tmp); 2996 if (ret) 2997 err = ret; 2998 else 2999 last_offset = tmp.offset; 3000 goto done; 3001 } 3002 } 3003 done: 3004 btrfs_release_path(path); 3005 btrfs_release_path(dst_path); 3006 3007 if (err == 0) { 3008 *last_offset_ret = last_offset; 3009 /* 3010 * insert the log range keys to indicate where the log 3011 * is valid 3012 */ 3013 ret = insert_dir_log_key(trans, log, path, key_type, 3014 ino, first_offset, last_offset); 3015 if (ret) 3016 err = ret; 3017 } 3018 return err; 3019 } 3020 3021 /* 3022 * logging directories is very similar to logging inodes, We find all the items 3023 * from the current transaction and write them to the log. 3024 * 3025 * The recovery code scans the directory in the subvolume, and if it finds a 3026 * key in the range logged that is not present in the log tree, then it means 3027 * that dir entry was unlinked during the transaction. 3028 * 3029 * In order for that scan to work, we must include one key smaller than 3030 * the smallest logged by this transaction and one key larger than the largest 3031 * key logged by this transaction. 3032 */ 3033 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3034 struct btrfs_root *root, struct inode *inode, 3035 struct btrfs_path *path, 3036 struct btrfs_path *dst_path) 3037 { 3038 u64 min_key; 3039 u64 max_key; 3040 int ret; 3041 int key_type = BTRFS_DIR_ITEM_KEY; 3042 3043 again: 3044 min_key = 0; 3045 max_key = 0; 3046 while (1) { 3047 ret = log_dir_items(trans, root, inode, path, 3048 dst_path, key_type, min_key, 3049 &max_key); 3050 if (ret) 3051 return ret; 3052 if (max_key == (u64)-1) 3053 break; 3054 min_key = max_key + 1; 3055 } 3056 3057 if (key_type == BTRFS_DIR_ITEM_KEY) { 3058 key_type = BTRFS_DIR_INDEX_KEY; 3059 goto again; 3060 } 3061 return 0; 3062 } 3063 3064 /* 3065 * a helper function to drop items from the log before we relog an 3066 * inode. max_key_type indicates the highest item type to remove. 3067 * This cannot be run for file data extents because it does not 3068 * free the extents they point to. 3069 */ 3070 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3071 struct btrfs_root *log, 3072 struct btrfs_path *path, 3073 u64 objectid, int max_key_type) 3074 { 3075 int ret; 3076 struct btrfs_key key; 3077 struct btrfs_key found_key; 3078 int start_slot; 3079 3080 key.objectid = objectid; 3081 key.type = max_key_type; 3082 key.offset = (u64)-1; 3083 3084 while (1) { 3085 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3086 BUG_ON(ret == 0); /* Logic error */ 3087 if (ret < 0) 3088 break; 3089 3090 if (path->slots[0] == 0) 3091 break; 3092 3093 path->slots[0]--; 3094 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3095 path->slots[0]); 3096 3097 if (found_key.objectid != objectid) 3098 break; 3099 3100 found_key.offset = 0; 3101 found_key.type = 0; 3102 ret = btrfs_bin_search(path->nodes[0], &found_key, 0, 3103 &start_slot); 3104 3105 ret = btrfs_del_items(trans, log, path, start_slot, 3106 path->slots[0] - start_slot + 1); 3107 /* 3108 * If start slot isn't 0 then we don't need to re-search, we've 3109 * found the last guy with the objectid in this tree. 3110 */ 3111 if (ret || start_slot != 0) 3112 break; 3113 btrfs_release_path(path); 3114 } 3115 btrfs_release_path(path); 3116 if (ret > 0) 3117 ret = 0; 3118 return ret; 3119 } 3120 3121 static void fill_inode_item(struct btrfs_trans_handle *trans, 3122 struct extent_buffer *leaf, 3123 struct btrfs_inode_item *item, 3124 struct inode *inode, int log_inode_only) 3125 { 3126 struct btrfs_map_token token; 3127 3128 btrfs_init_map_token(&token); 3129 3130 if (log_inode_only) { 3131 /* set the generation to zero so the recover code 3132 * can tell the difference between an logging 3133 * just to say 'this inode exists' and a logging 3134 * to say 'update this inode with these values' 3135 */ 3136 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3137 btrfs_set_token_inode_size(leaf, item, 0, &token); 3138 } else { 3139 btrfs_set_token_inode_generation(leaf, item, 3140 BTRFS_I(inode)->generation, 3141 &token); 3142 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); 3143 } 3144 3145 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); 3146 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); 3147 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3148 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3149 3150 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3151 inode->i_atime.tv_sec, &token); 3152 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3153 inode->i_atime.tv_nsec, &token); 3154 3155 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3156 inode->i_mtime.tv_sec, &token); 3157 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3158 inode->i_mtime.tv_nsec, &token); 3159 3160 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3161 inode->i_ctime.tv_sec, &token); 3162 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3163 inode->i_ctime.tv_nsec, &token); 3164 3165 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3166 &token); 3167 3168 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); 3169 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); 3170 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); 3171 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); 3172 btrfs_set_token_inode_block_group(leaf, item, 0, &token); 3173 } 3174 3175 static int log_inode_item(struct btrfs_trans_handle *trans, 3176 struct btrfs_root *log, struct btrfs_path *path, 3177 struct inode *inode) 3178 { 3179 struct btrfs_inode_item *inode_item; 3180 int ret; 3181 3182 ret = btrfs_insert_empty_item(trans, log, path, 3183 &BTRFS_I(inode)->location, 3184 sizeof(*inode_item)); 3185 if (ret && ret != -EEXIST) 3186 return ret; 3187 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3188 struct btrfs_inode_item); 3189 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3190 btrfs_release_path(path); 3191 return 0; 3192 } 3193 3194 static noinline int copy_items(struct btrfs_trans_handle *trans, 3195 struct inode *inode, 3196 struct btrfs_path *dst_path, 3197 struct extent_buffer *src, 3198 int start_slot, int nr, int inode_only) 3199 { 3200 unsigned long src_offset; 3201 unsigned long dst_offset; 3202 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3203 struct btrfs_file_extent_item *extent; 3204 struct btrfs_inode_item *inode_item; 3205 int ret; 3206 struct btrfs_key *ins_keys; 3207 u32 *ins_sizes; 3208 char *ins_data; 3209 int i; 3210 struct list_head ordered_sums; 3211 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3212 3213 INIT_LIST_HEAD(&ordered_sums); 3214 3215 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 3216 nr * sizeof(u32), GFP_NOFS); 3217 if (!ins_data) 3218 return -ENOMEM; 3219 3220 ins_sizes = (u32 *)ins_data; 3221 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3222 3223 for (i = 0; i < nr; i++) { 3224 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 3225 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 3226 } 3227 ret = btrfs_insert_empty_items(trans, log, dst_path, 3228 ins_keys, ins_sizes, nr); 3229 if (ret) { 3230 kfree(ins_data); 3231 return ret; 3232 } 3233 3234 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 3235 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 3236 dst_path->slots[0]); 3237 3238 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3239 3240 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3241 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3242 dst_path->slots[0], 3243 struct btrfs_inode_item); 3244 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3245 inode, inode_only == LOG_INODE_EXISTS); 3246 } else { 3247 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3248 src_offset, ins_sizes[i]); 3249 } 3250 3251 /* take a reference on file data extents so that truncates 3252 * or deletes of this inode don't have to relog the inode 3253 * again 3254 */ 3255 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && 3256 !skip_csum) { 3257 int found_type; 3258 extent = btrfs_item_ptr(src, start_slot + i, 3259 struct btrfs_file_extent_item); 3260 3261 if (btrfs_file_extent_generation(src, extent) < trans->transid) 3262 continue; 3263 3264 found_type = btrfs_file_extent_type(src, extent); 3265 if (found_type == BTRFS_FILE_EXTENT_REG) { 3266 u64 ds, dl, cs, cl; 3267 ds = btrfs_file_extent_disk_bytenr(src, 3268 extent); 3269 /* ds == 0 is a hole */ 3270 if (ds == 0) 3271 continue; 3272 3273 dl = btrfs_file_extent_disk_num_bytes(src, 3274 extent); 3275 cs = btrfs_file_extent_offset(src, extent); 3276 cl = btrfs_file_extent_num_bytes(src, 3277 extent); 3278 if (btrfs_file_extent_compression(src, 3279 extent)) { 3280 cs = 0; 3281 cl = dl; 3282 } 3283 3284 ret = btrfs_lookup_csums_range( 3285 log->fs_info->csum_root, 3286 ds + cs, ds + cs + cl - 1, 3287 &ordered_sums, 0); 3288 if (ret) { 3289 btrfs_release_path(dst_path); 3290 kfree(ins_data); 3291 return ret; 3292 } 3293 } 3294 } 3295 } 3296 3297 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 3298 btrfs_release_path(dst_path); 3299 kfree(ins_data); 3300 3301 /* 3302 * we have to do this after the loop above to avoid changing the 3303 * log tree while trying to change the log tree. 3304 */ 3305 ret = 0; 3306 while (!list_empty(&ordered_sums)) { 3307 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3308 struct btrfs_ordered_sum, 3309 list); 3310 if (!ret) 3311 ret = btrfs_csum_file_blocks(trans, log, sums); 3312 list_del(&sums->list); 3313 kfree(sums); 3314 } 3315 return ret; 3316 } 3317 3318 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) 3319 { 3320 struct extent_map *em1, *em2; 3321 3322 em1 = list_entry(a, struct extent_map, list); 3323 em2 = list_entry(b, struct extent_map, list); 3324 3325 if (em1->start < em2->start) 3326 return -1; 3327 else if (em1->start > em2->start) 3328 return 1; 3329 return 0; 3330 } 3331 3332 static int log_one_extent(struct btrfs_trans_handle *trans, 3333 struct inode *inode, struct btrfs_root *root, 3334 struct extent_map *em, struct btrfs_path *path) 3335 { 3336 struct btrfs_root *log = root->log_root; 3337 struct btrfs_file_extent_item *fi; 3338 struct extent_buffer *leaf; 3339 struct btrfs_ordered_extent *ordered; 3340 struct list_head ordered_sums; 3341 struct btrfs_map_token token; 3342 struct btrfs_key key; 3343 u64 mod_start = em->mod_start; 3344 u64 mod_len = em->mod_len; 3345 u64 csum_offset; 3346 u64 csum_len; 3347 u64 extent_offset = em->start - em->orig_start; 3348 u64 block_len; 3349 int ret; 3350 int index = log->log_transid % 2; 3351 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3352 3353 ret = __btrfs_drop_extents(trans, log, inode, path, em->start, 3354 em->start + em->len, NULL, 0); 3355 if (ret) 3356 return ret; 3357 3358 INIT_LIST_HEAD(&ordered_sums); 3359 btrfs_init_map_token(&token); 3360 key.objectid = btrfs_ino(inode); 3361 key.type = BTRFS_EXTENT_DATA_KEY; 3362 key.offset = em->start; 3363 3364 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3365 if (ret) 3366 return ret; 3367 leaf = path->nodes[0]; 3368 fi = btrfs_item_ptr(leaf, path->slots[0], 3369 struct btrfs_file_extent_item); 3370 3371 btrfs_set_token_file_extent_generation(leaf, fi, em->generation, 3372 &token); 3373 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3374 skip_csum = true; 3375 btrfs_set_token_file_extent_type(leaf, fi, 3376 BTRFS_FILE_EXTENT_PREALLOC, 3377 &token); 3378 } else { 3379 btrfs_set_token_file_extent_type(leaf, fi, 3380 BTRFS_FILE_EXTENT_REG, 3381 &token); 3382 if (em->block_start == EXTENT_MAP_HOLE) 3383 skip_csum = true; 3384 } 3385 3386 block_len = max(em->block_len, em->orig_block_len); 3387 if (em->compress_type != BTRFS_COMPRESS_NONE) { 3388 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3389 em->block_start, 3390 &token); 3391 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3392 &token); 3393 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 3394 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 3395 em->block_start - 3396 extent_offset, &token); 3397 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, 3398 &token); 3399 } else { 3400 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); 3401 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, 3402 &token); 3403 } 3404 3405 btrfs_set_token_file_extent_offset(leaf, fi, 3406 em->start - em->orig_start, 3407 &token); 3408 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); 3409 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); 3410 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, 3411 &token); 3412 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); 3413 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); 3414 btrfs_mark_buffer_dirty(leaf); 3415 3416 btrfs_release_path(path); 3417 if (ret) { 3418 return ret; 3419 } 3420 3421 if (skip_csum) 3422 return 0; 3423 3424 /* 3425 * First check and see if our csums are on our outstanding ordered 3426 * extents. 3427 */ 3428 again: 3429 spin_lock_irq(&log->log_extents_lock[index]); 3430 list_for_each_entry(ordered, &log->logged_list[index], log_list) { 3431 struct btrfs_ordered_sum *sum; 3432 3433 if (!mod_len) 3434 break; 3435 3436 if (ordered->inode != inode) 3437 continue; 3438 3439 if (ordered->file_offset + ordered->len <= mod_start || 3440 mod_start + mod_len <= ordered->file_offset) 3441 continue; 3442 3443 /* 3444 * We are going to copy all the csums on this ordered extent, so 3445 * go ahead and adjust mod_start and mod_len in case this 3446 * ordered extent has already been logged. 3447 */ 3448 if (ordered->file_offset > mod_start) { 3449 if (ordered->file_offset + ordered->len >= 3450 mod_start + mod_len) 3451 mod_len = ordered->file_offset - mod_start; 3452 /* 3453 * If we have this case 3454 * 3455 * |--------- logged extent ---------| 3456 * |----- ordered extent ----| 3457 * 3458 * Just don't mess with mod_start and mod_len, we'll 3459 * just end up logging more csums than we need and it 3460 * will be ok. 3461 */ 3462 } else { 3463 if (ordered->file_offset + ordered->len < 3464 mod_start + mod_len) { 3465 mod_len = (mod_start + mod_len) - 3466 (ordered->file_offset + ordered->len); 3467 mod_start = ordered->file_offset + 3468 ordered->len; 3469 } else { 3470 mod_len = 0; 3471 } 3472 } 3473 3474 /* 3475 * To keep us from looping for the above case of an ordered 3476 * extent that falls inside of the logged extent. 3477 */ 3478 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, 3479 &ordered->flags)) 3480 continue; 3481 atomic_inc(&ordered->refs); 3482 spin_unlock_irq(&log->log_extents_lock[index]); 3483 /* 3484 * we've dropped the lock, we must either break or 3485 * start over after this. 3486 */ 3487 3488 wait_event(ordered->wait, ordered->csum_bytes_left == 0); 3489 3490 list_for_each_entry(sum, &ordered->list, list) { 3491 ret = btrfs_csum_file_blocks(trans, log, sum); 3492 if (ret) { 3493 btrfs_put_ordered_extent(ordered); 3494 goto unlocked; 3495 } 3496 } 3497 btrfs_put_ordered_extent(ordered); 3498 goto again; 3499 3500 } 3501 spin_unlock_irq(&log->log_extents_lock[index]); 3502 unlocked: 3503 3504 if (!mod_len || ret) 3505 return ret; 3506 3507 if (em->compress_type) { 3508 csum_offset = 0; 3509 csum_len = block_len; 3510 } else { 3511 csum_offset = mod_start - em->start; 3512 csum_len = mod_len; 3513 } 3514 3515 /* block start is already adjusted for the file extent offset. */ 3516 ret = btrfs_lookup_csums_range(log->fs_info->csum_root, 3517 em->block_start + csum_offset, 3518 em->block_start + csum_offset + 3519 csum_len - 1, &ordered_sums, 0); 3520 if (ret) 3521 return ret; 3522 3523 while (!list_empty(&ordered_sums)) { 3524 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 3525 struct btrfs_ordered_sum, 3526 list); 3527 if (!ret) 3528 ret = btrfs_csum_file_blocks(trans, log, sums); 3529 list_del(&sums->list); 3530 kfree(sums); 3531 } 3532 3533 return ret; 3534 } 3535 3536 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3537 struct btrfs_root *root, 3538 struct inode *inode, 3539 struct btrfs_path *path) 3540 { 3541 struct extent_map *em, *n; 3542 struct list_head extents; 3543 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3544 u64 test_gen; 3545 int ret = 0; 3546 int num = 0; 3547 3548 INIT_LIST_HEAD(&extents); 3549 3550 write_lock(&tree->lock); 3551 test_gen = root->fs_info->last_trans_committed; 3552 3553 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 3554 list_del_init(&em->list); 3555 3556 /* 3557 * Just an arbitrary number, this can be really CPU intensive 3558 * once we start getting a lot of extents, and really once we 3559 * have a bunch of extents we just want to commit since it will 3560 * be faster. 3561 */ 3562 if (++num > 32768) { 3563 list_del_init(&tree->modified_extents); 3564 ret = -EFBIG; 3565 goto process; 3566 } 3567 3568 if (em->generation <= test_gen) 3569 continue; 3570 /* Need a ref to keep it from getting evicted from cache */ 3571 atomic_inc(&em->refs); 3572 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 3573 list_add_tail(&em->list, &extents); 3574 num++; 3575 } 3576 3577 list_sort(NULL, &extents, extent_cmp); 3578 3579 process: 3580 while (!list_empty(&extents)) { 3581 em = list_entry(extents.next, struct extent_map, list); 3582 3583 list_del_init(&em->list); 3584 3585 /* 3586 * If we had an error we just need to delete everybody from our 3587 * private list. 3588 */ 3589 if (ret) { 3590 clear_em_logging(tree, em); 3591 free_extent_map(em); 3592 continue; 3593 } 3594 3595 write_unlock(&tree->lock); 3596 3597 ret = log_one_extent(trans, inode, root, em, path); 3598 write_lock(&tree->lock); 3599 clear_em_logging(tree, em); 3600 free_extent_map(em); 3601 } 3602 WARN_ON(!list_empty(&extents)); 3603 write_unlock(&tree->lock); 3604 3605 btrfs_release_path(path); 3606 return ret; 3607 } 3608 3609 /* log a single inode in the tree log. 3610 * At least one parent directory for this inode must exist in the tree 3611 * or be logged already. 3612 * 3613 * Any items from this inode changed by the current transaction are copied 3614 * to the log tree. An extra reference is taken on any extents in this 3615 * file, allowing us to avoid a whole pile of corner cases around logging 3616 * blocks that have been removed from the tree. 3617 * 3618 * See LOG_INODE_ALL and related defines for a description of what inode_only 3619 * does. 3620 * 3621 * This handles both files and directories. 3622 */ 3623 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3624 struct btrfs_root *root, struct inode *inode, 3625 int inode_only) 3626 { 3627 struct btrfs_path *path; 3628 struct btrfs_path *dst_path; 3629 struct btrfs_key min_key; 3630 struct btrfs_key max_key; 3631 struct btrfs_root *log = root->log_root; 3632 struct extent_buffer *src = NULL; 3633 int err = 0; 3634 int ret; 3635 int nritems; 3636 int ins_start_slot = 0; 3637 int ins_nr; 3638 bool fast_search = false; 3639 u64 ino = btrfs_ino(inode); 3640 3641 path = btrfs_alloc_path(); 3642 if (!path) 3643 return -ENOMEM; 3644 dst_path = btrfs_alloc_path(); 3645 if (!dst_path) { 3646 btrfs_free_path(path); 3647 return -ENOMEM; 3648 } 3649 3650 min_key.objectid = ino; 3651 min_key.type = BTRFS_INODE_ITEM_KEY; 3652 min_key.offset = 0; 3653 3654 max_key.objectid = ino; 3655 3656 3657 /* today the code can only do partial logging of directories */ 3658 if (S_ISDIR(inode->i_mode) || 3659 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3660 &BTRFS_I(inode)->runtime_flags) && 3661 inode_only == LOG_INODE_EXISTS)) 3662 max_key.type = BTRFS_XATTR_ITEM_KEY; 3663 else 3664 max_key.type = (u8)-1; 3665 max_key.offset = (u64)-1; 3666 3667 /* Only run delayed items if we are a dir or a new file */ 3668 if (S_ISDIR(inode->i_mode) || 3669 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { 3670 ret = btrfs_commit_inode_delayed_items(trans, inode); 3671 if (ret) { 3672 btrfs_free_path(path); 3673 btrfs_free_path(dst_path); 3674 return ret; 3675 } 3676 } 3677 3678 mutex_lock(&BTRFS_I(inode)->log_mutex); 3679 3680 btrfs_get_logged_extents(log, inode); 3681 3682 /* 3683 * a brute force approach to making sure we get the most uptodate 3684 * copies of everything. 3685 */ 3686 if (S_ISDIR(inode->i_mode)) { 3687 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 3688 3689 if (inode_only == LOG_INODE_EXISTS) 3690 max_key_type = BTRFS_XATTR_ITEM_KEY; 3691 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 3692 } else { 3693 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3694 &BTRFS_I(inode)->runtime_flags)) { 3695 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3696 &BTRFS_I(inode)->runtime_flags); 3697 ret = btrfs_truncate_inode_items(trans, log, 3698 inode, 0, 0); 3699 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3700 &BTRFS_I(inode)->runtime_flags) || 3701 inode_only == LOG_INODE_EXISTS) { 3702 if (inode_only == LOG_INODE_ALL) 3703 fast_search = true; 3704 max_key.type = BTRFS_XATTR_ITEM_KEY; 3705 ret = drop_objectid_items(trans, log, path, ino, 3706 max_key.type); 3707 } else { 3708 if (inode_only == LOG_INODE_ALL) 3709 fast_search = true; 3710 ret = log_inode_item(trans, log, dst_path, inode); 3711 if (ret) { 3712 err = ret; 3713 goto out_unlock; 3714 } 3715 goto log_extents; 3716 } 3717 3718 } 3719 if (ret) { 3720 err = ret; 3721 goto out_unlock; 3722 } 3723 path->keep_locks = 1; 3724 3725 while (1) { 3726 ins_nr = 0; 3727 ret = btrfs_search_forward(root, &min_key, 3728 path, trans->transid); 3729 if (ret != 0) 3730 break; 3731 again: 3732 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 3733 if (min_key.objectid != ino) 3734 break; 3735 if (min_key.type > max_key.type) 3736 break; 3737 3738 src = path->nodes[0]; 3739 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 3740 ins_nr++; 3741 goto next_slot; 3742 } else if (!ins_nr) { 3743 ins_start_slot = path->slots[0]; 3744 ins_nr = 1; 3745 goto next_slot; 3746 } 3747 3748 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3749 ins_nr, inode_only); 3750 if (ret) { 3751 err = ret; 3752 goto out_unlock; 3753 } 3754 ins_nr = 1; 3755 ins_start_slot = path->slots[0]; 3756 next_slot: 3757 3758 nritems = btrfs_header_nritems(path->nodes[0]); 3759 path->slots[0]++; 3760 if (path->slots[0] < nritems) { 3761 btrfs_item_key_to_cpu(path->nodes[0], &min_key, 3762 path->slots[0]); 3763 goto again; 3764 } 3765 if (ins_nr) { 3766 ret = copy_items(trans, inode, dst_path, src, 3767 ins_start_slot, 3768 ins_nr, inode_only); 3769 if (ret) { 3770 err = ret; 3771 goto out_unlock; 3772 } 3773 ins_nr = 0; 3774 } 3775 btrfs_release_path(path); 3776 3777 if (min_key.offset < (u64)-1) { 3778 min_key.offset++; 3779 } else if (min_key.type < max_key.type) { 3780 min_key.type++; 3781 min_key.offset = 0; 3782 } else { 3783 break; 3784 } 3785 } 3786 if (ins_nr) { 3787 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3788 ins_nr, inode_only); 3789 if (ret) { 3790 err = ret; 3791 goto out_unlock; 3792 } 3793 ins_nr = 0; 3794 } 3795 3796 log_extents: 3797 btrfs_release_path(path); 3798 btrfs_release_path(dst_path); 3799 if (fast_search) { 3800 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 3801 if (ret) { 3802 err = ret; 3803 goto out_unlock; 3804 } 3805 } else if (inode_only == LOG_INODE_ALL) { 3806 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3807 struct extent_map *em, *n; 3808 3809 write_lock(&tree->lock); 3810 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3811 list_del_init(&em->list); 3812 write_unlock(&tree->lock); 3813 } 3814 3815 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3816 ret = log_directory_changes(trans, root, inode, path, dst_path); 3817 if (ret) { 3818 err = ret; 3819 goto out_unlock; 3820 } 3821 } 3822 BTRFS_I(inode)->logged_trans = trans->transid; 3823 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 3824 out_unlock: 3825 if (err) 3826 btrfs_free_logged_extents(log, log->log_transid); 3827 mutex_unlock(&BTRFS_I(inode)->log_mutex); 3828 3829 btrfs_free_path(path); 3830 btrfs_free_path(dst_path); 3831 return err; 3832 } 3833 3834 /* 3835 * follow the dentry parent pointers up the chain and see if any 3836 * of the directories in it require a full commit before they can 3837 * be logged. Returns zero if nothing special needs to be done or 1 if 3838 * a full commit is required. 3839 */ 3840 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, 3841 struct inode *inode, 3842 struct dentry *parent, 3843 struct super_block *sb, 3844 u64 last_committed) 3845 { 3846 int ret = 0; 3847 struct btrfs_root *root; 3848 struct dentry *old_parent = NULL; 3849 struct inode *orig_inode = inode; 3850 3851 /* 3852 * for regular files, if its inode is already on disk, we don't 3853 * have to worry about the parents at all. This is because 3854 * we can use the last_unlink_trans field to record renames 3855 * and other fun in this file. 3856 */ 3857 if (S_ISREG(inode->i_mode) && 3858 BTRFS_I(inode)->generation <= last_committed && 3859 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3860 goto out; 3861 3862 if (!S_ISDIR(inode->i_mode)) { 3863 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3864 goto out; 3865 inode = parent->d_inode; 3866 } 3867 3868 while (1) { 3869 /* 3870 * If we are logging a directory then we start with our inode, 3871 * not our parents inode, so we need to skipp setting the 3872 * logged_trans so that further down in the log code we don't 3873 * think this inode has already been logged. 3874 */ 3875 if (inode != orig_inode) 3876 BTRFS_I(inode)->logged_trans = trans->transid; 3877 smp_mb(); 3878 3879 if (BTRFS_I(inode)->last_unlink_trans > last_committed) { 3880 root = BTRFS_I(inode)->root; 3881 3882 /* 3883 * make sure any commits to the log are forced 3884 * to be full commits 3885 */ 3886 root->fs_info->last_trans_log_full_commit = 3887 trans->transid; 3888 ret = 1; 3889 break; 3890 } 3891 3892 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3893 break; 3894 3895 if (IS_ROOT(parent)) 3896 break; 3897 3898 parent = dget_parent(parent); 3899 dput(old_parent); 3900 old_parent = parent; 3901 inode = parent->d_inode; 3902 3903 } 3904 dput(old_parent); 3905 out: 3906 return ret; 3907 } 3908 3909 /* 3910 * helper function around btrfs_log_inode to make sure newly created 3911 * parent directories also end up in the log. A minimal inode and backref 3912 * only logging is done of any parent directories that are older than 3913 * the last committed transaction 3914 */ 3915 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 3916 struct btrfs_root *root, struct inode *inode, 3917 struct dentry *parent, int exists_only) 3918 { 3919 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 3920 struct super_block *sb; 3921 struct dentry *old_parent = NULL; 3922 int ret = 0; 3923 u64 last_committed = root->fs_info->last_trans_committed; 3924 3925 sb = inode->i_sb; 3926 3927 if (btrfs_test_opt(root, NOTREELOG)) { 3928 ret = 1; 3929 goto end_no_trans; 3930 } 3931 3932 if (root->fs_info->last_trans_log_full_commit > 3933 root->fs_info->last_trans_committed) { 3934 ret = 1; 3935 goto end_no_trans; 3936 } 3937 3938 if (root != BTRFS_I(inode)->root || 3939 btrfs_root_refs(&root->root_item) == 0) { 3940 ret = 1; 3941 goto end_no_trans; 3942 } 3943 3944 ret = check_parent_dirs_for_sync(trans, inode, parent, 3945 sb, last_committed); 3946 if (ret) 3947 goto end_no_trans; 3948 3949 if (btrfs_inode_in_log(inode, trans->transid)) { 3950 ret = BTRFS_NO_LOG_SYNC; 3951 goto end_no_trans; 3952 } 3953 3954 ret = start_log_trans(trans, root); 3955 if (ret) 3956 goto end_trans; 3957 3958 ret = btrfs_log_inode(trans, root, inode, inode_only); 3959 if (ret) 3960 goto end_trans; 3961 3962 /* 3963 * for regular files, if its inode is already on disk, we don't 3964 * have to worry about the parents at all. This is because 3965 * we can use the last_unlink_trans field to record renames 3966 * and other fun in this file. 3967 */ 3968 if (S_ISREG(inode->i_mode) && 3969 BTRFS_I(inode)->generation <= last_committed && 3970 BTRFS_I(inode)->last_unlink_trans <= last_committed) { 3971 ret = 0; 3972 goto end_trans; 3973 } 3974 3975 inode_only = LOG_INODE_EXISTS; 3976 while (1) { 3977 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 3978 break; 3979 3980 inode = parent->d_inode; 3981 if (root != BTRFS_I(inode)->root) 3982 break; 3983 3984 if (BTRFS_I(inode)->generation > 3985 root->fs_info->last_trans_committed) { 3986 ret = btrfs_log_inode(trans, root, inode, inode_only); 3987 if (ret) 3988 goto end_trans; 3989 } 3990 if (IS_ROOT(parent)) 3991 break; 3992 3993 parent = dget_parent(parent); 3994 dput(old_parent); 3995 old_parent = parent; 3996 } 3997 ret = 0; 3998 end_trans: 3999 dput(old_parent); 4000 if (ret < 0) { 4001 root->fs_info->last_trans_log_full_commit = trans->transid; 4002 ret = 1; 4003 } 4004 btrfs_end_log_trans(root); 4005 end_no_trans: 4006 return ret; 4007 } 4008 4009 /* 4010 * it is not safe to log dentry if the chunk root has added new 4011 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 4012 * If this returns 1, you must commit the transaction to safely get your 4013 * data on disk. 4014 */ 4015 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4016 struct btrfs_root *root, struct dentry *dentry) 4017 { 4018 struct dentry *parent = dget_parent(dentry); 4019 int ret; 4020 4021 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); 4022 dput(parent); 4023 4024 return ret; 4025 } 4026 4027 /* 4028 * should be called during mount to recover any replay any log trees 4029 * from the FS 4030 */ 4031 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 4032 { 4033 int ret; 4034 struct btrfs_path *path; 4035 struct btrfs_trans_handle *trans; 4036 struct btrfs_key key; 4037 struct btrfs_key found_key; 4038 struct btrfs_key tmp_key; 4039 struct btrfs_root *log; 4040 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 4041 struct walk_control wc = { 4042 .process_func = process_one_buffer, 4043 .stage = 0, 4044 }; 4045 4046 path = btrfs_alloc_path(); 4047 if (!path) 4048 return -ENOMEM; 4049 4050 fs_info->log_root_recovering = 1; 4051 4052 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4053 if (IS_ERR(trans)) { 4054 ret = PTR_ERR(trans); 4055 goto error; 4056 } 4057 4058 wc.trans = trans; 4059 wc.pin = 1; 4060 4061 ret = walk_log_tree(trans, log_root_tree, &wc); 4062 if (ret) { 4063 btrfs_error(fs_info, ret, "Failed to pin buffers while " 4064 "recovering log root tree."); 4065 goto error; 4066 } 4067 4068 again: 4069 key.objectid = BTRFS_TREE_LOG_OBJECTID; 4070 key.offset = (u64)-1; 4071 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 4072 4073 while (1) { 4074 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 4075 4076 if (ret < 0) { 4077 btrfs_error(fs_info, ret, 4078 "Couldn't find tree log root."); 4079 goto error; 4080 } 4081 if (ret > 0) { 4082 if (path->slots[0] == 0) 4083 break; 4084 path->slots[0]--; 4085 } 4086 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4087 path->slots[0]); 4088 btrfs_release_path(path); 4089 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4090 break; 4091 4092 log = btrfs_read_fs_root(log_root_tree, &found_key); 4093 if (IS_ERR(log)) { 4094 ret = PTR_ERR(log); 4095 btrfs_error(fs_info, ret, 4096 "Couldn't read tree log root."); 4097 goto error; 4098 } 4099 4100 tmp_key.objectid = found_key.offset; 4101 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 4102 tmp_key.offset = (u64)-1; 4103 4104 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 4105 if (IS_ERR(wc.replay_dest)) { 4106 ret = PTR_ERR(wc.replay_dest); 4107 free_extent_buffer(log->node); 4108 free_extent_buffer(log->commit_root); 4109 kfree(log); 4110 btrfs_error(fs_info, ret, "Couldn't read target root " 4111 "for tree log recovery."); 4112 goto error; 4113 } 4114 4115 wc.replay_dest->log_root = log; 4116 btrfs_record_root_in_trans(trans, wc.replay_dest); 4117 ret = walk_log_tree(trans, log, &wc); 4118 4119 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 4120 ret = fixup_inode_link_counts(trans, wc.replay_dest, 4121 path); 4122 } 4123 4124 key.offset = found_key.offset - 1; 4125 wc.replay_dest->log_root = NULL; 4126 free_extent_buffer(log->node); 4127 free_extent_buffer(log->commit_root); 4128 kfree(log); 4129 4130 if (ret) 4131 goto error; 4132 4133 if (found_key.offset == 0) 4134 break; 4135 } 4136 btrfs_release_path(path); 4137 4138 /* step one is to pin it all, step two is to replay just inodes */ 4139 if (wc.pin) { 4140 wc.pin = 0; 4141 wc.process_func = replay_one_buffer; 4142 wc.stage = LOG_WALK_REPLAY_INODES; 4143 goto again; 4144 } 4145 /* step three is to replay everything */ 4146 if (wc.stage < LOG_WALK_REPLAY_ALL) { 4147 wc.stage++; 4148 goto again; 4149 } 4150 4151 btrfs_free_path(path); 4152 4153 /* step 4: commit the transaction, which also unpins the blocks */ 4154 ret = btrfs_commit_transaction(trans, fs_info->tree_root); 4155 if (ret) 4156 return ret; 4157 4158 free_extent_buffer(log_root_tree->node); 4159 log_root_tree->log_root = NULL; 4160 fs_info->log_root_recovering = 0; 4161 kfree(log_root_tree); 4162 4163 return 0; 4164 error: 4165 if (wc.trans) 4166 btrfs_end_transaction(wc.trans, fs_info->tree_root); 4167 btrfs_free_path(path); 4168 return ret; 4169 } 4170 4171 /* 4172 * there are some corner cases where we want to force a full 4173 * commit instead of allowing a directory to be logged. 4174 * 4175 * They revolve around files there were unlinked from the directory, and 4176 * this function updates the parent directory so that a full commit is 4177 * properly done if it is fsync'd later after the unlinks are done. 4178 */ 4179 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 4180 struct inode *dir, struct inode *inode, 4181 int for_rename) 4182 { 4183 /* 4184 * when we're logging a file, if it hasn't been renamed 4185 * or unlinked, and its inode is fully committed on disk, 4186 * we don't have to worry about walking up the directory chain 4187 * to log its parents. 4188 * 4189 * So, we use the last_unlink_trans field to put this transid 4190 * into the file. When the file is logged we check it and 4191 * don't log the parents if the file is fully on disk. 4192 */ 4193 if (S_ISREG(inode->i_mode)) 4194 BTRFS_I(inode)->last_unlink_trans = trans->transid; 4195 4196 /* 4197 * if this directory was already logged any new 4198 * names for this file/dir will get recorded 4199 */ 4200 smp_mb(); 4201 if (BTRFS_I(dir)->logged_trans == trans->transid) 4202 return; 4203 4204 /* 4205 * if the inode we're about to unlink was logged, 4206 * the log will be properly updated for any new names 4207 */ 4208 if (BTRFS_I(inode)->logged_trans == trans->transid) 4209 return; 4210 4211 /* 4212 * when renaming files across directories, if the directory 4213 * there we're unlinking from gets fsync'd later on, there's 4214 * no way to find the destination directory later and fsync it 4215 * properly. So, we have to be conservative and force commits 4216 * so the new name gets discovered. 4217 */ 4218 if (for_rename) 4219 goto record; 4220 4221 /* we can safely do the unlink without any special recording */ 4222 return; 4223 4224 record: 4225 BTRFS_I(dir)->last_unlink_trans = trans->transid; 4226 } 4227 4228 /* 4229 * Call this after adding a new name for a file and it will properly 4230 * update the log to reflect the new name. 4231 * 4232 * It will return zero if all goes well, and it will return 1 if a 4233 * full transaction commit is required. 4234 */ 4235 int btrfs_log_new_name(struct btrfs_trans_handle *trans, 4236 struct inode *inode, struct inode *old_dir, 4237 struct dentry *parent) 4238 { 4239 struct btrfs_root * root = BTRFS_I(inode)->root; 4240 4241 /* 4242 * this will force the logging code to walk the dentry chain 4243 * up for the file 4244 */ 4245 if (S_ISREG(inode->i_mode)) 4246 BTRFS_I(inode)->last_unlink_trans = trans->transid; 4247 4248 /* 4249 * if this inode hasn't been logged and directory we're renaming it 4250 * from hasn't been logged, we don't need to log it 4251 */ 4252 if (BTRFS_I(inode)->logged_trans <= 4253 root->fs_info->last_trans_committed && 4254 (!old_dir || BTRFS_I(old_dir)->logged_trans <= 4255 root->fs_info->last_trans_committed)) 4256 return 0; 4257 4258 return btrfs_log_inode_parent(trans, root, inode, parent, 1); 4259 } 4260 4261