// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"

/*
 * magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

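/*
 * The remaining two values are used for inodes we log only because they
 * conflict with the inode actually being fsynced (a summary inferred from
 * the call sites later in this file): LOG_OTHER_INODE logs just enough,
 * like LOG_INODE_EXISTS, while LOG_OTHER_INODE_ALL logs everything, like
 * LOG_INODE_ALL.
 */
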
/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first stage (0) is to only pin down
 * the blocks we find.  The second stage (1) is to make sure that all the
 * inodes we find in the log are created in the subvolume.  The third
 * stage (2) replays directory index items.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is walked once per replay stage: once to pin down all the
 * extents it is using in RAM, once to create all the inodes logged in the
 * tree, once to replay directory index items and once to do all the other
 * items.
 */

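/*
 * A rough sketch of the fsync path that ends up in this file (the top
 * level caller btrfs_sync_file() lives in file.c; names current at the
 * time of writing):
 *
 *	btrfs_sync_file()
 *	    btrfs_log_dentry_safe()	copy the inode into the log tree
 *	        start_log_trans()	join or create the log transaction
 *	    btrfs_sync_log()		write the log tree and superblock
 */
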
/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = -EAGAIN;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;

		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

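/*
 * Note on the "% 2" indexing above: at most two log transactions can be
 * in flight for a root at any time (the one being committed and the one
 * new writers join), so per-transaction state such as log_commit[] and
 * log_ctxs[] is double buffered and indexed by log_transid % 2.
 */
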
/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

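/*
 * Typical pairing (sketch): callers that must keep the current log
 * transaction alive while touching the log outside of an fsync, for
 * example when logging a new name during rename or link, bracket the
 * work like this:
 *
 *	btrfs_pin_log_trans(root);
 *	... update the log tree ...
 *	btrfs_end_log_trans(root);
 */
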
static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

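/*
 * A minimal sketch of how a walk is driven with process_one_buffer()
 * during the pin-only stage of log replay (the real setup lives in the
 * log recovery code later in this file):
 *
 *	struct walk_control wc = {
 *		.process_func = process_one_buffer,
 *		.stage = LOG_WALK_PIN_ONLY,
 *		.pin = 1,
 *		.trans = trans,
 *	};
 *
 *	ret = walk_log_tree(trans, log_root, &wc);
 */
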
/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;

		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

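/*
 * overwrite_item() is the generic workhorse of replay: most item types
 * coming out of the log (inode items, xattrs, inode refs after conflict
 * resolution, ...) end up copied into the subvolume through it, called
 * from the replay code later in this file.
 */
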
/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our extent
			 * starting at an offset of 40K or higher, will end up
			 * looking at the second csum item only, which does not
			 * contain the checksum for any block starting at
			 * offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;

				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							      fs_info->csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
							fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				 name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

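/*
 * Resolve conflicts between a back reference found in the log and the
 * current state of the subvolume: every name for this inode (in old
 * style refs, extended refs, or dir items with a conflicting sequence
 * number or name) that is not also present in the log gets unlinked
 * before the logged reference is written back.
 *
 * Returns 1 if we tried to overwrite a back reference of the root
 * directory (nothing to do in that case), 0 on success and < 0 on
 * error.
 */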
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have already searched the root tree and checked the
		 * corresponding ref, no need to check it again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

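/*
 * Key layout recap for the two reference flavours parsed above:
 *
 *	(inode objectid, BTRFS_INODE_REF_KEY,    parent dir objectid)
 *	(inode objectid, BTRFS_INODE_EXTREF_KEY, btrfs_extref_hash(parent, name, len))
 *
 * Since the extref key offset is only a hash, the parent objectid must
 * be read from the item body (extref_get_fields()) rather than from the
 * key itself.
 */
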
/*
 * Take an inode reference item from the log tree and iterate all names from
 * the inode reference item in the subvolume tree with the same key (if it
 * exists).  For any name that is not in the inode reference item from the log
 * tree, do a proper unlink of that name (that is, remove its entry from the
 * inode reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}

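/*
 * Fixup entries are plain orphan items under a reserved objectid, so an
 * inode 258 that is pending a link count check is recorded as:
 *
 *	key (BTRFS_TREE_LOG_FIXUP_OBJECTID ORPHAN_ITEM 258)
 */
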
/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.  The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    char *name, int name_len,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     name_len, 1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	char *name;
	int name_len;
	struct btrfs_dir_item *dst_di;
	struct btrfs_key found_key;
	struct btrfs_key log_key;
	struct inode *dir;
	u8 log_type;
	bool exists;
	int ret;
	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	name_len = btrfs_dir_name_len(eb, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	log_type = btrfs_dir_type(eb, di);
	read_extent_buffer(eb, name, (unsigned long)(di + 1),
			   name_len);

	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	btrfs_release_path(path);
	if (ret < 0)
		goto out;
	exists = (ret == 0);
	ret = 0;

	if (key->type == BTRFS_DIR_ITEM_KEY) {
		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					       name, name_len, 1);
	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						     key->objectid,
						     key->offset, name,
						     name_len, 1);
	} else {
		/* Corruption */
		ret = -EINVAL;
		goto out;
	}

	if (IS_ERR(dst_di)) {
		ret = PTR_ERR(dst_di);
		goto out;
	} else if (!dst_di) {
		/* we need a sequence number to insert, so we only
		 * do inserts for the BTRFS_DIR_INDEX_KEY types
		 */
		if (key->type != BTRFS_DIR_INDEX_KEY)
			goto out;
		goto insert;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* the existing item matches the logged item */
	if (found_key.objectid == log_key.objectid &&
	    found_key.type == log_key.type &&
	    found_key.offset == log_key.offset &&
	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
		update_size = false;
		goto out;
	}

	/*
	 * don't drop the conflicting directory entry if the inode
	 * for the new entry doesn't exist
	 */
	if (!exists)
		goto out;

	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
	if (ret)
		goto out;

	if (key->type == BTRFS_DIR_INDEX_KEY)
		goto insert;
out:
	btrfs_release_path(path);
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
	}
	kfree(name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;

insert:
	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_REF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}

	found_key.objectid = log_key.objectid;
	found_key.type = BTRFS_INODE_EXTREF_KEY;
	found_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
			     name_len);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      name, name_len, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;
	goto out;
}

*/ 2055 ret = 0; 2056 update_size = false; 2057 goto out; 2058 } 2059 2060 found_key.objectid = log_key.objectid; 2061 found_key.type = BTRFS_INODE_EXTREF_KEY; 2062 found_key.offset = key->objectid; 2063 ret = backref_in_log(root->log_root, &found_key, key->objectid, name, 2064 name_len); 2065 if (ret < 0) { 2066 goto out; 2067 } else if (ret) { 2068 /* The dentry will be added later. */ 2069 ret = 0; 2070 update_size = false; 2071 goto out; 2072 } 2073 btrfs_release_path(path); 2074 ret = insert_one_name(trans, root, key->objectid, key->offset, 2075 name, name_len, &log_key); 2076 if (ret && ret != -ENOENT && ret != -EEXIST) 2077 goto out; 2078 if (!ret) 2079 name_added = true; 2080 update_size = false; 2081 ret = 0; 2082 goto out; 2083 } 2084 2085 /* 2086 * find all the names in a directory item and reconcile them into 2087 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than 2088 * one name in a directory item, but the same code gets used for 2089 * both key types 2090 */ 2091 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 2092 struct btrfs_root *root, 2093 struct btrfs_path *path, 2094 struct extent_buffer *eb, int slot, 2095 struct btrfs_key *key) 2096 { 2097 int ret = 0; 2098 u32 item_size = btrfs_item_size_nr(eb, slot); 2099 struct btrfs_dir_item *di; 2100 int name_len; 2101 unsigned long ptr; 2102 unsigned long ptr_end; 2103 struct btrfs_path *fixup_path = NULL; 2104 2105 ptr = btrfs_item_ptr_offset(eb, slot); 2106 ptr_end = ptr + item_size; 2107 while (ptr < ptr_end) { 2108 di = (struct btrfs_dir_item *)ptr; 2109 name_len = btrfs_dir_name_len(eb, di); 2110 ret = replay_one_name(trans, root, path, eb, di, key); 2111 if (ret < 0) 2112 break; 2113 ptr = (unsigned long)(di + 1); 2114 ptr += name_len; 2115 2116 /* 2117 * If this entry refers to a non-directory (directories cannot 2118 * have a link count > 1) and it was added in a transaction 2119 * that was not committed, make sure we fix up the link count of 2120 * the inode the entry points to. Otherwise something like 2121 * the following would result in a directory pointing to an 2122 * inode with a wrong link count that does not account for this dir 2123 * entry: 2124 * 2125 * mkdir testdir 2126 * touch testdir/foo 2127 * touch testdir/bar 2128 * sync 2129 * 2130 * ln testdir/bar testdir/bar_link 2131 * ln testdir/foo testdir/foo_link 2132 * xfs_io -c "fsync" testdir/bar 2133 * 2134 * <power failure> 2135 * 2136 * mount fs, log replay happens 2137 * 2138 * File foo would remain with a link count of 1 when it has two 2139 * entries pointing to it in the directory testdir. This would 2140 * make it impossible to ever delete the parent directory, as 2141 * it would result in stale dentries that can never be deleted. 2142 */ 2143 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2144 struct btrfs_key di_key; 2145 2146 if (!fixup_path) { 2147 fixup_path = btrfs_alloc_path(); 2148 if (!fixup_path) { 2149 ret = -ENOMEM; 2150 break; 2151 } 2152 } 2153 2154 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2155 ret = link_to_fixup_dir(trans, root, fixup_path, 2156 di_key.objectid); 2157 if (ret) 2158 break; 2159 } 2160 ret = 0; 2161 } 2162 btrfs_free_path(fixup_path); 2163 return ret; 2164 } 2165 2166 /* 2167 * directory replay has two parts. There are the standard directory 2168 * items in the log copied from the subvolume, and range items 2169 * created in the log while the subvolume was logged.
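 * (A range item is a BTRFS_DIR_LOG_ITEM_KEY or BTRFS_DIR_LOG_INDEX_KEY
 * item whose key offset is the first offset it covers and whose
 * dir_log_end field is the last one; see insert_dir_log_key().)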
2170 * 2171 * The range items tell us which parts of the key space the log 2172 * is authoritative for. During replay, if a key in the subvolume 2173 * directory is in a logged range item, but not actually in the log 2174 * that means it was deleted from the directory before the fsync 2175 * and should be removed. 2176 */ 2177 static noinline int find_dir_range(struct btrfs_root *root, 2178 struct btrfs_path *path, 2179 u64 dirid, int key_type, 2180 u64 *start_ret, u64 *end_ret) 2181 { 2182 struct btrfs_key key; 2183 u64 found_end; 2184 struct btrfs_dir_log_item *item; 2185 int ret; 2186 int nritems; 2187 2188 if (*start_ret == (u64)-1) 2189 return 1; 2190 2191 key.objectid = dirid; 2192 key.type = key_type; 2193 key.offset = *start_ret; 2194 2195 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2196 if (ret < 0) 2197 goto out; 2198 if (ret > 0) { 2199 if (path->slots[0] == 0) 2200 goto out; 2201 path->slots[0]--; 2202 } 2203 if (ret != 0) 2204 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2205 2206 if (key.type != key_type || key.objectid != dirid) { 2207 ret = 1; 2208 goto next; 2209 } 2210 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2211 struct btrfs_dir_log_item); 2212 found_end = btrfs_dir_log_end(path->nodes[0], item); 2213 2214 if (*start_ret >= key.offset && *start_ret <= found_end) { 2215 ret = 0; 2216 *start_ret = key.offset; 2217 *end_ret = found_end; 2218 goto out; 2219 } 2220 ret = 1; 2221 next: 2222 /* check the next slot in the tree to see if it is a valid item */ 2223 nritems = btrfs_header_nritems(path->nodes[0]); 2224 path->slots[0]++; 2225 if (path->slots[0] >= nritems) { 2226 ret = btrfs_next_leaf(root, path); 2227 if (ret) 2228 goto out; 2229 } 2230 2231 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2232 2233 if (key.type != key_type || key.objectid != dirid) { 2234 ret = 1; 2235 goto out; 2236 } 2237 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2238 struct btrfs_dir_log_item); 2239 found_end = btrfs_dir_log_end(path->nodes[0], item); 2240 *start_ret = key.offset; 2241 *end_ret = found_end; 2242 ret = 0; 2243 out: 2244 btrfs_release_path(path); 2245 return ret; 2246 } 2247 2248 /* 2249 * this looks for a given directory item in the log. 
If the directory 2250 * item is not in the log, the item is removed and the inode it points 2251 * to is unlinked 2252 */ 2253 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2254 struct btrfs_root *root, 2255 struct btrfs_root *log, 2256 struct btrfs_path *path, 2257 struct btrfs_path *log_path, 2258 struct inode *dir, 2259 struct btrfs_key *dir_key) 2260 { 2261 int ret; 2262 struct extent_buffer *eb; 2263 int slot; 2264 u32 item_size; 2265 struct btrfs_dir_item *di; 2266 struct btrfs_dir_item *log_di; 2267 int name_len; 2268 unsigned long ptr; 2269 unsigned long ptr_end; 2270 char *name; 2271 struct inode *inode; 2272 struct btrfs_key location; 2273 2274 again: 2275 eb = path->nodes[0]; 2276 slot = path->slots[0]; 2277 item_size = btrfs_item_size_nr(eb, slot); 2278 ptr = btrfs_item_ptr_offset(eb, slot); 2279 ptr_end = ptr + item_size; 2280 while (ptr < ptr_end) { 2281 di = (struct btrfs_dir_item *)ptr; 2282 name_len = btrfs_dir_name_len(eb, di); 2283 name = kmalloc(name_len, GFP_NOFS); 2284 if (!name) { 2285 ret = -ENOMEM; 2286 goto out; 2287 } 2288 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2289 name_len); 2290 log_di = NULL; 2291 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) { 2292 log_di = btrfs_lookup_dir_item(trans, log, log_path, 2293 dir_key->objectid, 2294 name, name_len, 0); 2295 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) { 2296 log_di = btrfs_lookup_dir_index_item(trans, log, 2297 log_path, 2298 dir_key->objectid, 2299 dir_key->offset, 2300 name, name_len, 0); 2301 } 2302 if (!log_di) { 2303 btrfs_dir_item_key_to_cpu(eb, di, &location); 2304 btrfs_release_path(path); 2305 btrfs_release_path(log_path); 2306 inode = read_one_inode(root, location.objectid); 2307 if (!inode) { 2308 kfree(name); 2309 return -EIO; 2310 } 2311 2312 ret = link_to_fixup_dir(trans, root, 2313 path, location.objectid); 2314 if (ret) { 2315 kfree(name); 2316 iput(inode); 2317 goto out; 2318 } 2319 2320 inc_nlink(inode); 2321 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), 2322 BTRFS_I(inode), name, name_len); 2323 if (!ret) 2324 ret = btrfs_run_delayed_items(trans); 2325 kfree(name); 2326 iput(inode); 2327 if (ret) 2328 goto out; 2329 2330 /* there might still be more names under this key 2331 * check and repeat if required 2332 */ 2333 ret = btrfs_search_slot(NULL, root, dir_key, path, 2334 0, 0); 2335 if (ret == 0) 2336 goto again; 2337 ret = 0; 2338 goto out; 2339 } else if (IS_ERR(log_di)) { 2340 kfree(name); 2341 return PTR_ERR(log_di); 2342 } 2343 btrfs_release_path(log_path); 2344 kfree(name); 2345 2346 ptr = (unsigned long)(di + 1); 2347 ptr += name_len; 2348 } 2349 ret = 0; 2350 out: 2351 btrfs_release_path(path); 2352 btrfs_release_path(log_path); 2353 return ret; 2354 } 2355 2356 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2357 struct btrfs_root *root, 2358 struct btrfs_root *log, 2359 struct btrfs_path *path, 2360 const u64 ino) 2361 { 2362 struct btrfs_key search_key; 2363 struct btrfs_path *log_path; 2364 int i; 2365 int nritems; 2366 int ret; 2367 2368 log_path = btrfs_alloc_path(); 2369 if (!log_path) 2370 return -ENOMEM; 2371 2372 search_key.objectid = ino; 2373 search_key.type = BTRFS_XATTR_ITEM_KEY; 2374 search_key.offset = 0; 2375 again: 2376 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2377 if (ret < 0) 2378 goto out; 2379 process_leaf: 2380 nritems = btrfs_header_nritems(path->nodes[0]); 2381 for (i = path->slots[0]; i < nritems; i++) { 2382 struct btrfs_key key; 2383 struct btrfs_dir_item 
*di; 2384 struct btrfs_dir_item *log_di; 2385 u32 total_size; 2386 u32 cur; 2387 2388 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2389 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2390 ret = 0; 2391 goto out; 2392 } 2393 2394 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2395 total_size = btrfs_item_size_nr(path->nodes[0], i); 2396 cur = 0; 2397 while (cur < total_size) { 2398 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2399 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2400 u32 this_len = sizeof(*di) + name_len + data_len; 2401 char *name; 2402 2403 name = kmalloc(name_len, GFP_NOFS); 2404 if (!name) { 2405 ret = -ENOMEM; 2406 goto out; 2407 } 2408 read_extent_buffer(path->nodes[0], name, 2409 (unsigned long)(di + 1), name_len); 2410 2411 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2412 name, name_len, 0); 2413 btrfs_release_path(log_path); 2414 if (!log_di) { 2415 /* Doesn't exist in log tree, so delete it. */ 2416 btrfs_release_path(path); 2417 di = btrfs_lookup_xattr(trans, root, path, ino, 2418 name, name_len, -1); 2419 kfree(name); 2420 if (IS_ERR(di)) { 2421 ret = PTR_ERR(di); 2422 goto out; 2423 } 2424 ASSERT(di); 2425 ret = btrfs_delete_one_dir_name(trans, root, 2426 path, di); 2427 if (ret) 2428 goto out; 2429 btrfs_release_path(path); 2430 search_key = key; 2431 goto again; 2432 } 2433 kfree(name); 2434 if (IS_ERR(log_di)) { 2435 ret = PTR_ERR(log_di); 2436 goto out; 2437 } 2438 cur += this_len; 2439 di = (struct btrfs_dir_item *)((char *)di + this_len); 2440 } 2441 } 2442 ret = btrfs_next_leaf(root, path); 2443 if (ret > 0) 2444 ret = 0; 2445 else if (ret == 0) 2446 goto process_leaf; 2447 out: 2448 btrfs_free_path(log_path); 2449 btrfs_release_path(path); 2450 return ret; 2451 } 2452 2453 2454 /* 2455 * deletion replay happens before we copy any new directory items 2456 * out of the log or out of backreferences from inodes. It 2457 * scans the log to find ranges of keys that log is authoritative for, 2458 * and then scans the directory to find items in those ranges that are 2459 * not present in the log. 2460 * 2461 * Anything we don't find in the log is unlinked and removed from the 2462 * directory. 
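 *
 * For example (illustrative offsets): if a dir log item says the log is
 * authoritative for index offsets 2..10 and the subvolume directory has
 * an index item at offset 7 with no matching item in the log, that
 * entry was deleted before the fsync and is unlinked during replay.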
2463 */ 2464 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2465 struct btrfs_root *root, 2466 struct btrfs_root *log, 2467 struct btrfs_path *path, 2468 u64 dirid, int del_all) 2469 { 2470 u64 range_start; 2471 u64 range_end; 2472 int key_type = BTRFS_DIR_LOG_ITEM_KEY; 2473 int ret = 0; 2474 struct btrfs_key dir_key; 2475 struct btrfs_key found_key; 2476 struct btrfs_path *log_path; 2477 struct inode *dir; 2478 2479 dir_key.objectid = dirid; 2480 dir_key.type = BTRFS_DIR_ITEM_KEY; 2481 log_path = btrfs_alloc_path(); 2482 if (!log_path) 2483 return -ENOMEM; 2484 2485 dir = read_one_inode(root, dirid); 2486 /* it isn't an error if the inode isn't there, that can happen 2487 * because we replay the deletes before we copy in the inode item 2488 * from the log 2489 */ 2490 if (!dir) { 2491 btrfs_free_path(log_path); 2492 return 0; 2493 } 2494 again: 2495 range_start = 0; 2496 range_end = 0; 2497 while (1) { 2498 if (del_all) 2499 range_end = (u64)-1; 2500 else { 2501 ret = find_dir_range(log, path, dirid, key_type, 2502 &range_start, &range_end); 2503 if (ret != 0) 2504 break; 2505 } 2506 2507 dir_key.offset = range_start; 2508 while (1) { 2509 int nritems; 2510 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2511 0, 0); 2512 if (ret < 0) 2513 goto out; 2514 2515 nritems = btrfs_header_nritems(path->nodes[0]); 2516 if (path->slots[0] >= nritems) { 2517 ret = btrfs_next_leaf(root, path); 2518 if (ret == 1) 2519 break; 2520 else if (ret < 0) 2521 goto out; 2522 } 2523 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2524 path->slots[0]); 2525 if (found_key.objectid != dirid || 2526 found_key.type != dir_key.type) 2527 goto next_type; 2528 2529 if (found_key.offset > range_end) 2530 break; 2531 2532 ret = check_item_in_log(trans, root, log, path, 2533 log_path, dir, 2534 &found_key); 2535 if (ret) 2536 goto out; 2537 if (found_key.offset == (u64)-1) 2538 break; 2539 dir_key.offset = found_key.offset + 1; 2540 } 2541 btrfs_release_path(path); 2542 if (range_end == (u64)-1) 2543 break; 2544 range_start = range_end + 1; 2545 } 2546 2547 next_type: 2548 ret = 0; 2549 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 2550 key_type = BTRFS_DIR_LOG_INDEX_KEY; 2551 dir_key.type = BTRFS_DIR_INDEX_KEY; 2552 btrfs_release_path(path); 2553 goto again; 2554 } 2555 out: 2556 btrfs_release_path(path); 2557 btrfs_free_path(log_path); 2558 iput(dir); 2559 return ret; 2560 } 2561 2562 /* 2563 * the process_func used to replay items from the log tree. This 2564 * gets called in two different stages. The first stage just looks 2565 * for inodes and makes sure they are all copied into the subvolume. 2566 * 2567 * The second stage copies all the other item types from the log into 2568 * the subvolume. The two stage approach is slower, but gets rid of 2569 * lots of complexity around inodes referencing other inodes that exist 2570 * only in the log (references come from either directory items or inode 2571 * back refs). 
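 *
 * The stages map to the LOG_WALK_REPLAY_* values kept in wc->stage;
 * replay_one_buffer() below checks wc->stage to decide which key types
 * to process on the current pass.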
2572 */ 2573 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2574 struct walk_control *wc, u64 gen, int level) 2575 { 2576 int nritems; 2577 struct btrfs_path *path; 2578 struct btrfs_root *root = wc->replay_dest; 2579 struct btrfs_key key; 2580 int i; 2581 int ret; 2582 2583 ret = btrfs_read_buffer(eb, gen, level, NULL); 2584 if (ret) 2585 return ret; 2586 2587 level = btrfs_header_level(eb); 2588 2589 if (level != 0) 2590 return 0; 2591 2592 path = btrfs_alloc_path(); 2593 if (!path) 2594 return -ENOMEM; 2595 2596 nritems = btrfs_header_nritems(eb); 2597 for (i = 0; i < nritems; i++) { 2598 btrfs_item_key_to_cpu(eb, &key, i); 2599 2600 /* inode keys are done during the first stage */ 2601 if (key.type == BTRFS_INODE_ITEM_KEY && 2602 wc->stage == LOG_WALK_REPLAY_INODES) { 2603 struct btrfs_inode_item *inode_item; 2604 u32 mode; 2605 2606 inode_item = btrfs_item_ptr(eb, i, 2607 struct btrfs_inode_item); 2608 /* 2609 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2610 * and never got linked before the fsync, skip it, as 2611 * replaying it is pointless since it would be deleted 2612 * later. We skip logging tmpfiles, but it's always 2613 * possible we are replaying a log created with a kernel 2614 * that used to log tmpfiles. 2615 */ 2616 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2617 wc->ignore_cur_inode = true; 2618 continue; 2619 } else { 2620 wc->ignore_cur_inode = false; 2621 } 2622 ret = replay_xattr_deletes(wc->trans, root, log, 2623 path, key.objectid); 2624 if (ret) 2625 break; 2626 mode = btrfs_inode_mode(eb, inode_item); 2627 if (S_ISDIR(mode)) { 2628 ret = replay_dir_deletes(wc->trans, 2629 root, log, path, key.objectid, 0); 2630 if (ret) 2631 break; 2632 } 2633 ret = overwrite_item(wc->trans, root, path, 2634 eb, i, &key); 2635 if (ret) 2636 break; 2637 2638 /* 2639 * Before replaying extents, truncate the inode to its 2640 * size. We need to do it now and not after log replay 2641 * because before an fsync we can have prealloc extents 2642 * added beyond the inode's i_size. If we did it after, 2643 * through orphan cleanup for example, we would drop 2644 * those prealloc extents just after replaying them. 2645 */ 2646 if (S_ISREG(mode)) { 2647 struct btrfs_drop_extents_args drop_args = { 0 }; 2648 struct inode *inode; 2649 u64 from; 2650 2651 inode = read_one_inode(root, key.objectid); 2652 if (!inode) { 2653 ret = -EIO; 2654 break; 2655 } 2656 from = ALIGN(i_size_read(inode), 2657 root->fs_info->sectorsize); 2658 drop_args.start = from; 2659 drop_args.end = (u64)-1; 2660 drop_args.drop_cache = true; 2661 ret = btrfs_drop_extents(wc->trans, root, 2662 BTRFS_I(inode), 2663 &drop_args); 2664 if (!ret) { 2665 inode_sub_bytes(inode, 2666 drop_args.bytes_found); 2667 /* Update the inode's nbytes. 
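 * now that the bytes dropped by btrfs_drop_extents() have been
 * subtracted via inode_sub_bytes() above.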
*/ 2668 ret = btrfs_update_inode(wc->trans, 2669 root, BTRFS_I(inode)); 2670 } 2671 iput(inode); 2672 if (ret) 2673 break; 2674 } 2675 2676 ret = link_to_fixup_dir(wc->trans, root, 2677 path, key.objectid); 2678 if (ret) 2679 break; 2680 } 2681 2682 if (wc->ignore_cur_inode) 2683 continue; 2684 2685 if (key.type == BTRFS_DIR_INDEX_KEY && 2686 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2687 ret = replay_one_dir_item(wc->trans, root, path, 2688 eb, i, &key); 2689 if (ret) 2690 break; 2691 } 2692 2693 if (wc->stage < LOG_WALK_REPLAY_ALL) 2694 continue; 2695 2696 /* these keys are simply copied */ 2697 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2698 ret = overwrite_item(wc->trans, root, path, 2699 eb, i, &key); 2700 if (ret) 2701 break; 2702 } else if (key.type == BTRFS_INODE_REF_KEY || 2703 key.type == BTRFS_INODE_EXTREF_KEY) { 2704 ret = add_inode_ref(wc->trans, root, log, path, 2705 eb, i, &key); 2706 if (ret && ret != -ENOENT) 2707 break; 2708 ret = 0; 2709 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2710 ret = replay_one_extent(wc->trans, root, path, 2711 eb, i, &key); 2712 if (ret) 2713 break; 2714 } else if (key.type == BTRFS_DIR_ITEM_KEY) { 2715 ret = replay_one_dir_item(wc->trans, root, path, 2716 eb, i, &key); 2717 if (ret) 2718 break; 2719 } 2720 } 2721 btrfs_free_path(path); 2722 return ret; 2723 } 2724 2725 /* 2726 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2727 */ 2728 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2729 { 2730 struct btrfs_block_group *cache; 2731 2732 cache = btrfs_lookup_block_group(fs_info, start); 2733 if (!cache) { 2734 btrfs_err(fs_info, "unable to find block group for %llu", start); 2735 return; 2736 } 2737 2738 spin_lock(&cache->space_info->lock); 2739 spin_lock(&cache->lock); 2740 cache->reserved -= fs_info->nodesize; 2741 cache->space_info->bytes_reserved -= fs_info->nodesize; 2742 spin_unlock(&cache->lock); 2743 spin_unlock(&cache->space_info->lock); 2744 2745 btrfs_put_block_group(cache); 2746 } 2747 2748 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2749 struct btrfs_root *root, 2750 struct btrfs_path *path, int *level, 2751 struct walk_control *wc) 2752 { 2753 struct btrfs_fs_info *fs_info = root->fs_info; 2754 u64 bytenr; 2755 u64 ptr_gen; 2756 struct extent_buffer *next; 2757 struct extent_buffer *cur; 2758 u32 blocksize; 2759 int ret = 0; 2760 2761 while (*level > 0) { 2762 struct btrfs_key first_key; 2763 2764 cur = path->nodes[*level]; 2765 2766 WARN_ON(btrfs_header_level(cur) != *level); 2767 2768 if (path->slots[*level] >= 2769 btrfs_header_nritems(cur)) 2770 break; 2771 2772 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2773 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2774 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2775 blocksize = fs_info->nodesize; 2776 2777 next = btrfs_find_create_tree_block(fs_info, bytenr, 2778 btrfs_header_owner(cur), 2779 *level - 1); 2780 if (IS_ERR(next)) 2781 return PTR_ERR(next); 2782 2783 if (*level == 1) { 2784 ret = wc->process_func(root, next, wc, ptr_gen, 2785 *level - 1); 2786 if (ret) { 2787 free_extent_buffer(next); 2788 return ret; 2789 } 2790 2791 path->slots[*level]++; 2792 if (wc->free) { 2793 ret = btrfs_read_buffer(next, ptr_gen, 2794 *level - 1, &first_key); 2795 if (ret) { 2796 free_extent_buffer(next); 2797 return ret; 2798 } 2799 2800 if (trans) { 2801 btrfs_tree_lock(next); 2802 btrfs_clean_tree_block(next); 2803 btrfs_wait_tree_block_writeback(next); 2804 
btrfs_tree_unlock(next); 2805 ret = btrfs_pin_reserved_extent(trans, 2806 bytenr, blocksize); 2807 if (ret) { 2808 free_extent_buffer(next); 2809 return ret; 2810 } 2811 btrfs_redirty_list_add( 2812 trans->transaction, next); 2813 } else { 2814 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2815 clear_extent_buffer_dirty(next); 2816 unaccount_log_buffer(fs_info, bytenr); 2817 } 2818 } 2819 free_extent_buffer(next); 2820 continue; 2821 } 2822 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2823 if (ret) { 2824 free_extent_buffer(next); 2825 return ret; 2826 } 2827 2828 if (path->nodes[*level-1]) 2829 free_extent_buffer(path->nodes[*level-1]); 2830 path->nodes[*level-1] = next; 2831 *level = btrfs_header_level(next); 2832 path->slots[*level] = 0; 2833 cond_resched(); 2834 } 2835 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2836 2837 cond_resched(); 2838 return 0; 2839 } 2840 2841 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2842 struct btrfs_root *root, 2843 struct btrfs_path *path, int *level, 2844 struct walk_control *wc) 2845 { 2846 struct btrfs_fs_info *fs_info = root->fs_info; 2847 int i; 2848 int slot; 2849 int ret; 2850 2851 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2852 slot = path->slots[i]; 2853 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2854 path->slots[i]++; 2855 *level = i; 2856 WARN_ON(*level == 0); 2857 return 0; 2858 } else { 2859 ret = wc->process_func(root, path->nodes[*level], wc, 2860 btrfs_header_generation(path->nodes[*level]), 2861 *level); 2862 if (ret) 2863 return ret; 2864 2865 if (wc->free) { 2866 struct extent_buffer *next; 2867 2868 next = path->nodes[*level]; 2869 2870 if (trans) { 2871 btrfs_tree_lock(next); 2872 btrfs_clean_tree_block(next); 2873 btrfs_wait_tree_block_writeback(next); 2874 btrfs_tree_unlock(next); 2875 ret = btrfs_pin_reserved_extent(trans, 2876 path->nodes[*level]->start, 2877 path->nodes[*level]->len); 2878 if (ret) 2879 return ret; 2880 } else { 2881 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2882 clear_extent_buffer_dirty(next); 2883 2884 unaccount_log_buffer(fs_info, 2885 path->nodes[*level]->start); 2886 } 2887 } 2888 free_extent_buffer(path->nodes[*level]); 2889 path->nodes[*level] = NULL; 2890 *level = i + 1; 2891 } 2892 } 2893 return 1; 2894 } 2895 2896 /* 2897 * drop the reference count on the tree rooted at 'snap'. This traverses 2898 * the tree freeing any blocks that have a ref count of zero after being 2899 * decremented. 2900 */ 2901 static int walk_log_tree(struct btrfs_trans_handle *trans, 2902 struct btrfs_root *log, struct walk_control *wc) 2903 { 2904 struct btrfs_fs_info *fs_info = log->fs_info; 2905 int ret = 0; 2906 int wret; 2907 int level; 2908 struct btrfs_path *path; 2909 int orig_level; 2910 2911 path = btrfs_alloc_path(); 2912 if (!path) 2913 return -ENOMEM; 2914 2915 level = btrfs_header_level(log->node); 2916 orig_level = level; 2917 path->nodes[level] = log->node; 2918 atomic_inc(&log->node->refs); 2919 path->slots[level] = 0; 2920 2921 while (1) { 2922 wret = walk_down_log_tree(trans, log, path, &level, wc); 2923 if (wret > 0) 2924 break; 2925 if (wret < 0) { 2926 ret = wret; 2927 goto out; 2928 } 2929 2930 wret = walk_up_log_tree(trans, log, path, &level, wc); 2931 if (wret > 0) 2932 break; 2933 if (wret < 0) { 2934 ret = wret; 2935 goto out; 2936 } 2937 } 2938 2939 /* was the root node processed? 
If not, catch it here */ 2940 if (path->nodes[orig_level]) { 2941 ret = wc->process_func(log, path->nodes[orig_level], wc, 2942 btrfs_header_generation(path->nodes[orig_level]), 2943 orig_level); 2944 if (ret) 2945 goto out; 2946 if (wc->free) { 2947 struct extent_buffer *next; 2948 2949 next = path->nodes[orig_level]; 2950 2951 if (trans) { 2952 btrfs_tree_lock(next); 2953 btrfs_clean_tree_block(next); 2954 btrfs_wait_tree_block_writeback(next); 2955 btrfs_tree_unlock(next); 2956 ret = btrfs_pin_reserved_extent(trans, 2957 next->start, next->len); 2958 if (ret) 2959 goto out; 2960 } else { 2961 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2962 clear_extent_buffer_dirty(next); 2963 unaccount_log_buffer(fs_info, next->start); 2964 } 2965 } 2966 } 2967 2968 out: 2969 btrfs_free_path(path); 2970 return ret; 2971 } 2972 2973 /* 2974 * helper function to update the item for a given subvolume's log root 2975 * in the tree of log roots 2976 */ 2977 static int update_log_root(struct btrfs_trans_handle *trans, 2978 struct btrfs_root *log, 2979 struct btrfs_root_item *root_item) 2980 { 2981 struct btrfs_fs_info *fs_info = log->fs_info; 2982 int ret; 2983 2984 if (log->log_transid == 1) { 2985 /* insert root item on the first sync */ 2986 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2987 &log->root_key, root_item); 2988 } else { 2989 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2990 &log->root_key, root_item); 2991 } 2992 return ret; 2993 } 2994 2995 static void wait_log_commit(struct btrfs_root *root, int transid) 2996 { 2997 DEFINE_WAIT(wait); 2998 int index = transid % 2; 2999 3000 /* 3001 * we only allow two pending log transactions at a time, 3002 * so we know that if ours is more than 2 older than the 3003 * current transaction, we're done 3004 */ 3005 for (;;) { 3006 prepare_to_wait(&root->log_commit_wait[index], 3007 &wait, TASK_UNINTERRUPTIBLE); 3008 3009 if (!(root->log_transid_committed < transid && 3010 atomic_read(&root->log_commit[index]))) 3011 break; 3012 3013 mutex_unlock(&root->log_mutex); 3014 schedule(); 3015 mutex_lock(&root->log_mutex); 3016 } 3017 finish_wait(&root->log_commit_wait[index], &wait); 3018 } 3019 3020 static void wait_for_writer(struct btrfs_root *root) 3021 { 3022 DEFINE_WAIT(wait); 3023 3024 for (;;) { 3025 prepare_to_wait(&root->log_writer_wait, &wait, 3026 TASK_UNINTERRUPTIBLE); 3027 if (!atomic_read(&root->log_writers)) 3028 break; 3029 3030 mutex_unlock(&root->log_mutex); 3031 schedule(); 3032 mutex_lock(&root->log_mutex); 3033 } 3034 finish_wait(&root->log_writer_wait, &wait); 3035 } 3036 3037 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 3038 struct btrfs_log_ctx *ctx) 3039 { 3040 if (!ctx) 3041 return; 3042 3043 mutex_lock(&root->log_mutex); 3044 list_del_init(&ctx->list); 3045 mutex_unlock(&root->log_mutex); 3046 } 3047 3048 /* 3049 * Invoked in log mutex context, or when the caller is sure no other task 3050 * can access the list. 3051 */ 3052 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 3053 int index, int error) 3054 { 3055 struct btrfs_log_ctx *ctx; 3056 struct btrfs_log_ctx *safe; 3057 3058 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 3059 list_del_init(&ctx->list); 3060 ctx->log_ret = error; 3061 } 3062 } 3063 3064 /* 3065 * btrfs_sync_log sends a given tree log down to the disk and 3066 * updates the super blocks to record it.
When this call is done, 3067 * you know that any inodes previously logged are safely on disk only 3068 * if it returns 0. 3069 * 3070 * Any other return value means you need to call btrfs_commit_transaction. 3071 * Some of the edge cases for fsyncing directories that have had unlinks 3072 * or renames done in the past mean that sometimes the only safe 3073 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3074 * that has happened. 3075 */ 3076 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3077 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3078 { 3079 int index1; 3080 int index2; 3081 int mark; 3082 int ret; 3083 struct btrfs_fs_info *fs_info = root->fs_info; 3084 struct btrfs_root *log = root->log_root; 3085 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3086 struct btrfs_root_item new_root_item; 3087 int log_transid = 0; 3088 struct btrfs_log_ctx root_log_ctx; 3089 struct blk_plug plug; 3090 u64 log_root_start; 3091 u64 log_root_level; 3092 3093 mutex_lock(&root->log_mutex); 3094 log_transid = ctx->log_transid; 3095 if (root->log_transid_committed >= log_transid) { 3096 mutex_unlock(&root->log_mutex); 3097 return ctx->log_ret; 3098 } 3099 3100 index1 = log_transid % 2; 3101 if (atomic_read(&root->log_commit[index1])) { 3102 wait_log_commit(root, log_transid); 3103 mutex_unlock(&root->log_mutex); 3104 return ctx->log_ret; 3105 } 3106 ASSERT(log_transid == root->log_transid); 3107 atomic_set(&root->log_commit[index1], 1); 3108 3109 /* wait for previous tree log sync to complete */ 3110 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3111 wait_log_commit(root, log_transid - 1); 3112 3113 while (1) { 3114 int batch = atomic_read(&root->log_batch); 3115 /* when we're on an ssd, just kick the log commit out */ 3116 if (!btrfs_test_opt(fs_info, SSD) && 3117 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3118 mutex_unlock(&root->log_mutex); 3119 schedule_timeout_uninterruptible(1); 3120 mutex_lock(&root->log_mutex); 3121 } 3122 wait_for_writer(root); 3123 if (batch == atomic_read(&root->log_batch)) 3124 break; 3125 } 3126 3127 /* bail out if we need to do a full commit */ 3128 if (btrfs_need_log_full_commit(trans)) { 3129 ret = -EAGAIN; 3130 mutex_unlock(&root->log_mutex); 3131 goto out; 3132 } 3133 3134 if (log_transid % 2 == 0) 3135 mark = EXTENT_DIRTY; 3136 else 3137 mark = EXTENT_NEW; 3138 3139 /* we start IO on all the marked extents here, but we don't actually 3140 * wait for them until later. 3141 */ 3142 blk_start_plug(&plug); 3143 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3144 /* 3145 * -EAGAIN happens when someone, e.g., a concurrent transaction 3146 * commit, writes a dirty extent in this tree-log commit. This 3147 * concurrent write will create a hole in the range being written 3148 * out, and we cannot proceed on a zoned filesystem, which requires 3149 * sequential writing. While we could bail out to a full commit 3150 * here, we continue instead, hoping the concurrent writing fills 3151 * the hole. 3152 */ 3153 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 3154 ret = 0; 3155 if (ret) { 3156 blk_finish_plug(&plug); 3157 btrfs_abort_transaction(trans, ret); 3158 btrfs_set_log_full_commit(trans); 3159 mutex_unlock(&root->log_mutex); 3160 goto out; 3161 } 3162 3163 /* 3164 * We _must_ update under the root->log_mutex in order to make sure we 3165 * have a consistent view of the log root we are trying to commit at 3166 * this moment.
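 * (btrfs_set_root_node() below stores the current bytenr, generation
 * and level of log->node in the root item, which is why it must happen
 * before new log writers are allowed back in.)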
3167 * 3168 * We _must_ copy this into a local copy, because we are not holding the 3169 * log_root_tree->log_mutex yet. This is important because when we 3170 * commit the log_root_tree we must have a consistent view of the 3171 * log_root_tree when we update the super block to point at the 3172 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3173 * with the commit and possibly point at the new block which we may not 3174 * have written out. 3175 */ 3176 btrfs_set_root_node(&log->root_item, log->node); 3177 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3178 3179 root->log_transid++; 3180 log->log_transid = root->log_transid; 3181 root->log_start_pid = 0; 3182 /* 3183 * IO has been started, blocks of the log tree have the WRITTEN flag set 3184 * in their headers. New modifications of the log will be written to 3185 * new positions, so it's safe to allow log writers to go in. 3186 */ 3187 mutex_unlock(&root->log_mutex); 3188 3189 if (btrfs_is_zoned(fs_info)) { 3190 mutex_lock(&fs_info->tree_root->log_mutex); 3191 if (!log_root_tree->node) { 3192 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3193 if (ret) { 3194 mutex_unlock(&fs_info->tree_root->log_mutex); 3195 goto out; 3196 } 3197 } 3198 mutex_unlock(&fs_info->tree_root->log_mutex); 3199 } 3200 3201 btrfs_init_log_ctx(&root_log_ctx, NULL); 3202 3203 mutex_lock(&log_root_tree->log_mutex); 3204 3205 index2 = log_root_tree->log_transid % 2; 3206 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3207 root_log_ctx.log_transid = log_root_tree->log_transid; 3208 3209 /* 3210 * Now we are safe to update the log_root_tree because we're under the 3211 * log_mutex, and we're a current writer so we're holding the commit 3212 * open until we drop the log_mutex.
3213 */ 3214 ret = update_log_root(trans, log, &new_root_item); 3215 if (ret) { 3216 if (!list_empty(&root_log_ctx.list)) 3217 list_del_init(&root_log_ctx.list); 3218 3219 blk_finish_plug(&plug); 3220 btrfs_set_log_full_commit(trans); 3221 3222 if (ret != -ENOSPC) { 3223 btrfs_abort_transaction(trans, ret); 3224 mutex_unlock(&log_root_tree->log_mutex); 3225 goto out; 3226 } 3227 btrfs_wait_tree_log_extents(log, mark); 3228 mutex_unlock(&log_root_tree->log_mutex); 3229 ret = -EAGAIN; 3230 goto out; 3231 } 3232 3233 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3234 blk_finish_plug(&plug); 3235 list_del_init(&root_log_ctx.list); 3236 mutex_unlock(&log_root_tree->log_mutex); 3237 ret = root_log_ctx.log_ret; 3238 goto out; 3239 } 3240 3241 index2 = root_log_ctx.log_transid % 2; 3242 if (atomic_read(&log_root_tree->log_commit[index2])) { 3243 blk_finish_plug(&plug); 3244 ret = btrfs_wait_tree_log_extents(log, mark); 3245 wait_log_commit(log_root_tree, 3246 root_log_ctx.log_transid); 3247 mutex_unlock(&log_root_tree->log_mutex); 3248 if (!ret) 3249 ret = root_log_ctx.log_ret; 3250 goto out; 3251 } 3252 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3253 atomic_set(&log_root_tree->log_commit[index2], 1); 3254 3255 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3256 wait_log_commit(log_root_tree, 3257 root_log_ctx.log_transid - 1); 3258 } 3259 3260 /* 3261 * now that we've moved on to the tree of log tree roots, 3262 * check the full commit flag again 3263 */ 3264 if (btrfs_need_log_full_commit(trans)) { 3265 blk_finish_plug(&plug); 3266 btrfs_wait_tree_log_extents(log, mark); 3267 mutex_unlock(&log_root_tree->log_mutex); 3268 ret = -EAGAIN; 3269 goto out_wake_log_root; 3270 } 3271 3272 ret = btrfs_write_marked_extents(fs_info, 3273 &log_root_tree->dirty_log_pages, 3274 EXTENT_DIRTY | EXTENT_NEW); 3275 blk_finish_plug(&plug); 3276 /* 3277 * As described above, -EAGAIN indicates a hole in the extents. We 3278 * cannot wait for these write outs since the waiting causes a 3279 * deadlock. Bail out to the full commit instead.
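 * (Unlike the zoned -EAGAIN case for the subvolume log tree above, at
 * the log root tree level we cannot keep going, so only a full
 * transaction commit is safe.)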
3280 */ 3281 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3282 btrfs_set_log_full_commit(trans); 3283 btrfs_wait_tree_log_extents(log, mark); 3284 mutex_unlock(&log_root_tree->log_mutex); 3285 goto out_wake_log_root; 3286 } else if (ret) { 3287 btrfs_set_log_full_commit(trans); 3288 btrfs_abort_transaction(trans, ret); 3289 mutex_unlock(&log_root_tree->log_mutex); 3290 goto out_wake_log_root; 3291 } 3292 ret = btrfs_wait_tree_log_extents(log, mark); 3293 if (!ret) 3294 ret = btrfs_wait_tree_log_extents(log_root_tree, 3295 EXTENT_NEW | EXTENT_DIRTY); 3296 if (ret) { 3297 btrfs_set_log_full_commit(trans); 3298 mutex_unlock(&log_root_tree->log_mutex); 3299 goto out_wake_log_root; 3300 } 3301 3302 log_root_start = log_root_tree->node->start; 3303 log_root_level = btrfs_header_level(log_root_tree->node); 3304 log_root_tree->log_transid++; 3305 mutex_unlock(&log_root_tree->log_mutex); 3306 3307 /* 3308 * Here we are guaranteed that nobody is going to write the superblock 3309 * for the current transaction before us and that we do not write 3310 * our superblock before the previous transaction finishes its commit 3311 * and writes its superblock, because: 3312 * 3313 * 1) We are holding a handle on the current transaction, so nobody 3314 * can commit it until we release the handle; 3315 * 3316 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3317 * if the previous transaction is still committing, and hasn't yet 3318 * written its superblock, we wait for it to do so, because a 3319 * transaction commit acquires the tree_log_mutex when the commit 3320 * begins and releases it only after writing its superblock. 3321 */ 3322 mutex_lock(&fs_info->tree_log_mutex); 3323 3324 /* 3325 * The previous transaction writeout phase could have failed, and thus 3326 * marked the fs in an error state. We must not commit here, as we 3327 * could have updated our generation in the super_for_commit and 3328 * writing the super here would result in transid mismatches. If there 3329 * is an error here just bail. 3330 */ 3331 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { 3332 ret = -EIO; 3333 btrfs_set_log_full_commit(trans); 3334 btrfs_abort_transaction(trans, ret); 3335 mutex_unlock(&fs_info->tree_log_mutex); 3336 goto out_wake_log_root; 3337 } 3338 3339 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3340 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3341 ret = write_all_supers(fs_info, 1); 3342 mutex_unlock(&fs_info->tree_log_mutex); 3343 if (ret) { 3344 btrfs_set_log_full_commit(trans); 3345 btrfs_abort_transaction(trans, ret); 3346 goto out_wake_log_root; 3347 } 3348 3349 /* 3350 * We know there can only be one task here, since we have not yet set 3351 * root->log_commit[index1] to 0 and any task attempting to sync the 3352 * log must wait for the previous log transaction to commit if it's 3353 * still in progress or wait for the current log transaction commit if 3354 * someone else already started it. We use <= and not < because the 3355 * first log transaction has an ID of 0.
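 *
 * root->last_log_commit is what btrfs_inode_in_log() later compares an
 * inode's last_sub_trans against to decide whether a subsequent fsync
 * can skip logging altogether.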
3356 */ 3357 ASSERT(root->last_log_commit <= log_transid); 3358 root->last_log_commit = log_transid; 3359 3360 out_wake_log_root: 3361 mutex_lock(&log_root_tree->log_mutex); 3362 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3363 3364 log_root_tree->log_transid_committed++; 3365 atomic_set(&log_root_tree->log_commit[index2], 0); 3366 mutex_unlock(&log_root_tree->log_mutex); 3367 3368 /* 3369 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3370 * all the updates above are seen by the woken threads. It might not be 3371 * necessary, but proving that seems to be hard. 3372 */ 3373 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3374 out: 3375 mutex_lock(&root->log_mutex); 3376 btrfs_remove_all_log_ctxs(root, index1, ret); 3377 root->log_transid_committed++; 3378 atomic_set(&root->log_commit[index1], 0); 3379 mutex_unlock(&root->log_mutex); 3380 3381 /* 3382 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3383 * all the updates above are seen by the woken threads. It might not be 3384 * necessary, but proving that seems to be hard. 3385 */ 3386 cond_wake_up(&root->log_commit_wait[index1]); 3387 return ret; 3388 } 3389 3390 static void free_log_tree(struct btrfs_trans_handle *trans, 3391 struct btrfs_root *log) 3392 { 3393 int ret; 3394 struct walk_control wc = { 3395 .free = 1, 3396 .process_func = process_one_buffer 3397 }; 3398 3399 if (log->node) { 3400 ret = walk_log_tree(trans, log, &wc); 3401 if (ret) { 3402 if (trans) 3403 btrfs_abort_transaction(trans, ret); 3404 else 3405 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3406 } 3407 } 3408 3409 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3410 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3411 extent_io_tree_release(&log->log_csum_range); 3412 3413 if (trans && log->node) 3414 btrfs_redirty_list_add(trans->transaction, log->node); 3415 btrfs_put_root(log); 3416 } 3417 3418 /* 3419 * free all the extents used by the tree log. This should be called 3420 * at commit time of the full transaction 3421 */ 3422 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3423 { 3424 if (root->log_root) { 3425 free_log_tree(trans, root->log_root); 3426 root->log_root = NULL; 3427 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3428 } 3429 return 0; 3430 } 3431 3432 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3433 struct btrfs_fs_info *fs_info) 3434 { 3435 if (fs_info->log_root_tree) { 3436 free_log_tree(trans, fs_info->log_root_tree); 3437 fs_info->log_root_tree = NULL; 3438 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3439 } 3440 return 0; 3441 } 3442 3443 /* 3444 * Check if an inode was logged in the current transaction. This may often 3445 * return some false positives, because logged_trans is an in memory only field, 3446 * not persisted anywhere. This is meant to be used in contexts where a false 3447 * positive has no functional consequences. 3448 */ 3449 static bool inode_logged(struct btrfs_trans_handle *trans, 3450 struct btrfs_inode *inode) 3451 { 3452 if (inode->logged_trans == trans->transid) 3453 return true; 3454 3455 /* 3456 * The inode's logged_trans is always 0 when we load it (because it is 3457 * not persisted in the inode item or elsewhere). 
So if it is 0 and the 3458 * inode was last modified in the current transaction, then the inode may 3459 * have been logged before in the current transaction, then evicted and 3460 * loaded again in the current transaction - or may have never been logged 3461 * in the current transaction, but since we cannot be sure, we have to 3462 * assume it was, otherwise our callers can leave an inconsistent log. 3463 */ 3464 if (inode->logged_trans == 0 && 3465 inode->last_trans == trans->transid && 3466 !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) 3467 return true; 3468 3469 return false; 3470 } 3471 3472 /* 3473 * If both a file and directory are logged, and unlinks or renames are 3474 * mixed in, we have a few interesting corners: 3475 * 3476 * create file X in dir Y 3477 * link file X to X.link in dir Y 3478 * fsync file X 3479 * unlink file X but leave X.link 3480 * fsync dir Y 3481 * 3482 * After a crash we would expect only X.link to exist. But file X 3483 * didn't get fsync'd again so the log has back refs for X and X.link. 3484 * 3485 * We solve this by removing directory entries and inode backrefs from the 3486 * log when a file that was logged in the current transaction is 3487 * unlinked. Any later fsync will include the updated log entries, and 3488 * we'll be able to reconstruct the proper directory items from backrefs. 3489 * 3490 * This optimization allows us to avoid relogging the entire inode 3491 * or the entire directory. 3492 */ 3493 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3494 struct btrfs_root *root, 3495 const char *name, int name_len, 3496 struct btrfs_inode *dir, u64 index) 3497 { 3498 struct btrfs_root *log; 3499 struct btrfs_dir_item *di; 3500 struct btrfs_path *path; 3501 int ret; 3502 int err = 0; 3503 u64 dir_ino = btrfs_ino(dir); 3504 3505 if (!inode_logged(trans, dir)) 3506 return 0; 3507 3508 ret = join_running_log_trans(root); 3509 if (ret) 3510 return 0; 3511 3512 mutex_lock(&dir->log_mutex); 3513 3514 log = root->log_root; 3515 path = btrfs_alloc_path(); 3516 if (!path) { 3517 err = -ENOMEM; 3518 goto out_unlock; 3519 } 3520 3521 di = btrfs_lookup_dir_item(trans, log, path, dir_ino, 3522 name, name_len, -1); 3523 if (IS_ERR(di)) { 3524 err = PTR_ERR(di); 3525 goto fail; 3526 } 3527 if (di) { 3528 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3529 if (ret) { 3530 err = ret; 3531 goto fail; 3532 } 3533 } 3534 btrfs_release_path(path); 3535 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3536 index, name, name_len, -1); 3537 if (IS_ERR(di)) { 3538 err = PTR_ERR(di); 3539 goto fail; 3540 } 3541 if (di) { 3542 ret = btrfs_delete_one_dir_name(trans, log, path, di); 3543 if (ret) { 3544 err = ret; 3545 goto fail; 3546 } 3547 } 3548 3549 /* 3550 * We do not need to update the size field of the directory's inode item 3551 * because on log replay we update the field to reflect all existing 3552 * entries in the directory (see overwrite_item()).
3553 */ 3554 fail: 3555 btrfs_free_path(path); 3556 out_unlock: 3557 mutex_unlock(&dir->log_mutex); 3558 if (err == -ENOSPC) { 3559 btrfs_set_log_full_commit(trans); 3560 err = 0; 3561 } else if (err < 0) { 3562 btrfs_abort_transaction(trans, err); 3563 } 3564 3565 btrfs_end_log_trans(root); 3566 3567 return err; 3568 } 3569 3570 /* see comments for btrfs_del_dir_entries_in_log */ 3571 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3572 struct btrfs_root *root, 3573 const char *name, int name_len, 3574 struct btrfs_inode *inode, u64 dirid) 3575 { 3576 struct btrfs_root *log; 3577 u64 index; 3578 int ret; 3579 3580 if (!inode_logged(trans, inode)) 3581 return 0; 3582 3583 ret = join_running_log_trans(root); 3584 if (ret) 3585 return 0; 3586 log = root->log_root; 3587 mutex_lock(&inode->log_mutex); 3588 3589 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3590 dirid, &index); 3591 mutex_unlock(&inode->log_mutex); 3592 if (ret == -ENOSPC) { 3593 btrfs_set_log_full_commit(trans); 3594 ret = 0; 3595 } else if (ret < 0 && ret != -ENOENT) 3596 btrfs_abort_transaction(trans, ret); 3597 btrfs_end_log_trans(root); 3598 3599 return ret; 3600 } 3601 3602 /* 3603 * creates a range item in the log for 'dirid'. first_offset and 3604 * last_offset tell us which parts of the key space the log should 3605 * be considered authoritative for. 3606 */ 3607 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3608 struct btrfs_root *log, 3609 struct btrfs_path *path, 3610 int key_type, u64 dirid, 3611 u64 first_offset, u64 last_offset) 3612 { 3613 int ret; 3614 struct btrfs_key key; 3615 struct btrfs_dir_log_item *item; 3616 3617 key.objectid = dirid; 3618 key.offset = first_offset; 3619 if (key_type == BTRFS_DIR_ITEM_KEY) 3620 key.type = BTRFS_DIR_LOG_ITEM_KEY; 3621 else 3622 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3623 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3624 if (ret) 3625 return ret; 3626 3627 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3628 struct btrfs_dir_log_item); 3629 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3630 btrfs_mark_buffer_dirty(path->nodes[0]); 3631 btrfs_release_path(path); 3632 return 0; 3633 } 3634 3635 /* 3636 * log all the items included in the current transaction for a given 3637 * directory. 
This also creates the range items in the log tree required 3638 * to replay anything deleted before the fsync 3639 */ 3640 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3641 struct btrfs_root *root, struct btrfs_inode *inode, 3642 struct btrfs_path *path, 3643 struct btrfs_path *dst_path, int key_type, 3644 struct btrfs_log_ctx *ctx, 3645 u64 min_offset, u64 *last_offset_ret) 3646 { 3647 struct btrfs_key min_key; 3648 struct btrfs_root *log = root->log_root; 3649 struct extent_buffer *src; 3650 int err = 0; 3651 int ret; 3652 int i; 3653 int nritems; 3654 u64 first_offset = min_offset; 3655 u64 last_offset = (u64)-1; 3656 u64 ino = btrfs_ino(inode); 3657 3658 log = root->log_root; 3659 3660 min_key.objectid = ino; 3661 min_key.type = key_type; 3662 min_key.offset = min_offset; 3663 3664 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3665 3666 /* 3667 * we didn't find anything from this transaction, see if there 3668 * is anything at all 3669 */ 3670 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) { 3671 min_key.objectid = ino; 3672 min_key.type = key_type; 3673 min_key.offset = (u64)-1; 3674 btrfs_release_path(path); 3675 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3676 if (ret < 0) { 3677 btrfs_release_path(path); 3678 return ret; 3679 } 3680 ret = btrfs_previous_item(root, path, ino, key_type); 3681 3682 /* if ret == 0 there are items for this type, 3683 * create a range to tell us the last key of this type. 3684 * otherwise, there are no items in this directory after 3685 * *min_offset, and we create a range to indicate that. 3686 */ 3687 if (ret == 0) { 3688 struct btrfs_key tmp; 3689 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3690 path->slots[0]); 3691 if (key_type == tmp.type) 3692 first_offset = max(min_offset, tmp.offset) + 1; 3693 } 3694 goto done; 3695 } 3696 3697 /* go backward to find any previous key */ 3698 ret = btrfs_previous_item(root, path, ino, key_type); 3699 if (ret == 0) { 3700 struct btrfs_key tmp; 3701 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3702 if (key_type == tmp.type) { 3703 first_offset = tmp.offset; 3704 ret = overwrite_item(trans, log, dst_path, 3705 path->nodes[0], path->slots[0], 3706 &tmp); 3707 if (ret) { 3708 err = ret; 3709 goto done; 3710 } 3711 } 3712 } 3713 btrfs_release_path(path); 3714 3715 /* 3716 * Find the first key from this transaction again. See the note for 3717 * log_new_dir_dentries, if we're logging a directory recursively we 3718 * won't be holding its i_mutex, which means we can modify the directory 3719 * while we're logging it. If we remove an entry between our first 3720 * search and this search we'll not find the key again and can just 3721 * bail. 
3722 */ 3723 search: 3724 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3725 if (ret != 0) 3726 goto done; 3727 3728 /* 3729 * we have a block from this transaction, log every item in it 3730 * from our directory 3731 */ 3732 while (1) { 3733 struct btrfs_key tmp; 3734 src = path->nodes[0]; 3735 nritems = btrfs_header_nritems(src); 3736 for (i = path->slots[0]; i < nritems; i++) { 3737 struct btrfs_dir_item *di; 3738 3739 btrfs_item_key_to_cpu(src, &min_key, i); 3740 3741 if (min_key.objectid != ino || min_key.type != key_type) 3742 goto done; 3743 3744 if (need_resched()) { 3745 btrfs_release_path(path); 3746 cond_resched(); 3747 goto search; 3748 } 3749 3750 ret = overwrite_item(trans, log, dst_path, src, i, 3751 &min_key); 3752 if (ret) { 3753 err = ret; 3754 goto done; 3755 } 3756 3757 /* 3758 * We must make sure that when we log a directory entry, 3759 * the corresponding inode, after log replay, has a 3760 * matching link count. For example: 3761 * 3762 * touch foo 3763 * mkdir mydir 3764 * sync 3765 * ln foo mydir/bar 3766 * xfs_io -c "fsync" mydir 3767 * <crash> 3768 * <mount fs and log replay> 3769 * 3770 * Would result in an fsync log that, when replayed, our 3771 * file inode would have a link count of 1, but we get 3772 * two directory entries pointing to the same inode. 3773 * After removing one of the names, it would not be 3774 * possible to remove the other name, which always 3775 * resulted in stale file handle errors, and it would not 3776 * be possible to rmdir the parent directory, since 3777 * its i_size could never decrement to the value 3778 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. 3779 */ 3780 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3781 btrfs_dir_item_key_to_cpu(src, di, &tmp); 3782 if (ctx && 3783 (btrfs_dir_transid(src, di) == trans->transid || 3784 btrfs_dir_type(src, di) == BTRFS_FT_DIR) && 3785 tmp.type != BTRFS_ROOT_ITEM_KEY) 3786 ctx->log_new_dentries = true; 3787 } 3788 path->slots[0] = nritems; 3789 3790 /* 3791 * look ahead to the next item and see if it is also 3792 * from this directory and from this transaction 3793 */ 3794 ret = btrfs_next_leaf(root, path); 3795 if (ret) { 3796 if (ret == 1) 3797 last_offset = (u64)-1; 3798 else 3799 err = ret; 3800 goto done; 3801 } 3802 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3803 if (tmp.objectid != ino || tmp.type != key_type) { 3804 last_offset = (u64)-1; 3805 goto done; 3806 } 3807 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3808 ret = overwrite_item(trans, log, dst_path, 3809 path->nodes[0], path->slots[0], 3810 &tmp); 3811 if (ret) 3812 err = ret; 3813 else 3814 last_offset = tmp.offset; 3815 goto done; 3816 } 3817 } 3818 done: 3819 btrfs_release_path(path); 3820 btrfs_release_path(dst_path); 3821 3822 if (err == 0) { 3823 *last_offset_ret = last_offset; 3824 /* 3825 * insert the log range keys to indicate where the log 3826 * is valid 3827 */ 3828 ret = insert_dir_log_key(trans, log, path, key_type, 3829 ino, first_offset, last_offset); 3830 if (ret) 3831 err = ret; 3832 } 3833 return err; 3834 } 3835 3836 /* 3837 * logging directories is very similar to logging inodes. We find all the items 3838 * from the current transaction and write them to the log. 3839 * 3840 * The recovery code scans the directory in the subvolume, and if it finds a 3841 * key in the range logged that is not present in the log tree, then it means 3842 * that dir entry was unlinked during the transaction.
3843 * 3844 * In order for that scan to work, we must include one key smaller than 3845 * the smallest logged by this transaction and one key larger than the largest 3846 * key logged by this transaction. 3847 */ 3848 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 3849 struct btrfs_root *root, struct btrfs_inode *inode, 3850 struct btrfs_path *path, 3851 struct btrfs_path *dst_path, 3852 struct btrfs_log_ctx *ctx) 3853 { 3854 u64 min_key; 3855 u64 max_key; 3856 int ret; 3857 int key_type = BTRFS_DIR_ITEM_KEY; 3858 3859 again: 3860 min_key = 0; 3861 max_key = 0; 3862 while (1) { 3863 ret = log_dir_items(trans, root, inode, path, dst_path, key_type, 3864 ctx, min_key, &max_key); 3865 if (ret) 3866 return ret; 3867 if (max_key == (u64)-1) 3868 break; 3869 min_key = max_key + 1; 3870 } 3871 3872 if (key_type == BTRFS_DIR_ITEM_KEY) { 3873 key_type = BTRFS_DIR_INDEX_KEY; 3874 goto again; 3875 } 3876 return 0; 3877 } 3878 3879 /* 3880 * a helper function to drop items from the log before we relog an 3881 * inode. max_key_type indicates the highest item type to remove. 3882 * This cannot be run for file data extents because it does not 3883 * free the extents they point to. 3884 */ 3885 static int drop_objectid_items(struct btrfs_trans_handle *trans, 3886 struct btrfs_root *log, 3887 struct btrfs_path *path, 3888 u64 objectid, int max_key_type) 3889 { 3890 int ret; 3891 struct btrfs_key key; 3892 struct btrfs_key found_key; 3893 int start_slot; 3894 3895 key.objectid = objectid; 3896 key.type = max_key_type; 3897 key.offset = (u64)-1; 3898 3899 while (1) { 3900 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 3901 BUG_ON(ret == 0); /* Logic error */ 3902 if (ret < 0) 3903 break; 3904 3905 if (path->slots[0] == 0) 3906 break; 3907 3908 path->slots[0]--; 3909 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3910 path->slots[0]); 3911 3912 if (found_key.objectid != objectid) 3913 break; 3914 3915 found_key.offset = 0; 3916 found_key.type = 0; 3917 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 3918 if (ret < 0) 3919 break; 3920 3921 ret = btrfs_del_items(trans, log, path, start_slot, 3922 path->slots[0] - start_slot + 1); 3923 /* 3924 * If start slot isn't 0 then we don't need to re-search, we've 3925 * found the last guy with the objectid in this tree. 
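 * (Each pass positions the path at the last item with this objectid
 * and then deletes the whole run of matching items in that leaf with a
 * single btrfs_del_items() call.)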
3926 */ 3927 if (ret || start_slot != 0) 3928 break; 3929 btrfs_release_path(path); 3930 } 3931 btrfs_release_path(path); 3932 if (ret > 0) 3933 ret = 0; 3934 return ret; 3935 } 3936 3937 static void fill_inode_item(struct btrfs_trans_handle *trans, 3938 struct extent_buffer *leaf, 3939 struct btrfs_inode_item *item, 3940 struct inode *inode, int log_inode_only, 3941 u64 logged_isize) 3942 { 3943 struct btrfs_map_token token; 3944 u64 flags; 3945 3946 btrfs_init_map_token(&token, leaf); 3947 3948 if (log_inode_only) { 3949 /* set the generation to zero so the recovery code 3950 * can tell the difference between a logging done 3951 * just to say 'this inode exists' and a logging done 3952 * to say 'update this inode with these values' 3953 */ 3954 btrfs_set_token_inode_generation(&token, item, 0); 3955 btrfs_set_token_inode_size(&token, item, logged_isize); 3956 } else { 3957 btrfs_set_token_inode_generation(&token, item, 3958 BTRFS_I(inode)->generation); 3959 btrfs_set_token_inode_size(&token, item, inode->i_size); 3960 } 3961 3962 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 3963 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 3964 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 3965 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 3966 3967 btrfs_set_token_timespec_sec(&token, &item->atime, 3968 inode->i_atime.tv_sec); 3969 btrfs_set_token_timespec_nsec(&token, &item->atime, 3970 inode->i_atime.tv_nsec); 3971 3972 btrfs_set_token_timespec_sec(&token, &item->mtime, 3973 inode->i_mtime.tv_sec); 3974 btrfs_set_token_timespec_nsec(&token, &item->mtime, 3975 inode->i_mtime.tv_nsec); 3976 3977 btrfs_set_token_timespec_sec(&token, &item->ctime, 3978 inode->i_ctime.tv_sec); 3979 btrfs_set_token_timespec_nsec(&token, &item->ctime, 3980 inode->i_ctime.tv_nsec); 3981 3982 /* 3983 * We do not need to set the nbytes field, in fact during a fast fsync 3984 * its value may not even be correct, since a fast fsync does not wait 3985 * for ordered extent completion, which is where we update nbytes; it 3986 * only waits for writeback to complete. During log replay as we find 3987 * file extent items and replay them, we adjust the nbytes field of the 3988 * inode item in the subvolume tree as needed (see overwrite_item()). 3989 */ 3990 3991 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 3992 btrfs_set_token_inode_transid(&token, item, trans->transid); 3993 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 3994 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 3995 BTRFS_I(inode)->ro_flags); 3996 btrfs_set_token_inode_flags(&token, item, flags); 3997 btrfs_set_token_inode_block_group(&token, item, 0); 3998 } 3999 4000 static int log_inode_item(struct btrfs_trans_handle *trans, 4001 struct btrfs_root *log, struct btrfs_path *path, 4002 struct btrfs_inode *inode, bool inode_item_dropped) 4003 { 4004 struct btrfs_inode_item *inode_item; 4005 int ret; 4006 4007 /* 4008 * If we are doing a fast fsync and the inode was logged before in the 4009 * current transaction, then we know the inode was previously logged and 4010 * it exists in the log tree. For performance reasons, in this case use 4011 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4012 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4013 * contention in case there are concurrent fsyncs for other inodes of the 4014 * same subvolume.
Using btrfs_insert_empty_item() when the inode item 4015 * already exists can also result in unnecessarily splitting a leaf. 4016 */ 4017 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4018 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4019 ASSERT(ret <= 0); 4020 if (ret > 0) 4021 ret = -ENOENT; 4022 } else { 4023 /* 4024 * This means it is the first fsync in the current transaction, 4025 * so the inode item is not in the log and we need to insert it. 4026 * We can never get -EEXIST because we are only called for a fast 4027 * fsync and in case an inode eviction happens after the inode was 4028 * logged before in the current transaction, when we load again 4029 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4030 * flags and set ->logged_trans to 0. 4031 */ 4032 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4033 sizeof(*inode_item)); 4034 ASSERT(ret != -EEXIST); 4035 } 4036 if (ret) 4037 return ret; 4038 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4039 struct btrfs_inode_item); 4040 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4041 0, 0); 4042 btrfs_release_path(path); 4043 return 0; 4044 } 4045 4046 static int log_csums(struct btrfs_trans_handle *trans, 4047 struct btrfs_inode *inode, 4048 struct btrfs_root *log_root, 4049 struct btrfs_ordered_sum *sums) 4050 { 4051 const u64 lock_end = sums->bytenr + sums->len - 1; 4052 struct extent_state *cached_state = NULL; 4053 int ret; 4054 4055 /* 4056 * If this inode was not used for reflink operations in the current 4057 * transaction with new extents, then do the fast path, no need to 4058 * worry about logging checksum items with overlapping ranges. 4059 */ 4060 if (inode->last_reflink_trans < trans->transid) 4061 return btrfs_csum_file_blocks(trans, log_root, sums); 4062 4063 /* 4064 * Serialize logging for checksums. This is to avoid racing with the 4065 * same checksum being logged by another task that is logging another 4066 * file which happens to refer to the same extent as well. Such races 4067 * can leave checksum items in the log with overlapping ranges. 4068 */ 4069 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 4070 lock_end, &cached_state); 4071 if (ret) 4072 return ret; 4073 /* 4074 * Due to extent cloning, we might have logged a csum item that covers a 4075 * subrange of a cloned extent, and later we can end up logging a csum 4076 * item for a larger subrange of the same extent or the entire range. 4077 * This would leave csum items in the log tree that cover the same range 4078 * and break the searches for checksums in the log tree, resulting in 4079 * some checksums missing in the fs/subvolume tree. So just delete (or 4080 * trim and adjust) any existing csum items in the log for this range. 
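	 *
	 * For example (byte ranges are hypothetical):
	 *
	 *   1st fsync of some file logs csums for extent range [64K, 96K)
	 *   2nd fsync, after cloning, logs csums for range [64K, 128K)
	 *
	 * Without first deleting the existing item, the log would keep two
	 * csum items that both cover [64K, 96K).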
4081 */ 4082 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4083 if (!ret) 4084 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4085 4086 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 4087 &cached_state); 4088 4089 return ret; 4090 } 4091 4092 static noinline int copy_items(struct btrfs_trans_handle *trans, 4093 struct btrfs_inode *inode, 4094 struct btrfs_path *dst_path, 4095 struct btrfs_path *src_path, 4096 int start_slot, int nr, int inode_only, 4097 u64 logged_isize) 4098 { 4099 struct btrfs_fs_info *fs_info = trans->fs_info; 4100 unsigned long src_offset; 4101 unsigned long dst_offset; 4102 struct btrfs_root *log = inode->root->log_root; 4103 struct btrfs_file_extent_item *extent; 4104 struct btrfs_inode_item *inode_item; 4105 struct extent_buffer *src = src_path->nodes[0]; 4106 int ret; 4107 struct btrfs_key *ins_keys; 4108 u32 *ins_sizes; 4109 char *ins_data; 4110 int i; 4111 struct list_head ordered_sums; 4112 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; 4113 4114 INIT_LIST_HEAD(&ordered_sums); 4115 4116 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4117 nr * sizeof(u32), GFP_NOFS); 4118 if (!ins_data) 4119 return -ENOMEM; 4120 4121 ins_sizes = (u32 *)ins_data; 4122 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4123 4124 for (i = 0; i < nr; i++) { 4125 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); 4126 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); 4127 } 4128 ret = btrfs_insert_empty_items(trans, log, dst_path, 4129 ins_keys, ins_sizes, nr); 4130 if (ret) { 4131 kfree(ins_data); 4132 return ret; 4133 } 4134 4135 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 4136 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 4137 dst_path->slots[0]); 4138 4139 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 4140 4141 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 4142 inode_item = btrfs_item_ptr(dst_path->nodes[0], 4143 dst_path->slots[0], 4144 struct btrfs_inode_item); 4145 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4146 &inode->vfs_inode, 4147 inode_only == LOG_INODE_EXISTS, 4148 logged_isize); 4149 } else { 4150 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4151 src_offset, ins_sizes[i]); 4152 } 4153 4154 /* take a reference on file data extents so that truncates 4155 * or deletes of this inode don't have to relog the inode 4156 * again 4157 */ 4158 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && 4159 !skip_csum) { 4160 int found_type; 4161 extent = btrfs_item_ptr(src, start_slot + i, 4162 struct btrfs_file_extent_item); 4163 4164 if (btrfs_file_extent_generation(src, extent) < trans->transid) 4165 continue; 4166 4167 found_type = btrfs_file_extent_type(src, extent); 4168 if (found_type == BTRFS_FILE_EXTENT_REG) { 4169 u64 ds, dl, cs, cl; 4170 ds = btrfs_file_extent_disk_bytenr(src, 4171 extent); 4172 /* ds == 0 is a hole */ 4173 if (ds == 0) 4174 continue; 4175 4176 dl = btrfs_file_extent_disk_num_bytes(src, 4177 extent); 4178 cs = btrfs_file_extent_offset(src, extent); 4179 cl = btrfs_file_extent_num_bytes(src, 4180 extent); 4181 if (btrfs_file_extent_compression(src, 4182 extent)) { 4183 cs = 0; 4184 cl = dl; 4185 } 4186 4187 ret = btrfs_lookup_csums_range( 4188 fs_info->csum_root, 4189 ds + cs, ds + cs + cl - 1, 4190 &ordered_sums, 0); 4191 if (ret) 4192 break; 4193 } 4194 } 4195 } 4196 4197 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4198 btrfs_release_path(dst_path); 4199 kfree(ins_data); 4200 4201 /* 4202 * we have to do this after the loop 
above to avoid changing the 4203 * log tree while trying to change the log tree. 4204 */ 4205 while (!list_empty(&ordered_sums)) { 4206 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4207 struct btrfs_ordered_sum, 4208 list); 4209 if (!ret) 4210 ret = log_csums(trans, inode, log, sums); 4211 list_del(&sums->list); 4212 kfree(sums); 4213 } 4214 4215 return ret; 4216 } 4217 4218 static int extent_cmp(void *priv, const struct list_head *a, 4219 const struct list_head *b) 4220 { 4221 const struct extent_map *em1, *em2; 4222 4223 em1 = list_entry(a, struct extent_map, list); 4224 em2 = list_entry(b, struct extent_map, list); 4225 4226 if (em1->start < em2->start) 4227 return -1; 4228 else if (em1->start > em2->start) 4229 return 1; 4230 return 0; 4231 } 4232 4233 static int log_extent_csums(struct btrfs_trans_handle *trans, 4234 struct btrfs_inode *inode, 4235 struct btrfs_root *log_root, 4236 const struct extent_map *em, 4237 struct btrfs_log_ctx *ctx) 4238 { 4239 struct btrfs_ordered_extent *ordered; 4240 u64 csum_offset; 4241 u64 csum_len; 4242 u64 mod_start = em->mod_start; 4243 u64 mod_len = em->mod_len; 4244 LIST_HEAD(ordered_sums); 4245 int ret = 0; 4246 4247 if (inode->flags & BTRFS_INODE_NODATASUM || 4248 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4249 em->block_start == EXTENT_MAP_HOLE) 4250 return 0; 4251 4252 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4253 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4254 const u64 mod_end = mod_start + mod_len; 4255 struct btrfs_ordered_sum *sums; 4256 4257 if (mod_len == 0) 4258 break; 4259 4260 if (ordered_end <= mod_start) 4261 continue; 4262 if (mod_end <= ordered->file_offset) 4263 break; 4264 4265 /* 4266 * We are going to copy all the csums on this ordered extent, so 4267 * go ahead and adjust mod_start and mod_len in case this ordered 4268 * extent has already been logged. 4269 */ 4270 if (ordered->file_offset > mod_start) { 4271 if (ordered_end >= mod_end) 4272 mod_len = ordered->file_offset - mod_start; 4273 /* 4274 * If we have this case 4275 * 4276 * |--------- logged extent ---------| 4277 * |----- ordered extent ----| 4278 * 4279 * Just don't mess with mod_start and mod_len, we'll 4280 * just end up logging more csums than we need and it 4281 * will be ok. 4282 */ 4283 } else { 4284 if (ordered_end < mod_end) { 4285 mod_len = mod_end - ordered_end; 4286 mod_start = ordered_end; 4287 } else { 4288 mod_len = 0; 4289 } 4290 } 4291 4292 /* 4293 * To keep us from looping for the above case of an ordered 4294 * extent that falls inside of the logged extent. 4295 */ 4296 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4297 continue; 4298 4299 list_for_each_entry(sums, &ordered->list, list) { 4300 ret = log_csums(trans, inode, log_root, sums); 4301 if (ret) 4302 return ret; 4303 } 4304 } 4305 4306 /* We're done, found all csums in the ordered extents. */ 4307 if (mod_len == 0) 4308 return 0; 4309 4310 /* If we're compressed we have to save the entire range of csums. */ 4311 if (em->compress_type) { 4312 csum_offset = 0; 4313 csum_len = max(em->block_len, em->orig_block_len); 4314 } else { 4315 csum_offset = mod_start - em->start; 4316 csum_len = mod_len; 4317 } 4318 4319 /* block start is already adjusted for the file extent offset. 
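	 * For compressed extents we instead look up csums for the whole
	 * on-disk extent (csum_offset was set to 0 above), since checksums
	 * cover the compressed data.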
*/ 4320 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root, 4321 em->block_start + csum_offset, 4322 em->block_start + csum_offset + 4323 csum_len - 1, &ordered_sums, 0); 4324 if (ret) 4325 return ret; 4326 4327 while (!list_empty(&ordered_sums)) { 4328 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4329 struct btrfs_ordered_sum, 4330 list); 4331 if (!ret) 4332 ret = log_csums(trans, inode, log_root, sums); 4333 list_del(&sums->list); 4334 kfree(sums); 4335 } 4336 4337 return ret; 4338 } 4339 4340 static int log_one_extent(struct btrfs_trans_handle *trans, 4341 struct btrfs_inode *inode, struct btrfs_root *root, 4342 const struct extent_map *em, 4343 struct btrfs_path *path, 4344 struct btrfs_log_ctx *ctx) 4345 { 4346 struct btrfs_drop_extents_args drop_args = { 0 }; 4347 struct btrfs_root *log = root->log_root; 4348 struct btrfs_file_extent_item *fi; 4349 struct extent_buffer *leaf; 4350 struct btrfs_map_token token; 4351 struct btrfs_key key; 4352 u64 extent_offset = em->start - em->orig_start; 4353 u64 block_len; 4354 int ret; 4355 4356 ret = log_extent_csums(trans, inode, log, em, ctx); 4357 if (ret) 4358 return ret; 4359 4360 drop_args.path = path; 4361 drop_args.start = em->start; 4362 drop_args.end = em->start + em->len; 4363 drop_args.replace_extent = true; 4364 drop_args.extent_item_size = sizeof(*fi); 4365 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4366 if (ret) 4367 return ret; 4368 4369 if (!drop_args.extent_inserted) { 4370 key.objectid = btrfs_ino(inode); 4371 key.type = BTRFS_EXTENT_DATA_KEY; 4372 key.offset = em->start; 4373 4374 ret = btrfs_insert_empty_item(trans, log, path, &key, 4375 sizeof(*fi)); 4376 if (ret) 4377 return ret; 4378 } 4379 leaf = path->nodes[0]; 4380 btrfs_init_map_token(&token, leaf); 4381 fi = btrfs_item_ptr(leaf, path->slots[0], 4382 struct btrfs_file_extent_item); 4383 4384 btrfs_set_token_file_extent_generation(&token, fi, trans->transid); 4385 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4386 btrfs_set_token_file_extent_type(&token, fi, 4387 BTRFS_FILE_EXTENT_PREALLOC); 4388 else 4389 btrfs_set_token_file_extent_type(&token, fi, 4390 BTRFS_FILE_EXTENT_REG); 4391 4392 block_len = max(em->block_len, em->orig_block_len); 4393 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4394 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4395 em->block_start); 4396 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4397 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4398 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 4399 em->block_start - 4400 extent_offset); 4401 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); 4402 } else { 4403 btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); 4404 btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); 4405 } 4406 4407 btrfs_set_token_file_extent_offset(&token, fi, extent_offset); 4408 btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); 4409 btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); 4410 btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); 4411 btrfs_set_token_file_extent_encryption(&token, fi, 0); 4412 btrfs_set_token_file_extent_other_encoding(&token, fi, 0); 4413 btrfs_mark_buffer_dirty(leaf); 4414 4415 btrfs_release_path(path); 4416 4417 return ret; 4418 } 4419 4420 /* 4421 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4422 * lose them after doing a fast fsync and replaying the log. 
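 * (Such extents typically come from fallocate() with FALLOC_FL_KEEP_SIZE.)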
We scan the 4423 * subvolume's root instead of iterating the inode's extent map tree because 4424 * otherwise we can log incorrect extent items based on extent map conversion. 4425 * That can happen due to the fact that extent maps are merged when they 4426 * are not in the extent map tree's list of modified extents. 4427 */ 4428 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4429 struct btrfs_inode *inode, 4430 struct btrfs_path *path) 4431 { 4432 struct btrfs_root *root = inode->root; 4433 struct btrfs_key key; 4434 const u64 i_size = i_size_read(&inode->vfs_inode); 4435 const u64 ino = btrfs_ino(inode); 4436 struct btrfs_path *dst_path = NULL; 4437 bool dropped_extents = false; 4438 u64 truncate_offset = i_size; 4439 struct extent_buffer *leaf; 4440 int slot; 4441 int ins_nr = 0; 4442 int start_slot; 4443 int ret; 4444 4445 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4446 return 0; 4447 4448 key.objectid = ino; 4449 key.type = BTRFS_EXTENT_DATA_KEY; 4450 key.offset = i_size; 4451 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4452 if (ret < 0) 4453 goto out; 4454 4455 /* 4456 * We must check if there is a prealloc extent that starts before the 4457 * i_size and crosses the i_size boundary. This is to ensure later we 4458 * truncate down to the end of that extent and not to the i_size, as 4459 * otherwise we end up losing part of the prealloc extent after a log 4460 * replay and with an implicit hole if there is another prealloc extent 4461 * that starts at an offset beyond i_size. 4462 */ 4463 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4464 if (ret < 0) 4465 goto out; 4466 4467 if (ret == 0) { 4468 struct btrfs_file_extent_item *ei; 4469 4470 leaf = path->nodes[0]; 4471 slot = path->slots[0]; 4472 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4473 4474 if (btrfs_file_extent_type(leaf, ei) == 4475 BTRFS_FILE_EXTENT_PREALLOC) { 4476 u64 extent_end; 4477 4478 btrfs_item_key_to_cpu(leaf, &key, slot); 4479 extent_end = key.offset + 4480 btrfs_file_extent_num_bytes(leaf, ei); 4481 4482 if (extent_end > i_size) 4483 truncate_offset = extent_end; 4484 } 4485 } else { 4486 ret = 0; 4487 } 4488 4489 while (true) { 4490 leaf = path->nodes[0]; 4491 slot = path->slots[0]; 4492 4493 if (slot >= btrfs_header_nritems(leaf)) { 4494 if (ins_nr > 0) { 4495 ret = copy_items(trans, inode, dst_path, path, 4496 start_slot, ins_nr, 1, 0); 4497 if (ret < 0) 4498 goto out; 4499 ins_nr = 0; 4500 } 4501 ret = btrfs_next_leaf(root, path); 4502 if (ret < 0) 4503 goto out; 4504 if (ret > 0) { 4505 ret = 0; 4506 break; 4507 } 4508 continue; 4509 } 4510 4511 btrfs_item_key_to_cpu(leaf, &key, slot); 4512 if (key.objectid > ino) 4513 break; 4514 if (WARN_ON_ONCE(key.objectid < ino) || 4515 key.type < BTRFS_EXTENT_DATA_KEY || 4516 key.offset < i_size) { 4517 path->slots[0]++; 4518 continue; 4519 } 4520 if (!dropped_extents) { 4521 /* 4522 * Avoid logging extent items logged in past fsync calls 4523 * and leading to duplicate keys in the log tree. 
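			 * (btrfs_truncate_inode_items() returns -EAGAIN when
			 * it has not finished dropping all the requested
			 * items, hence the retry loop below.)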
			 */
			do {
				ret = btrfs_truncate_inode_items(trans,
							 root->log_root,
							 inode, truncate_offset,
							 BTRFS_EXTENT_DATA_KEY,
							 NULL);
			} while (ret == -EAGAIN);
			if (ret)
				goto out;
			dropped_extents = true;
		}
		if (ins_nr == 0)
			start_slot = slot;
		ins_nr++;
		path->slots[0]++;
		if (!dst_path) {
			dst_path = btrfs_alloc_path();
			if (!dst_path) {
				ret = -ENOMEM;
				goto out;
			}
		}
	}
	if (ins_nr > 0)
		ret = copy_items(trans, inode, dst_path, path,
				 start_slot, ins_nr, 1, 0);
out:
	btrfs_release_path(path);
	btrfs_free_path(dst_path);
	return ret;
}

static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_inode *inode,
				     struct btrfs_path *path,
				     struct btrfs_log_ctx *ctx)
{
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_extent *tmp;
	struct extent_map *em, *n;
	struct list_head extents;
	struct extent_map_tree *tree = &inode->extent_tree;
	int ret = 0;
	int num = 0;

	INIT_LIST_HEAD(&extents);

	write_lock(&tree->lock);

	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
		list_del_init(&em->list);
		/*
		 * Just an arbitrary number; this can be really CPU intensive
		 * once we start getting a lot of extents, and really once we
		 * have a bunch of extents we just want to commit since it will
		 * be faster.
		 */
		if (++num > 32768) {
			list_del_init(&tree->modified_extents);
			ret = -EFBIG;
			goto process;
		}

		if (em->generation < trans->transid)
			continue;

		/* We log prealloc extents beyond eof later. */
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
		    em->start >= i_size_read(&inode->vfs_inode))
			continue;

		/* Need a ref to keep it from getting evicted from cache */
		refcount_inc(&em->refs);
		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
		list_add_tail(&em->list, &extents);
		num++;
	}

	list_sort(NULL, &extents, extent_cmp);
process:
	while (!list_empty(&extents)) {
		em = list_entry(extents.next, struct extent_map, list);

		list_del_init(&em->list);

		/*
		 * If we had an error we just need to delete everybody from our
		 * private list.
		 */
		if (ret) {
			clear_em_logging(tree, em);
			free_extent_map(em);
			continue;
		}

		write_unlock(&tree->lock);

		ret = log_one_extent(trans, inode, root, em, path, ctx);
		write_lock(&tree->lock);
		clear_em_logging(tree, em);
		free_extent_map(em);
	}
	WARN_ON(!list_empty(&extents));
	write_unlock(&tree->lock);

	btrfs_release_path(path);
	if (!ret)
		ret = btrfs_log_prealloc_extents(trans, inode, path);
	if (ret)
		return ret;

	/*
	 * We have logged all extents successfully, now make sure the commit of
	 * the current transaction waits for the ordered extents to complete
	 * before it commits and wipes out the log trees, otherwise we would
	 * lose data if an ordered extent completes after the transaction
	 * commits and a power failure happens after the transaction commit.
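	 *
	 * Roughly, the sequence we are protecting against is:
	 *
	 *   fsync returns, extent logged, ordered extent still not complete
	 *   transaction commits and wipes out the log trees
	 *   ordered extent completes, but the resulting file extent item
	 *   is only persisted by the next transaction commit
	 *   <power fail>
	 *
	 * after which neither the log nor the subvolume tree contains the
	 * extent.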
	 */
	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
		list_del_init(&ordered->log_list);
		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);

		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
			spin_lock_irq(&inode->ordered_tree.lock);
			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
				atomic_inc(&trans->transaction->pending_ordered);
			}
			spin_unlock_irq(&inode->ordered_tree.lock);
		}
		btrfs_put_ordered_extent(ordered);
	}

	return 0;
}

static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
			     struct btrfs_path *path, u64 *size_ret)
{
	struct btrfs_key key;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		*size_ret = 0;
	} else {
		struct btrfs_inode_item *item;

		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_item);
		*size_ret = btrfs_inode_size(path->nodes[0], item);
		/*
		 * If the in-memory inode's i_size is smaller than the inode
		 * size stored in the btree, return the inode's i_size, so
		 * that we get a correct inode size after replaying the log
		 * when before a power failure we had a shrinking truncate
		 * followed by addition of a new name (rename / new hard link).
		 * Otherwise return the inode size from the btree, to avoid
		 * data loss when replaying a log due to previously doing a
		 * write that expands the inode's size and logging a new name
		 * immediately after.
		 */
		if (*size_ret > inode->vfs_inode.i_size)
			*size_ret = inode->vfs_inode.i_size;
	}

	btrfs_release_path(path);
	return 0;
}

/*
 * At the moment we always log all xattrs. This is to figure out at log replay
 * time which xattrs must have their deletion replayed. If an xattr is missing
 * in the log tree and exists in the fs/subvol tree, we delete it. This is
 * because if an xattr is deleted and the inode is fsynced before a power
 * failure happens, we want the xattr to no longer exist after the log is
 * replayed on the next mount (same behaviour as other filesystems with a
 * journal: ext3/4, xfs, f2fs, etc).
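 *
 * Example:
 *
 * setfattr -n user.foo -v bar file
 * sync
 * setfattr -x user.foo file
 * xfs_io -c fsync file
 * <power fail>
 *
 * After the log is replayed on the next mount, user.foo must not exist
 * anymore.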
4710 */ 4711 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4712 struct btrfs_root *root, 4713 struct btrfs_inode *inode, 4714 struct btrfs_path *path, 4715 struct btrfs_path *dst_path) 4716 { 4717 int ret; 4718 struct btrfs_key key; 4719 const u64 ino = btrfs_ino(inode); 4720 int ins_nr = 0; 4721 int start_slot = 0; 4722 bool found_xattrs = false; 4723 4724 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 4725 return 0; 4726 4727 key.objectid = ino; 4728 key.type = BTRFS_XATTR_ITEM_KEY; 4729 key.offset = 0; 4730 4731 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4732 if (ret < 0) 4733 return ret; 4734 4735 while (true) { 4736 int slot = path->slots[0]; 4737 struct extent_buffer *leaf = path->nodes[0]; 4738 int nritems = btrfs_header_nritems(leaf); 4739 4740 if (slot >= nritems) { 4741 if (ins_nr > 0) { 4742 ret = copy_items(trans, inode, dst_path, path, 4743 start_slot, ins_nr, 1, 0); 4744 if (ret < 0) 4745 return ret; 4746 ins_nr = 0; 4747 } 4748 ret = btrfs_next_leaf(root, path); 4749 if (ret < 0) 4750 return ret; 4751 else if (ret > 0) 4752 break; 4753 continue; 4754 } 4755 4756 btrfs_item_key_to_cpu(leaf, &key, slot); 4757 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 4758 break; 4759 4760 if (ins_nr == 0) 4761 start_slot = slot; 4762 ins_nr++; 4763 path->slots[0]++; 4764 found_xattrs = true; 4765 cond_resched(); 4766 } 4767 if (ins_nr > 0) { 4768 ret = copy_items(trans, inode, dst_path, path, 4769 start_slot, ins_nr, 1, 0); 4770 if (ret < 0) 4771 return ret; 4772 } 4773 4774 if (!found_xattrs) 4775 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 4776 4777 return 0; 4778 } 4779 4780 /* 4781 * When using the NO_HOLES feature if we punched a hole that causes the 4782 * deletion of entire leafs or all the extent items of the first leaf (the one 4783 * that contains the inode item and references) we may end up not processing 4784 * any extents, because there are no leafs with a generation matching the 4785 * current transaction that have extent items for our inode. So we need to find 4786 * if any holes exist and then log them. We also need to log holes after any 4787 * truncate operation that changes the inode's size. 4788 */ 4789 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 4790 struct btrfs_root *root, 4791 struct btrfs_inode *inode, 4792 struct btrfs_path *path) 4793 { 4794 struct btrfs_fs_info *fs_info = root->fs_info; 4795 struct btrfs_key key; 4796 const u64 ino = btrfs_ino(inode); 4797 const u64 i_size = i_size_read(&inode->vfs_inode); 4798 u64 prev_extent_end = 0; 4799 int ret; 4800 4801 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 4802 return 0; 4803 4804 key.objectid = ino; 4805 key.type = BTRFS_EXTENT_DATA_KEY; 4806 key.offset = 0; 4807 4808 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4809 if (ret < 0) 4810 return ret; 4811 4812 while (true) { 4813 struct extent_buffer *leaf = path->nodes[0]; 4814 4815 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 4816 ret = btrfs_next_leaf(root, path); 4817 if (ret < 0) 4818 return ret; 4819 if (ret > 0) { 4820 ret = 0; 4821 break; 4822 } 4823 leaf = path->nodes[0]; 4824 } 4825 4826 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4827 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 4828 break; 4829 4830 /* We have a hole, log it. 
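		 * It covers the file range [prev_extent_end, key.offset).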
		 */
		if (prev_extent_end < key.offset) {
			const u64 hole_len = key.offset - prev_extent_end;

			/*
			 * Release the path to avoid deadlocks with other code
			 * paths that search the root while holding locks on
			 * leafs from the log root.
			 */
			btrfs_release_path(path);
			ret = btrfs_insert_file_extent(trans, root->log_root,
						       ino, prev_extent_end, 0,
						       0, hole_len, 0, hole_len,
						       0, 0, 0);
			if (ret < 0)
				return ret;

			/*
			 * Search for the same key again in the root. Since it's
			 * an extent item and we are holding the inode lock, the
			 * key must still exist. If it doesn't, just emit a
			 * warning and return an error to fall back to a
			 * transaction commit.
			 */
			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
			if (ret < 0)
				return ret;
			if (WARN_ON(ret > 0))
				return -ENOENT;
			leaf = path->nodes[0];
		}

		prev_extent_end = btrfs_file_extent_end(path);
		path->slots[0]++;
		cond_resched();
	}

	if (prev_extent_end < i_size) {
		u64 hole_len;

		btrfs_release_path(path);
		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
		ret = btrfs_insert_file_extent(trans, root->log_root,
					       ino, prev_extent_end, 0, 0,
					       hole_len, 0, hole_len,
					       0, 0, 0);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * When we are logging a new inode X, check if it doesn't have a reference that
 * matches the reference from some other inode Y created in a past transaction
 * and that was renamed in the current transaction. If we don't do this, then at
 * log replay time we can lose inode Y (and all its files if it's a directory):
 *
 * mkdir /mnt/x
 * echo "hello world" > /mnt/x/foobar
 * sync
 * mv /mnt/x /mnt/y
 * mkdir /mnt/x                 # or touch /mnt/x
 * xfs_io -c fsync /mnt/x
 * <power fail>
 * mount fs, trigger log replay
 *
 * After the log replay procedure, we would lose the first directory and all its
 * files (file foobar).
 * For the case where inode Y is not a directory we simply end up losing it:
 *
 * echo "123" > /mnt/foo
 * sync
 * mv /mnt/foo /mnt/bar
 * echo "abc" > /mnt/foo
 * xfs_io -c fsync /mnt/foo
 * <power fail>
 *
 * We also need this for cases where a snapshot entry is replaced by some other
 * entry (file or directory) otherwise we end up with an unreplayable log due to
 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
 * if it were a regular entry:
 *
 * mkdir /mnt/x
 * btrfs subvolume snapshot /mnt /mnt/x/snap
 * btrfs subvolume delete /mnt/x/snap
 * rmdir /mnt/x
 * mkdir /mnt/x
 * fsync /mnt/x or fsync some new file inside it
 * <power fail>
 *
 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
 * the same transaction.
4924 */ 4925 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4926 const int slot, 4927 const struct btrfs_key *key, 4928 struct btrfs_inode *inode, 4929 u64 *other_ino, u64 *other_parent) 4930 { 4931 int ret; 4932 struct btrfs_path *search_path; 4933 char *name = NULL; 4934 u32 name_len = 0; 4935 u32 item_size = btrfs_item_size_nr(eb, slot); 4936 u32 cur_offset = 0; 4937 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 4938 4939 search_path = btrfs_alloc_path(); 4940 if (!search_path) 4941 return -ENOMEM; 4942 search_path->search_commit_root = 1; 4943 search_path->skip_locking = 1; 4944 4945 while (cur_offset < item_size) { 4946 u64 parent; 4947 u32 this_name_len; 4948 u32 this_len; 4949 unsigned long name_ptr; 4950 struct btrfs_dir_item *di; 4951 4952 if (key->type == BTRFS_INODE_REF_KEY) { 4953 struct btrfs_inode_ref *iref; 4954 4955 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 4956 parent = key->offset; 4957 this_name_len = btrfs_inode_ref_name_len(eb, iref); 4958 name_ptr = (unsigned long)(iref + 1); 4959 this_len = sizeof(*iref) + this_name_len; 4960 } else { 4961 struct btrfs_inode_extref *extref; 4962 4963 extref = (struct btrfs_inode_extref *)(ptr + 4964 cur_offset); 4965 parent = btrfs_inode_extref_parent(eb, extref); 4966 this_name_len = btrfs_inode_extref_name_len(eb, extref); 4967 name_ptr = (unsigned long)&extref->name; 4968 this_len = sizeof(*extref) + this_name_len; 4969 } 4970 4971 if (this_name_len > name_len) { 4972 char *new_name; 4973 4974 new_name = krealloc(name, this_name_len, GFP_NOFS); 4975 if (!new_name) { 4976 ret = -ENOMEM; 4977 goto out; 4978 } 4979 name_len = this_name_len; 4980 name = new_name; 4981 } 4982 4983 read_extent_buffer(eb, name, name_ptr, this_name_len); 4984 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 4985 parent, name, this_name_len, 0); 4986 if (di && !IS_ERR(di)) { 4987 struct btrfs_key di_key; 4988 4989 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 4990 di, &di_key); 4991 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 4992 if (di_key.objectid != key->objectid) { 4993 ret = 1; 4994 *other_ino = di_key.objectid; 4995 *other_parent = parent; 4996 } else { 4997 ret = 0; 4998 } 4999 } else { 5000 ret = -EAGAIN; 5001 } 5002 goto out; 5003 } else if (IS_ERR(di)) { 5004 ret = PTR_ERR(di); 5005 goto out; 5006 } 5007 btrfs_release_path(search_path); 5008 5009 cur_offset += this_len; 5010 } 5011 ret = 0; 5012 out: 5013 btrfs_free_path(search_path); 5014 kfree(name); 5015 return ret; 5016 } 5017 5018 struct btrfs_ino_list { 5019 u64 ino; 5020 u64 parent; 5021 struct list_head list; 5022 }; 5023 5024 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5025 struct btrfs_root *root, 5026 struct btrfs_path *path, 5027 struct btrfs_log_ctx *ctx, 5028 u64 ino, u64 parent) 5029 { 5030 struct btrfs_ino_list *ino_elem; 5031 LIST_HEAD(inode_list); 5032 int ret = 0; 5033 5034 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5035 if (!ino_elem) 5036 return -ENOMEM; 5037 ino_elem->ino = ino; 5038 ino_elem->parent = parent; 5039 list_add_tail(&ino_elem->list, &inode_list); 5040 5041 while (!list_empty(&inode_list)) { 5042 struct btrfs_fs_info *fs_info = root->fs_info; 5043 struct btrfs_key key; 5044 struct inode *inode; 5045 5046 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 5047 list); 5048 ino = ino_elem->ino; 5049 parent = ino_elem->parent; 5050 list_del(&ino_elem->list); 5051 kfree(ino_elem); 5052 if (ret) 5053 continue; 5054 5055 btrfs_release_path(path); 5056 5057 inode = 
btrfs_iget(fs_info->sb, ino, root); 5058 /* 5059 * If the other inode that had a conflicting dir entry was 5060 * deleted in the current transaction, we need to log its parent 5061 * directory. 5062 */ 5063 if (IS_ERR(inode)) { 5064 ret = PTR_ERR(inode); 5065 if (ret == -ENOENT) { 5066 inode = btrfs_iget(fs_info->sb, parent, root); 5067 if (IS_ERR(inode)) { 5068 ret = PTR_ERR(inode); 5069 } else { 5070 ret = btrfs_log_inode(trans, root, 5071 BTRFS_I(inode), 5072 LOG_OTHER_INODE_ALL, 5073 ctx); 5074 btrfs_add_delayed_iput(inode); 5075 } 5076 } 5077 continue; 5078 } 5079 /* 5080 * If the inode was already logged skip it - otherwise we can 5081 * hit an infinite loop. Example: 5082 * 5083 * From the commit root (previous transaction) we have the 5084 * following inodes: 5085 * 5086 * inode 257 a directory 5087 * inode 258 with references "zz" and "zz_link" on inode 257 5088 * inode 259 with reference "a" on inode 257 5089 * 5090 * And in the current (uncommitted) transaction we have: 5091 * 5092 * inode 257 a directory, unchanged 5093 * inode 258 with references "a" and "a2" on inode 257 5094 * inode 259 with reference "zz_link" on inode 257 5095 * inode 261 with reference "zz" on inode 257 5096 * 5097 * When logging inode 261 the following infinite loop could 5098 * happen if we don't skip already logged inodes: 5099 * 5100 * - we detect inode 258 as a conflicting inode, with inode 261 5101 * on reference "zz", and log it; 5102 * 5103 * - we detect inode 259 as a conflicting inode, with inode 258 5104 * on reference "a", and log it; 5105 * 5106 * - we detect inode 258 as a conflicting inode, with inode 259 5107 * on reference "zz_link", and log it - again! After this we 5108 * repeat the above steps forever. 5109 */ 5110 spin_lock(&BTRFS_I(inode)->lock); 5111 /* 5112 * Check the inode's logged_trans only instead of 5113 * btrfs_inode_in_log(). This is because the last_log_commit of 5114 * the inode is not updated when we only log that it exists (see 5115 * btrfs_log_inode()). 5116 */ 5117 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5118 spin_unlock(&BTRFS_I(inode)->lock); 5119 btrfs_add_delayed_iput(inode); 5120 continue; 5121 } 5122 spin_unlock(&BTRFS_I(inode)->lock); 5123 /* 5124 * We are safe logging the other inode without acquiring its 5125 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5126 * are safe against concurrent renames of the other inode as 5127 * well because during a rename we pin the log and update the 5128 * log with the new name before we unpin it. 
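		 * (A rename pins the log with btrfs_pin_log_trans() and logs
		 * the new name before unpinning it, so replay never sees the
		 * old name without the new one.)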
5129 */ 5130 ret = btrfs_log_inode(trans, root, BTRFS_I(inode), 5131 LOG_OTHER_INODE, ctx); 5132 if (ret) { 5133 btrfs_add_delayed_iput(inode); 5134 continue; 5135 } 5136 5137 key.objectid = ino; 5138 key.type = BTRFS_INODE_REF_KEY; 5139 key.offset = 0; 5140 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5141 if (ret < 0) { 5142 btrfs_add_delayed_iput(inode); 5143 continue; 5144 } 5145 5146 while (true) { 5147 struct extent_buffer *leaf = path->nodes[0]; 5148 int slot = path->slots[0]; 5149 u64 other_ino = 0; 5150 u64 other_parent = 0; 5151 5152 if (slot >= btrfs_header_nritems(leaf)) { 5153 ret = btrfs_next_leaf(root, path); 5154 if (ret < 0) { 5155 break; 5156 } else if (ret > 0) { 5157 ret = 0; 5158 break; 5159 } 5160 continue; 5161 } 5162 5163 btrfs_item_key_to_cpu(leaf, &key, slot); 5164 if (key.objectid != ino || 5165 (key.type != BTRFS_INODE_REF_KEY && 5166 key.type != BTRFS_INODE_EXTREF_KEY)) { 5167 ret = 0; 5168 break; 5169 } 5170 5171 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5172 BTRFS_I(inode), &other_ino, 5173 &other_parent); 5174 if (ret < 0) 5175 break; 5176 if (ret > 0) { 5177 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5178 if (!ino_elem) { 5179 ret = -ENOMEM; 5180 break; 5181 } 5182 ino_elem->ino = other_ino; 5183 ino_elem->parent = other_parent; 5184 list_add_tail(&ino_elem->list, &inode_list); 5185 ret = 0; 5186 } 5187 path->slots[0]++; 5188 } 5189 btrfs_add_delayed_iput(inode); 5190 } 5191 5192 return ret; 5193 } 5194 5195 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5196 struct btrfs_inode *inode, 5197 struct btrfs_key *min_key, 5198 const struct btrfs_key *max_key, 5199 struct btrfs_path *path, 5200 struct btrfs_path *dst_path, 5201 const u64 logged_isize, 5202 const bool recursive_logging, 5203 const int inode_only, 5204 struct btrfs_log_ctx *ctx, 5205 bool *need_log_inode_item) 5206 { 5207 struct btrfs_root *root = inode->root; 5208 int ins_start_slot = 0; 5209 int ins_nr = 0; 5210 int ret; 5211 5212 while (1) { 5213 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5214 if (ret < 0) 5215 return ret; 5216 if (ret > 0) { 5217 ret = 0; 5218 break; 5219 } 5220 again: 5221 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5222 if (min_key->objectid != max_key->objectid) 5223 break; 5224 if (min_key->type > max_key->type) 5225 break; 5226 5227 if (min_key->type == BTRFS_INODE_ITEM_KEY) 5228 *need_log_inode_item = false; 5229 5230 if ((min_key->type == BTRFS_INODE_REF_KEY || 5231 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5232 inode->generation == trans->transid && 5233 !recursive_logging) { 5234 u64 other_ino = 0; 5235 u64 other_parent = 0; 5236 5237 ret = btrfs_check_ref_name_override(path->nodes[0], 5238 path->slots[0], min_key, inode, 5239 &other_ino, &other_parent); 5240 if (ret < 0) { 5241 return ret; 5242 } else if (ret > 0 && ctx && 5243 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5244 if (ins_nr > 0) { 5245 ins_nr++; 5246 } else { 5247 ins_nr = 1; 5248 ins_start_slot = path->slots[0]; 5249 } 5250 ret = copy_items(trans, inode, dst_path, path, 5251 ins_start_slot, ins_nr, 5252 inode_only, logged_isize); 5253 if (ret < 0) 5254 return ret; 5255 ins_nr = 0; 5256 5257 ret = log_conflicting_inodes(trans, root, path, 5258 ctx, other_ino, other_parent); 5259 if (ret) 5260 return ret; 5261 btrfs_release_path(path); 5262 goto next_key; 5263 } 5264 } 5265 5266 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ 5267 if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5268 if (ins_nr == 
0) 5269 goto next_slot; 5270 ret = copy_items(trans, inode, dst_path, path, 5271 ins_start_slot, 5272 ins_nr, inode_only, logged_isize); 5273 if (ret < 0) 5274 return ret; 5275 ins_nr = 0; 5276 goto next_slot; 5277 } 5278 5279 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5280 ins_nr++; 5281 goto next_slot; 5282 } else if (!ins_nr) { 5283 ins_start_slot = path->slots[0]; 5284 ins_nr = 1; 5285 goto next_slot; 5286 } 5287 5288 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5289 ins_nr, inode_only, logged_isize); 5290 if (ret < 0) 5291 return ret; 5292 ins_nr = 1; 5293 ins_start_slot = path->slots[0]; 5294 next_slot: 5295 path->slots[0]++; 5296 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5297 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5298 path->slots[0]); 5299 goto again; 5300 } 5301 if (ins_nr) { 5302 ret = copy_items(trans, inode, dst_path, path, 5303 ins_start_slot, ins_nr, inode_only, 5304 logged_isize); 5305 if (ret < 0) 5306 return ret; 5307 ins_nr = 0; 5308 } 5309 btrfs_release_path(path); 5310 next_key: 5311 if (min_key->offset < (u64)-1) { 5312 min_key->offset++; 5313 } else if (min_key->type < max_key->type) { 5314 min_key->type++; 5315 min_key->offset = 0; 5316 } else { 5317 break; 5318 } 5319 } 5320 if (ins_nr) 5321 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5322 ins_nr, inode_only, logged_isize); 5323 5324 return ret; 5325 } 5326 5327 /* log a single inode in the tree log. 5328 * At least one parent directory for this inode must exist in the tree 5329 * or be logged already. 5330 * 5331 * Any items from this inode changed by the current transaction are copied 5332 * to the log tree. An extra reference is taken on any extents in this 5333 * file, allowing us to avoid a whole pile of corner cases around logging 5334 * blocks that have been removed from the tree. 5335 * 5336 * See LOG_INODE_ALL and related defines for a description of what inode_only 5337 * does. 5338 * 5339 * This handles both files and directories. 5340 */ 5341 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5342 struct btrfs_root *root, struct btrfs_inode *inode, 5343 int inode_only, 5344 struct btrfs_log_ctx *ctx) 5345 { 5346 struct btrfs_path *path; 5347 struct btrfs_path *dst_path; 5348 struct btrfs_key min_key; 5349 struct btrfs_key max_key; 5350 struct btrfs_root *log = root->log_root; 5351 int err = 0; 5352 int ret = 0; 5353 bool fast_search = false; 5354 u64 ino = btrfs_ino(inode); 5355 struct extent_map_tree *em_tree = &inode->extent_tree; 5356 u64 logged_isize = 0; 5357 bool need_log_inode_item = true; 5358 bool xattrs_logged = false; 5359 bool recursive_logging = false; 5360 bool inode_item_dropped = true; 5361 5362 path = btrfs_alloc_path(); 5363 if (!path) 5364 return -ENOMEM; 5365 dst_path = btrfs_alloc_path(); 5366 if (!dst_path) { 5367 btrfs_free_path(path); 5368 return -ENOMEM; 5369 } 5370 5371 min_key.objectid = ino; 5372 min_key.type = BTRFS_INODE_ITEM_KEY; 5373 min_key.offset = 0; 5374 5375 max_key.objectid = ino; 5376 5377 5378 /* today the code can only do partial logging of directories */ 5379 if (S_ISDIR(inode->vfs_inode.i_mode) || 5380 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5381 &inode->runtime_flags) && 5382 inode_only >= LOG_INODE_EXISTS)) 5383 max_key.type = BTRFS_XATTR_ITEM_KEY; 5384 else 5385 max_key.type = (u8)-1; 5386 max_key.offset = (u64)-1; 5387 5388 /* 5389 * Only run delayed items if we are a directory. 
We want to make sure
	 * all directory indexes hit the fs/subvolume tree so we can find them
	 * and figure out which index ranges have to be logged.
	 *
	 * Otherwise commit the delayed inode only if the full sync flag is set,
	 * as we want to make sure an up to date version is in the subvolume
	 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
	 * it to the log tree. For a non full sync, we always log the inode item
	 * based on the in-memory struct btrfs_inode which is always up to date.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode))
		ret = btrfs_commit_inode_delayed_items(trans, inode);
	else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
		ret = btrfs_commit_inode_delayed_inode(inode);

	if (ret) {
		btrfs_free_path(path);
		btrfs_free_path(dst_path);
		return ret;
	}

	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
		recursive_logging = true;
		if (inode_only == LOG_OTHER_INODE)
			inode_only = LOG_INODE_EXISTS;
		else
			inode_only = LOG_INODE_ALL;
		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
	} else {
		mutex_lock(&inode->log_mutex);
	}

	/*
	 * This is for cases where logging a directory could result in losing
	 * a file after replaying the log. For example, if we move a file from a
	 * directory A to a directory B, then fsync directory A, we have no way
	 * to know the file was moved from A to B, so logging just A would
	 * result in losing the file after a log replay.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode) &&
	    inode_only == LOG_INODE_ALL &&
	    inode->last_unlink_trans >= trans->transid) {
		btrfs_set_log_full_commit(trans);
		err = 1;
		goto out_unlock;
	}

	/*
	 * a brute force approach to making sure we get the most uptodate
	 * copies of everything.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode)) {
		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
		if (inode_only == LOG_INODE_EXISTS)
			max_key_type = BTRFS_XATTR_ITEM_KEY;
		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
	} else {
		if (inode_only == LOG_INODE_EXISTS) {
			/*
			 * Make sure the new inode item we write to the log has
			 * the same isize as the current one (if it exists).
			 * This is necessary to prevent data loss after log
			 * replay, and also to prevent doing a wrong expanding
			 * truncate - for e.g. create file, write 4K into offset
			 * 0, fsync, write 4K into offset 4096, add hard link,
			 * fsync some other file (to sync log), power fail - if
			 * we use the inode's current i_size, after log replay
			 * we get an 8Kb file, with the last 4Kb extent as a
			 * hole (zeroes), as if an expanding truncate happened,
			 * instead of getting a file of 4Kb only.
5461 */ 5462 err = logged_inode_size(log, inode, path, &logged_isize); 5463 if (err) 5464 goto out_unlock; 5465 } 5466 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5467 &inode->runtime_flags)) { 5468 if (inode_only == LOG_INODE_EXISTS) { 5469 max_key.type = BTRFS_XATTR_ITEM_KEY; 5470 ret = drop_objectid_items(trans, log, path, ino, 5471 max_key.type); 5472 } else { 5473 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5474 &inode->runtime_flags); 5475 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5476 &inode->runtime_flags); 5477 while(1) { 5478 ret = btrfs_truncate_inode_items(trans, 5479 log, inode, 0, 0, NULL); 5480 if (ret != -EAGAIN) 5481 break; 5482 } 5483 } 5484 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5485 &inode->runtime_flags) || 5486 inode_only == LOG_INODE_EXISTS) { 5487 if (inode_only == LOG_INODE_ALL) 5488 fast_search = true; 5489 max_key.type = BTRFS_XATTR_ITEM_KEY; 5490 ret = drop_objectid_items(trans, log, path, ino, 5491 max_key.type); 5492 } else { 5493 if (inode_only == LOG_INODE_ALL) 5494 fast_search = true; 5495 inode_item_dropped = false; 5496 goto log_extents; 5497 } 5498 5499 } 5500 if (ret) { 5501 err = ret; 5502 goto out_unlock; 5503 } 5504 5505 err = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5506 path, dst_path, logged_isize, 5507 recursive_logging, inode_only, ctx, 5508 &need_log_inode_item); 5509 if (err) 5510 goto out_unlock; 5511 5512 btrfs_release_path(path); 5513 btrfs_release_path(dst_path); 5514 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); 5515 if (err) 5516 goto out_unlock; 5517 xattrs_logged = true; 5518 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5519 btrfs_release_path(path); 5520 btrfs_release_path(dst_path); 5521 err = btrfs_log_holes(trans, root, inode, path); 5522 if (err) 5523 goto out_unlock; 5524 } 5525 log_extents: 5526 btrfs_release_path(path); 5527 btrfs_release_path(dst_path); 5528 if (need_log_inode_item) { 5529 err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5530 if (err) 5531 goto out_unlock; 5532 /* 5533 * If we are doing a fast fsync and the inode was logged before 5534 * in this transaction, we don't need to log the xattrs because 5535 * they were logged before. If xattrs were added, changed or 5536 * deleted since the last time we logged the inode, then we have 5537 * already logged them because the inode had the runtime flag 5538 * BTRFS_INODE_COPY_EVERYTHING set. 
5539 */ 5540 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5541 err = btrfs_log_all_xattrs(trans, root, inode, path, 5542 dst_path); 5543 if (err) 5544 goto out_unlock; 5545 btrfs_release_path(path); 5546 } 5547 } 5548 if (fast_search) { 5549 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 5550 ctx); 5551 if (ret) { 5552 err = ret; 5553 goto out_unlock; 5554 } 5555 } else if (inode_only == LOG_INODE_ALL) { 5556 struct extent_map *em, *n; 5557 5558 write_lock(&em_tree->lock); 5559 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5560 list_del_init(&em->list); 5561 write_unlock(&em_tree->lock); 5562 } 5563 5564 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5565 ret = log_directory_changes(trans, root, inode, path, dst_path, 5566 ctx); 5567 if (ret) { 5568 err = ret; 5569 goto out_unlock; 5570 } 5571 } 5572 5573 /* 5574 * If we are logging that an ancestor inode exists as part of logging a 5575 * new name from a link or rename operation, don't mark the inode as 5576 * logged - otherwise if an explicit fsync is made against an ancestor, 5577 * the fsync considers the inode in the log and doesn't sync the log, 5578 * resulting in the ancestor missing after a power failure unless the 5579 * log was synced as part of an fsync against any other unrelated inode. 5580 * So keep it simple for this case and just don't flag the ancestors as 5581 * logged. 5582 */ 5583 if (!ctx || 5584 !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && 5585 &inode->vfs_inode != ctx->inode)) { 5586 spin_lock(&inode->lock); 5587 inode->logged_trans = trans->transid; 5588 /* 5589 * Don't update last_log_commit if we logged that an inode exists. 5590 * We do this for two reasons: 5591 * 5592 * 1) We might have had buffered writes to this inode that were 5593 * flushed and had their ordered extents completed in this 5594 * transaction, but we did not previously log the inode with 5595 * LOG_INODE_ALL. Later the inode was evicted and after that 5596 * it was loaded again and this LOG_INODE_EXISTS log operation 5597 * happened. We must make sure that if an explicit fsync against 5598 * the inode is performed later, it logs the new extents, an 5599 * updated inode item, etc, and syncs the log. The same logic 5600 * applies to direct IO writes instead of buffered writes. 5601 * 5602 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 5603 * is logged with an i_size of 0 or whatever value was logged 5604 * before. If later the i_size of the inode is increased by a 5605 * truncate operation, the log is synced through an fsync of 5606 * some other inode and then finally an explicit fsync against 5607 * this inode is made, we must make sure this fsync logs the 5608 * inode with the new i_size, the hole between old i_size and 5609 * the new i_size, and syncs the log. 5610 */ 5611 if (inode_only != LOG_INODE_EXISTS) 5612 inode->last_log_commit = inode->last_sub_trans; 5613 spin_unlock(&inode->lock); 5614 } 5615 out_unlock: 5616 mutex_unlock(&inode->log_mutex); 5617 5618 btrfs_free_path(path); 5619 btrfs_free_path(dst_path); 5620 return err; 5621 } 5622 5623 /* 5624 * Check if we need to log an inode. This is used in contexts where while 5625 * logging an inode we need to log another inode (either that it exists or in 5626 * full mode). 
This is used instead of btrfs_inode_in_log() because the latter
 * requires the inode to be in the log and have the log transaction committed,
 * while here we do not care if the log transaction was already committed - our
 * caller will commit the log later - and we want to avoid logging an inode
 * multiple times when multiple tasks have joined the same log transaction.
 */
static bool need_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode)
{
	/*
	 * If a directory was not modified, no dentries added or removed, we can
	 * and should avoid logging it.
	 */
	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
		return false;

	/*
	 * If this inode does not have new/updated/deleted xattrs since the last
	 * time it was logged and is flagged as logged in the current transaction,
	 * we can skip logging it. As for new/deleted names, those are updated in
	 * the log by link/unlink/rename operations.
	 * In case the inode was logged and then evicted and reloaded, its
	 * logged_trans will be 0, in which case we have to fully log it since
	 * logged_trans is a transient field, not persisted.
	 */
	if (inode->logged_trans == trans->transid &&
	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
		return false;

	return true;
}

struct btrfs_dir_list {
	u64 ino;
	struct list_head list;
};

/*
 * Log the inodes of the new dentries of a directory. See log_dir_items() for
 * details about why it is needed.
 * This is a recursive operation - if an existing dentry corresponds to a
 * directory, that directory's new entries are logged too (same behaviour as
 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
 * the dentries point to we do not lock their i_mutex, otherwise lockdep
 * complains about the following circular lock dependency / possible deadlock:
 *
 *           CPU0                                        CPU1
 *           ----                                        ----
 * lock(&type->i_mutex_dir_key#3/2);
 *                                            lock(sb_internal#2);
 *                                            lock(&type->i_mutex_dir_key#3/2);
 * lock(&sb->s_type->i_mutex_key#14);
 *
 * Where sb_internal is the lock (a counter that works as a lock) acquired by
 * sb_start_intwrite() in btrfs_start_transaction().
 * Not locking i_mutex of the inodes is still safe because:
 *
 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
 *    that while logging the inode new references (names) are added or removed
 *    from the inode, leaving the logged inode item with a link count that does
 *    not match the number of logged inode reference items. This is fine because
 *    at log replay time we compute the real number of links and correct the
 *    link count in the inode item (see replay_one_buffer() and
 *    link_to_fixup_dir());
 *
 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
 *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
 *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
 *    has a size that doesn't match the sum of the lengths of all the logged
 *    names. This does not result in a problem because if a dir_item key is
 *    logged but its matching dir_index key is not logged, at log replay time we
 *    don't use it to replay the respective name (see replay_one_name()).
On the 5698 * other hand if only the dir_index key ends up being logged, the respective 5699 * name is added to the fs/subvol tree with both the dir_item and dir_index 5700 * keys created (see replay_one_name()). 5701 * The directory's inode item with a wrong i_size is not a problem as well, 5702 * since we don't use it at log replay time to set the i_size in the inode 5703 * item of the fs/subvol tree (see overwrite_item()). 5704 */ 5705 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5706 struct btrfs_root *root, 5707 struct btrfs_inode *start_inode, 5708 struct btrfs_log_ctx *ctx) 5709 { 5710 struct btrfs_fs_info *fs_info = root->fs_info; 5711 struct btrfs_root *log = root->log_root; 5712 struct btrfs_path *path; 5713 LIST_HEAD(dir_list); 5714 struct btrfs_dir_list *dir_elem; 5715 int ret = 0; 5716 5717 path = btrfs_alloc_path(); 5718 if (!path) 5719 return -ENOMEM; 5720 5721 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5722 if (!dir_elem) { 5723 btrfs_free_path(path); 5724 return -ENOMEM; 5725 } 5726 dir_elem->ino = btrfs_ino(start_inode); 5727 list_add_tail(&dir_elem->list, &dir_list); 5728 5729 while (!list_empty(&dir_list)) { 5730 struct extent_buffer *leaf; 5731 struct btrfs_key min_key; 5732 int nritems; 5733 int i; 5734 5735 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 5736 list); 5737 if (ret) 5738 goto next_dir_inode; 5739 5740 min_key.objectid = dir_elem->ino; 5741 min_key.type = BTRFS_DIR_ITEM_KEY; 5742 min_key.offset = 0; 5743 again: 5744 btrfs_release_path(path); 5745 ret = btrfs_search_forward(log, &min_key, path, trans->transid); 5746 if (ret < 0) { 5747 goto next_dir_inode; 5748 } else if (ret > 0) { 5749 ret = 0; 5750 goto next_dir_inode; 5751 } 5752 5753 process_leaf: 5754 leaf = path->nodes[0]; 5755 nritems = btrfs_header_nritems(leaf); 5756 for (i = path->slots[0]; i < nritems; i++) { 5757 struct btrfs_dir_item *di; 5758 struct btrfs_key di_key; 5759 struct inode *di_inode; 5760 struct btrfs_dir_list *new_dir_elem; 5761 int log_mode = LOG_INODE_EXISTS; 5762 int type; 5763 5764 btrfs_item_key_to_cpu(leaf, &min_key, i); 5765 if (min_key.objectid != dir_elem->ino || 5766 min_key.type != BTRFS_DIR_ITEM_KEY) 5767 goto next_dir_inode; 5768 5769 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5770 type = btrfs_dir_type(leaf, di); 5771 if (btrfs_dir_transid(leaf, di) < trans->transid && 5772 type != BTRFS_FT_DIR) 5773 continue; 5774 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5775 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5776 continue; 5777 5778 btrfs_release_path(path); 5779 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5780 if (IS_ERR(di_inode)) { 5781 ret = PTR_ERR(di_inode); 5782 goto next_dir_inode; 5783 } 5784 5785 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 5786 btrfs_add_delayed_iput(di_inode); 5787 break; 5788 } 5789 5790 ctx->log_new_dentries = false; 5791 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 5792 log_mode = LOG_INODE_ALL; 5793 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), 5794 log_mode, ctx); 5795 btrfs_add_delayed_iput(di_inode); 5796 if (ret) 5797 goto next_dir_inode; 5798 if (ctx->log_new_dentries) { 5799 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 5800 GFP_NOFS); 5801 if (!new_dir_elem) { 5802 ret = -ENOMEM; 5803 goto next_dir_inode; 5804 } 5805 new_dir_elem->ino = di_key.objectid; 5806 list_add_tail(&new_dir_elem->list, &dir_list); 5807 } 5808 break; 5809 } 5810 if (i == nritems) { 5811 ret = btrfs_next_leaf(log, path); 5812 if (ret < 0) { 5813 goto next_dir_inode; 5814 
		if (i == nritems) {
			ret = btrfs_next_leaf(log, path);
			if (ret < 0) {
				goto next_dir_inode;
			} else if (ret > 0) {
				ret = 0;
				goto next_dir_inode;
			}
			goto process_leaf;
		}
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
			goto again;
		}
next_dir_inode:
		list_del(&dir_elem->list);
		kfree(dir_elem);
	}

	btrfs_free_path(path);
	return ret;
}
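
/*
 * Log all the current parent directories of the inode, found by walking its
 * INODE_REF and INODE_EXTREF items in the subvolume tree. This prevents a
 * log replay from leaving a parent directory with a dangling index entry for
 * a name that no longer has a matching inode reference (see the concrete
 * failure examples in the comment block inside btrfs_log_inode_parent()
 * below).
 */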
static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
				 struct btrfs_inode *inode,
				 struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = inode->root;
	const u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->skip_locking = 1;
	path->search_commit_root = 1;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		u32 cur_offset = 0;
		u32 item_size;
		unsigned long ptr;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
			break;

		item_size = btrfs_item_size_nr(leaf, slot);
		ptr = btrfs_item_ptr_offset(leaf, slot);
		while (cur_offset < item_size) {
			struct btrfs_key inode_key;
			struct inode *dir_inode;

			inode_key.type = BTRFS_INODE_ITEM_KEY;
			inode_key.offset = 0;

			if (key.type == BTRFS_INODE_EXTREF_KEY) {
				struct btrfs_inode_extref *extref;

				extref = (struct btrfs_inode_extref *)
					(ptr + cur_offset);
				inode_key.objectid = btrfs_inode_extref_parent(
					leaf, extref);
				cur_offset += sizeof(*extref);
				cur_offset += btrfs_inode_extref_name_len(leaf,
					extref);
			} else {
				inode_key.objectid = key.offset;
				cur_offset = item_size;
			}

			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
					       root);
			/*
			 * If the parent inode was deleted, return an error to
			 * fall back to a transaction commit. This prevents an
			 * inode that was moved from one parent A to a parent B,
			 * then had its former parent deleted, and was then
			 * fsync'ed, from existing at both parents after a log
			 * replay (with the old parent still existing).
			 * Example:
			 *
			 * mkdir /mnt/A
			 * mkdir /mnt/B
			 * touch /mnt/B/bar
			 * sync
			 * mv /mnt/B/bar /mnt/A/bar
			 * mv -T /mnt/A /mnt/B
			 * fsync /mnt/B/bar
			 * <power fail>
			 *
			 * If we ignore the old parent B which got deleted,
			 * after a log replay we would have file bar linked
			 * at both parents and the old parent B would still
			 * exist.
			 */
			if (IS_ERR(dir_inode)) {
				ret = PTR_ERR(dir_inode);
				goto out;
			}

			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
				btrfs_add_delayed_iput(dir_inode);
				continue;
			}

			if (ctx)
				ctx->log_new_dentries = false;
			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
					      LOG_INODE_ALL, ctx);
			if (!ret && ctx && ctx->log_new_dentries)
				ret = log_new_dir_dentries(trans, root,
							   BTRFS_I(dir_inode),
							   ctx);
			btrfs_add_delayed_iput(dir_inode);
			if (ret)
				goto out;
		}
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int log_new_ancestors(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct btrfs_log_ctx *ctx)
{
	struct btrfs_key found_key;

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	while (true) {
		struct btrfs_fs_info *fs_info = root->fs_info;
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_key search_key;
		struct inode *inode;
		u64 ino;
		int ret = 0;

		btrfs_release_path(path);

		ino = found_key.offset;

		search_key.objectid = found_key.offset;
		search_key.type = BTRFS_INODE_ITEM_KEY;
		search_key.offset = 0;
		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode))
			return PTR_ERR(inode);

		if (BTRFS_I(inode)->generation >= trans->transid &&
		    need_log_inode(trans, BTRFS_I(inode)))
			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
					      LOG_INODE_EXISTS, ctx);
		btrfs_add_delayed_iput(inode);
		if (ret)
			return ret;

		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
			break;

		search_key.type = BTRFS_INODE_REF_KEY;
		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
		if (ret < 0)
			return ret;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				return -ENOENT;
			leaf = path->nodes[0];
			slot = path->slots[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != search_key.objectid ||
		    found_key.type != BTRFS_INODE_REF_KEY)
			return -ENOENT;
	}
	return 0;
}

static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
				  struct btrfs_inode *inode,
				  struct dentry *parent,
				  struct btrfs_log_ctx *ctx)
{
	struct btrfs_root *root = inode->root;
	struct dentry *old_parent = NULL;
	struct super_block *sb = inode->vfs_inode.i_sb;
	int ret = 0;

	while (true) {
		if (!parent || d_really_is_negative(parent) ||
		    sb != parent->d_sb)
			break;

		inode = BTRFS_I(d_inode(parent));
		if (root != inode->root)
			break;

		if (inode->generation >= trans->transid &&
		    need_log_inode(trans, inode)) {
			ret = btrfs_log_inode(trans, root, inode,
					      LOG_INODE_EXISTS, ctx);
			if (ret)
				break;
		}
		if (IS_ROOT(parent))
			break;

		parent = dget_parent(parent);
		dput(old_parent);
		old_parent = parent;
	}
	dput(old_parent);

	return ret;
}
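
/*
 * Make sure all new ancestors of the inode, for every one of its hard links,
 * end up in the log. With a single link we can simply walk the dentry chain
 * upwards (the fast path above); with multiple hard links we have to walk
 * the inode's ref items in the subvolume tree instead, since one dentry
 * chain only covers one of the parent directories.
 */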
6063 { 6064 struct btrfs_root *root = inode->root; 6065 const u64 ino = btrfs_ino(inode); 6066 struct btrfs_path *path; 6067 struct btrfs_key search_key; 6068 int ret; 6069 6070 /* 6071 * For a single hard link case, go through a fast path that does not 6072 * need to iterate the fs/subvolume tree. 6073 */ 6074 if (inode->vfs_inode.i_nlink < 2) 6075 return log_new_ancestors_fast(trans, inode, parent, ctx); 6076 6077 path = btrfs_alloc_path(); 6078 if (!path) 6079 return -ENOMEM; 6080 6081 search_key.objectid = ino; 6082 search_key.type = BTRFS_INODE_REF_KEY; 6083 search_key.offset = 0; 6084 again: 6085 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6086 if (ret < 0) 6087 goto out; 6088 if (ret == 0) 6089 path->slots[0]++; 6090 6091 while (true) { 6092 struct extent_buffer *leaf = path->nodes[0]; 6093 int slot = path->slots[0]; 6094 struct btrfs_key found_key; 6095 6096 if (slot >= btrfs_header_nritems(leaf)) { 6097 ret = btrfs_next_leaf(root, path); 6098 if (ret < 0) 6099 goto out; 6100 else if (ret > 0) 6101 break; 6102 continue; 6103 } 6104 6105 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6106 if (found_key.objectid != ino || 6107 found_key.type > BTRFS_INODE_EXTREF_KEY) 6108 break; 6109 6110 /* 6111 * Don't deal with extended references because they are rare 6112 * cases and too complex to deal with (we would need to keep 6113 * track of which subitem we are processing for each item in 6114 * this loop, etc). So just return some error to fallback to 6115 * a transaction commit. 6116 */ 6117 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6118 ret = -EMLINK; 6119 goto out; 6120 } 6121 6122 /* 6123 * Logging ancestors needs to do more searches on the fs/subvol 6124 * tree, so it releases the path as needed to avoid deadlocks. 6125 * Keep track of the last inode ref key and resume from that key 6126 * after logging all new ancestors for the current hard link. 6127 */ 6128 memcpy(&search_key, &found_key, sizeof(search_key)); 6129 6130 ret = log_new_ancestors(trans, root, path, ctx); 6131 if (ret) 6132 goto out; 6133 btrfs_release_path(path); 6134 goto again; 6135 } 6136 ret = 0; 6137 out: 6138 btrfs_free_path(path); 6139 return ret; 6140 } 6141 6142 /* 6143 * helper function around btrfs_log_inode to make sure newly created 6144 * parent directories also end up in the log. A minimal inode and backref 6145 * only logging is done of any parent directories that are older than 6146 * the last committed transaction 6147 */ 6148 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6149 struct btrfs_inode *inode, 6150 struct dentry *parent, 6151 int inode_only, 6152 struct btrfs_log_ctx *ctx) 6153 { 6154 struct btrfs_root *root = inode->root; 6155 struct btrfs_fs_info *fs_info = root->fs_info; 6156 int ret = 0; 6157 bool log_dentries = false; 6158 6159 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6160 ret = 1; 6161 goto end_no_trans; 6162 } 6163 6164 if (btrfs_root_refs(&root->root_item) == 0) { 6165 ret = 1; 6166 goto end_no_trans; 6167 } 6168 6169 /* 6170 * Skip already logged inodes or inodes corresponding to tmpfiles 6171 * (since logging them is pointless, a link count of 0 means they 6172 * will never be accessible). 
6173 */ 6174 if ((btrfs_inode_in_log(inode, trans->transid) && 6175 list_empty(&ctx->ordered_extents)) || 6176 inode->vfs_inode.i_nlink == 0) { 6177 ret = BTRFS_NO_LOG_SYNC; 6178 goto end_no_trans; 6179 } 6180 6181 ret = start_log_trans(trans, root, ctx); 6182 if (ret) 6183 goto end_no_trans; 6184 6185 ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); 6186 if (ret) 6187 goto end_trans; 6188 6189 /* 6190 * for regular files, if its inode is already on disk, we don't 6191 * have to worry about the parents at all. This is because 6192 * we can use the last_unlink_trans field to record renames 6193 * and other fun in this file. 6194 */ 6195 if (S_ISREG(inode->vfs_inode.i_mode) && 6196 inode->generation < trans->transid && 6197 inode->last_unlink_trans < trans->transid) { 6198 ret = 0; 6199 goto end_trans; 6200 } 6201 6202 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) 6203 log_dentries = true; 6204 6205 /* 6206 * On unlink we must make sure all our current and old parent directory 6207 * inodes are fully logged. This is to prevent leaving dangling 6208 * directory index entries in directories that were our parents but are 6209 * not anymore. Not doing this results in old parent directory being 6210 * impossible to delete after log replay (rmdir will always fail with 6211 * error -ENOTEMPTY). 6212 * 6213 * Example 1: 6214 * 6215 * mkdir testdir 6216 * touch testdir/foo 6217 * ln testdir/foo testdir/bar 6218 * sync 6219 * unlink testdir/bar 6220 * xfs_io -c fsync testdir/foo 6221 * <power failure> 6222 * mount fs, triggers log replay 6223 * 6224 * If we don't log the parent directory (testdir), after log replay the 6225 * directory still has an entry pointing to the file inode using the bar 6226 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6227 * the file inode has a link count of 1. 6228 * 6229 * Example 2: 6230 * 6231 * mkdir testdir 6232 * touch foo 6233 * ln foo testdir/foo2 6234 * ln foo testdir/foo3 6235 * sync 6236 * unlink testdir/foo3 6237 * xfs_io -c fsync foo 6238 * <power failure> 6239 * mount fs, triggers log replay 6240 * 6241 * Similar as the first example, after log replay the parent directory 6242 * testdir still has an entry pointing to the inode file with name foo3 6243 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6244 * and has a link count of 2. 6245 */ 6246 if (inode->last_unlink_trans >= trans->transid) { 6247 ret = btrfs_log_all_parents(trans, inode, ctx); 6248 if (ret) 6249 goto end_trans; 6250 } 6251 6252 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6253 if (ret) 6254 goto end_trans; 6255 6256 if (log_dentries) 6257 ret = log_new_dir_dentries(trans, root, inode, ctx); 6258 else 6259 ret = 0; 6260 end_trans: 6261 if (ret < 0) { 6262 btrfs_set_log_full_commit(trans); 6263 ret = 1; 6264 } 6265 6266 if (ret) 6267 btrfs_remove_log_ctx(root, ctx); 6268 btrfs_end_log_trans(root); 6269 end_no_trans: 6270 return ret; 6271 } 6272 6273 /* 6274 * it is not safe to log dentry if the chunk root has added new 6275 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6276 * If this returns 1, you must commit the transaction to safely get your 6277 * data on disk. 
6278 */ 6279 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6280 struct dentry *dentry, 6281 struct btrfs_log_ctx *ctx) 6282 { 6283 struct dentry *parent = dget_parent(dentry); 6284 int ret; 6285 6286 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6287 LOG_INODE_ALL, ctx); 6288 dput(parent); 6289 6290 return ret; 6291 } 6292 6293 /* 6294 * should be called during mount to recover any replay any log trees 6295 * from the FS 6296 */ 6297 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6298 { 6299 int ret; 6300 struct btrfs_path *path; 6301 struct btrfs_trans_handle *trans; 6302 struct btrfs_key key; 6303 struct btrfs_key found_key; 6304 struct btrfs_root *log; 6305 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6306 struct walk_control wc = { 6307 .process_func = process_one_buffer, 6308 .stage = LOG_WALK_PIN_ONLY, 6309 }; 6310 6311 path = btrfs_alloc_path(); 6312 if (!path) 6313 return -ENOMEM; 6314 6315 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6316 6317 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6318 if (IS_ERR(trans)) { 6319 ret = PTR_ERR(trans); 6320 goto error; 6321 } 6322 6323 wc.trans = trans; 6324 wc.pin = 1; 6325 6326 ret = walk_log_tree(trans, log_root_tree, &wc); 6327 if (ret) { 6328 btrfs_handle_fs_error(fs_info, ret, 6329 "Failed to pin buffers while recovering log root tree."); 6330 goto error; 6331 } 6332 6333 again: 6334 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6335 key.offset = (u64)-1; 6336 key.type = BTRFS_ROOT_ITEM_KEY; 6337 6338 while (1) { 6339 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6340 6341 if (ret < 0) { 6342 btrfs_handle_fs_error(fs_info, ret, 6343 "Couldn't find tree log root."); 6344 goto error; 6345 } 6346 if (ret > 0) { 6347 if (path->slots[0] == 0) 6348 break; 6349 path->slots[0]--; 6350 } 6351 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6352 path->slots[0]); 6353 btrfs_release_path(path); 6354 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6355 break; 6356 6357 log = btrfs_read_tree_root(log_root_tree, &found_key); 6358 if (IS_ERR(log)) { 6359 ret = PTR_ERR(log); 6360 btrfs_handle_fs_error(fs_info, ret, 6361 "Couldn't read tree log root."); 6362 goto error; 6363 } 6364 6365 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6366 true); 6367 if (IS_ERR(wc.replay_dest)) { 6368 ret = PTR_ERR(wc.replay_dest); 6369 6370 /* 6371 * We didn't find the subvol, likely because it was 6372 * deleted. This is ok, simply skip this log and go to 6373 * the next one. 6374 * 6375 * We need to exclude the root because we can't have 6376 * other log replays overwriting this log as we'll read 6377 * it back in a few more times. This will keep our 6378 * block from being modified, and we'll just bail for 6379 * each subsequent pass. 
			 */
			if (ret == -ENOENT)
				ret = btrfs_pin_extent_for_log_replay(trans,
							log->node->start,
							log->node->len);
			btrfs_put_root(log);

			if (!ret)
				goto next;
			btrfs_handle_fs_error(fs_info, ret,
				"Couldn't read target root for tree log recovery.");
			goto error;
		}

		wc.replay_dest->log_root = log;
		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
		if (ret)
			/* The loop needs to continue due to the root refs */
			btrfs_handle_fs_error(fs_info, ret,
				"failed to record the log root in transaction");
		else
			ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			ret = fixup_inode_link_counts(trans, wc.replay_dest,
						      path);
		}

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
			struct btrfs_root *root = wc.replay_dest;

			btrfs_release_path(path);

			/*
			 * We have just replayed everything, and the highest
			 * objectid of fs roots probably has changed in case
			 * some inode_item's got replayed.
			 *
			 * root->objectid_mutex is not acquired as log replay
			 * could only happen during mount.
			 */
			ret = btrfs_init_root_free_objectid(root);
		}

		wc.replay_dest->log_root = NULL;
		btrfs_put_root(wc.replay_dest);
		btrfs_put_root(log);

		if (ret)
			goto error;
next:
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	btrfs_release_path(path);

	/* step one is to pin it all, step two is to replay just inodes */
	if (wc.pin) {
		wc.pin = 0;
		wc.process_func = replay_one_buffer;
		wc.stage = LOG_WALK_REPLAY_INODES;
		goto again;
	}
	/* step three is to replay everything */
	if (wc.stage < LOG_WALK_REPLAY_ALL) {
		wc.stage++;
		goto again;
	}

	btrfs_free_path(path);

	/* step 4: commit the transaction, which also unpins the blocks */
	ret = btrfs_commit_transaction(trans);
	if (ret)
		return ret;

	log_root_tree->log_root = NULL;
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_put_root(log_root_tree);

	return 0;
error:
	if (wc.trans)
		btrfs_end_transaction(wc.trans);
	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
	btrfs_free_path(path);
	return ret;
}

/*
 * There are some corner cases where we want to force a full commit instead
 * of allowing a directory to be logged.
 *
 * They revolve around files that were unlinked from the directory, and this
 * function updates the parent directory so that a full commit is properly
 * done if it is fsync'd later after the unlinks are done.
 *
 * Must be called before the unlink operations (updates to the subvolume tree,
 * inodes, etc) are done.
 */
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_inode *dir, struct btrfs_inode *inode,
			     int for_rename)
{
	/*
	 * When we're logging a file, if it hasn't been renamed or unlinked,
	 * and its inode is fully committed on disk, we don't have to worry
	 * about walking up the directory chain to log its parents.
	 *
	 * So, we use the last_unlink_trans field to put this transid into the
	 * file. When the file is logged we check it and don't log the parents
	 * if the file is fully on disk.
6494 */ 6495 mutex_lock(&inode->log_mutex); 6496 inode->last_unlink_trans = trans->transid; 6497 mutex_unlock(&inode->log_mutex); 6498 6499 /* 6500 * if this directory was already logged any new 6501 * names for this file/dir will get recorded 6502 */ 6503 if (dir->logged_trans == trans->transid) 6504 return; 6505 6506 /* 6507 * if the inode we're about to unlink was logged, 6508 * the log will be properly updated for any new names 6509 */ 6510 if (inode->logged_trans == trans->transid) 6511 return; 6512 6513 /* 6514 * when renaming files across directories, if the directory 6515 * there we're unlinking from gets fsync'd later on, there's 6516 * no way to find the destination directory later and fsync it 6517 * properly. So, we have to be conservative and force commits 6518 * so the new name gets discovered. 6519 */ 6520 if (for_rename) 6521 goto record; 6522 6523 /* we can safely do the unlink without any special recording */ 6524 return; 6525 6526 record: 6527 mutex_lock(&dir->log_mutex); 6528 dir->last_unlink_trans = trans->transid; 6529 mutex_unlock(&dir->log_mutex); 6530 } 6531 6532 /* 6533 * Make sure that if someone attempts to fsync the parent directory of a deleted 6534 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6535 * that after replaying the log tree of the parent directory's root we will not 6536 * see the snapshot anymore and at log replay time we will not see any log tree 6537 * corresponding to the deleted snapshot's root, which could lead to replaying 6538 * it after replaying the log tree of the parent directory (which would replay 6539 * the snapshot delete operation). 6540 * 6541 * Must be called before the actual snapshot destroy operation (updates to the 6542 * parent root and tree of tree roots trees, etc) are done. 6543 */ 6544 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6545 struct btrfs_inode *dir) 6546 { 6547 mutex_lock(&dir->log_mutex); 6548 dir->last_unlink_trans = trans->transid; 6549 mutex_unlock(&dir->log_mutex); 6550 } 6551 6552 /* 6553 * Call this after adding a new name for a file and it will properly 6554 * update the log to reflect the new name. 6555 */ 6556 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 6557 struct btrfs_inode *inode, struct btrfs_inode *old_dir, 6558 struct dentry *parent) 6559 { 6560 struct btrfs_log_ctx ctx; 6561 6562 /* 6563 * this will force the logging code to walk the dentry chain 6564 * up for the file 6565 */ 6566 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6567 inode->last_unlink_trans = trans->transid; 6568 6569 /* 6570 * if this inode hasn't been logged and directory we're renaming it 6571 * from hasn't been logged, we don't need to log it 6572 */ 6573 if (!inode_logged(trans, inode) && 6574 (!old_dir || !inode_logged(trans, old_dir))) 6575 return; 6576 6577 /* 6578 * If we are doing a rename (old_dir is not NULL) from a directory that 6579 * was previously logged, make sure the next log attempt on the directory 6580 * is not skipped and logs the inode again. This is because the log may 6581 * not currently be authoritative for a range including the old 6582 * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make 6583 * sure after a log replay we do not end up with both the new and old 6584 * dentries around (in case the inode is a directory we would have a 6585 * directory with two hard links and 2 inode references for different 6586 * parents). 
	 * The next log attempt of old_dir will happen at
	 * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
	 * below, because we have previously set inode->last_unlink_trans to
	 * the current transaction ID, either here or at
	 * btrfs_record_unlink_dir() in case the inode is a directory.
	 */
	if (old_dir)
		old_dir->logged_trans = 0;

	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
	ctx.logging_new_name = true;
	/*
	 * We don't care about the return value. If we fail to log the new name
	 * then we know the next attempt to sync the log will fall back to a
	 * full transaction commit (due to a call to
	 * btrfs_set_log_full_commit()), so we don't need to worry about
	 * getting a log committed that has an inconsistent state after a
	 * rename operation.
	 */
	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
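
/*
 * Typical call sites for btrfs_log_new_name(), as a sketch only; the real
 * callers live in the rename and link paths of inode.c and their exact shape
 * may differ across kernel versions:
 *
 *	btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
 *			   new_dentry->d_parent);	(after a rename)
 *	btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
 *							(after a link)
 */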