// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "dir-item.h"
#include "file-item.h"
#include "file.h"
#include "orphan.h"
#include "tree-checker.h"

#define MAX_CONFLICT_INODES 10

/*
 * Magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find.  The second
 * stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
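 * The stage after that (LOG_WALK_REPLAY_DIR_INDEX) replays directory
 * index items.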
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times, once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = BTRFS_LOG_FORCE_COMMIT;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;

		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there are no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.
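 * It is one of the LOG_WALK_* values defined above.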
 * The others are state fields used for that specific part.
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		struct btrfs_tree_parent_check check = {
			.level = level,
			.transid = gen
		};

		ret = btrfs_read_extent_buffer(eb, &check);
		if (ret)
			return ret;
	}

	if (wc->pin) {
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);
		if (ret)
			return ret;

		if (btrfs_buffer_uptodate(eb, gen, 0) &&
		    btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	/*
	 * This is only used during log replay, so the root is always from a
	 * fs/subvolume tree. In case we ever need to support a log root, then
	 * we'll have to clone the leaf in the path, release the path and use
	 * the leaf before writing into the log tree. See the comments at
	 * copy_items() for more details.
	 */
	ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size(path->nodes[0],
					       path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;

		found_size = btrfs_item_size(path->nodes[0],
					     path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(trans, path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(trans, path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;

		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
			       struct fscrypt_str *name)
{
	char *buf;

	buf = kmalloc(len, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(eb, buf, (unsigned long)start, len);
	name->name = buf;
	name->len = len;
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root.
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
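		 * (a file extent item with a disk_bytenr of 0 describes a hole)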
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item));
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?
			 * If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_list(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0, false);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
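			 *
			 * Deleting the existing csum items first and then
			 * (re)inserting the ones found in the log, as done
			 * below, avoids that overlap.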
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->logical);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->logical,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	iput(inode);
	return ret;
}

static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const struct fscrypt_str *name)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree. So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	struct fscrypt_str name;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
	if (ret)
		return -ENOMEM;

	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
out:
	kfree(name.name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
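 *
 * Both the dir index key (looked up with the sequence number) and the dir
 * item key (looked up with the name) are checked.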
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 struct fscrypt_str *name)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const struct fscrypt_str *name)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid, name);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0], name);
out:
	btrfs_free_path(path);
	return ret;
}

static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, struct fscrypt_str *name)
{
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.
		 * if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			struct fscrypt_str victim_name;

			victim_ref = (struct btrfs_inode_ref *)ptr;
			ret = read_alloc_one_name(leaf, (victim_ref + 1),
				 btrfs_inode_ref_name_len(leaf, victim_ref),
				 &victim_name);
			if (ret)
				return ret;

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
						&victim_name);
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
		}
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			struct fscrypt_str victim_name;

			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			ret = read_alloc_one_name(leaf, &extref->name,
				 btrfs_inode_extref_name_len(leaf, extref),
				 &victim_name);
			if (ret)
				return ret;

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name.name,
							      victim_name.len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, &victim_name);
			if (ret < 0) {
				kfree(victim_name.name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							BTRFS_I(victim_parent),
							inode, &victim_name);
				}
				iput(victim_parent);
				kfree(victim_name.name);
				if (ret)
					return ret;
				goto again;
			}
			kfree(victim_name.name);
next:
			cur_offset += victim_name.len + sizeof(*extref);
		}
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

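/*
 * Read the name stored in an extended inode ref item and, optionally, return
 * the directory index number and the parent directory objectid it carries.
 */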
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     struct fscrypt_str *name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;
	int ret;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	ret = read_alloc_one_name(eb, &extref->name,
				  btrfs_inode_extref_name_len(eb, extref), name);
	if (ret)
		return ret;

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  struct fscrypt_str *name, u64 *index)
{
	struct btrfs_inode_ref *ref;
	int ret;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
				  name);
	if (ret)
		return ret;

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		struct fscrypt_str name;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &name, NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, &name);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name.name);
				goto out;
			}
			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
						 inode, &name);
			kfree(name.name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name.name);
		ref_ptr += name.len;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
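 * A single item may pack several names, one after another.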
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct fscrypt_str name;
	int ret;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index, &name);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */
			ret = __add_inode_ref(trans, root, path, log,
					      BTRFS_I(dir), BTRFS_I(inode),
					      inode_objectid, parent_objectid,
					      ref_index, &name);
			if (ret) {
				if (ret == 1)
					ret = 0;
				goto out;
			}

			/* insert our name */
			ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
					     &name, 0, ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/*
		 * Else, ret == 1, we already have a perfect match, we're done.
		 */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
		kfree(name.name);
		name.name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name.name);
	iput(dir);
	iput(inode);
	return ret;
}

static int count_inode_extrefs(struct btrfs_root *root,
			       struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

static int count_inode_refs(struct btrfs_root *root,
			    struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few
 * corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}


/*
 * record a given inode in the fixup dir so we can check its link
 * count when replay is done.
 * The link count is incremented here
 * so the inode won't go away until we check it
 */
static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid)
{
	struct btrfs_key key;
	int ret = 0;
	struct inode *inode;

	inode = read_one_inode(root, objectid);
	if (!inode)
		return -EIO;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);

	btrfs_release_path(path);
	if (ret == 0) {
		if (!inode->i_nlink)
			set_nlink(inode, 1);
		else
			inc_nlink(inode);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
	} else if (ret == -EEXIST) {
		ret = 0;
	}
	iput(inode);

	return ret;
}

/*
 * when replaying the log for a directory, we only insert names
 * for inodes that actually exist.  This means an fsync on a directory
 * does not implicitly fsync all the new files in it
 */
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 dirid, u64 index,
				    const struct fscrypt_str *name,
				    struct btrfs_key *location)
{
	struct inode *inode;
	struct inode *dir;
	int ret;

	inode = read_one_inode(root, location->objectid);
	if (!inode)
		return -ENOENT;

	dir = read_one_inode(root, dirid);
	if (!dir) {
		iput(inode);
		return -EIO;
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
			     1, index);

	/* FIXME, put inode into FIXUP list */

	iput(inode);
	iput(dir);
	return ret;
}

static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
					struct btrfs_inode *dir,
					struct btrfs_path *path,
					struct btrfs_dir_item *dst_di,
					const struct btrfs_key *log_key,
					u8 log_flags,
					bool exists)
{
	struct btrfs_key found_key;

	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
	/* The existing dentry points to the same inode, don't delete it. */
	if (found_key.objectid == log_key->objectid &&
	    found_key.type == log_key->type &&
	    found_key.offset == log_key->offset &&
	    btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
		return 1;

	/*
	 * Don't drop the conflicting directory entry if the inode for the new
	 * entry doesn't exist.
	 */
	if (!exists)
		return 0;

	return drop_one_dir_item(trans, path, dir, dst_di);
}

/*
 * take a single entry in a log directory item and replay it into
 * the subvolume.
 *
 * if a conflicting item exists in the subdirectory already,
 * the inode it points to is unlinked and put into the link count
 * fix up tree.
 *
 * If a name from the log points to a file or directory that does
 * not exist in the FS, it is skipped.  fsyncs on directories
 * do not force down inodes inside that directory, just changes to the
 * names or unlinks in a directory.
 *
 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
 * non-existing inode) and 1 if the name was replayed.
 */
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *eb,
				    struct btrfs_dir_item *di,
				    struct btrfs_key *key)
{
	struct fscrypt_str name;
	struct btrfs_dir_item *dir_dst_di;
	struct btrfs_dir_item *index_dst_di;
	bool dir_dst_matches = false;
	bool index_dst_matches = false;
	struct btrfs_key log_key;
	struct btrfs_key search_key;
	struct inode *dir;
	u8 log_flags;
	bool exists;
	int ret;
	bool update_size = true;
	bool name_added = false;

	dir = read_one_inode(root, key->objectid);
	if (!dir)
		return -EIO;

	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
	if (ret)
		goto out;

	log_flags = btrfs_dir_flags(eb, di);
	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
	btrfs_release_path(path);
	if (ret < 0)
		goto out;
	exists = (ret == 0);
	ret = 0;

	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
					   &name, 1);
	if (IS_ERR(dir_dst_di)) {
		ret = PTR_ERR(dir_dst_di);
		goto out;
	} else if (dir_dst_di) {
		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
						   dir_dst_di, &log_key,
						   log_flags, exists);
		if (ret < 0)
			goto out;
		dir_dst_matches = (ret == 1);
	}

	btrfs_release_path(path);

	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
						   key->objectid, key->offset,
						   &name, 1);
	if (IS_ERR(index_dst_di)) {
		ret = PTR_ERR(index_dst_di);
		goto out;
	} else if (index_dst_di) {
		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
						   index_dst_di, &log_key,
						   log_flags, exists);
		if (ret < 0)
			goto out;
		index_dst_matches = (ret == 1);
	}

	btrfs_release_path(path);

	if (dir_dst_matches && index_dst_matches) {
		ret = 0;
		update_size = false;
		goto out;
	}

	/*
	 * Check if the inode reference exists in the log for the given name,
	 * inode and parent inode
	 */
	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &search_key, 0, &name);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/* The dentry will be added later. */
		ret = 0;
		update_size = false;
		goto out;
	}

	search_key.objectid = log_key.objectid;
	search_key.type = BTRFS_INODE_EXTREF_KEY;
	search_key.offset = key->objectid;
	ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		/*
		 * The dentry will be added later.
		 */
		ret = 0;
		update_size = false;
		goto out;
	}
	btrfs_release_path(path);
	ret = insert_one_name(trans, root, key->objectid, key->offset,
			      &name, &log_key);
	if (ret && ret != -ENOENT && ret != -EEXIST)
		goto out;
	if (!ret)
		name_added = true;
	update_size = false;
	ret = 0;

out:
	if (!ret && update_size) {
		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
	}
	kfree(name.name);
	iput(dir);
	if (!ret && name_added)
		ret = 1;
	return ret;
}

/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					struct extent_buffer *eb, int slot,
					struct btrfs_key *key)
{
	int ret;
	struct btrfs_dir_item *di;

	/* We only log dir index keys, which only contain a single dir item. */
	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);

	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	ret = replay_one_name(trans, root, path, eb, di, key);
	if (ret < 0)
		return ret;

	/*
	 * If this entry refers to a non-directory (directories can not have a
	 * link count > 1) and it was added in the transaction that was not
	 * committed, make sure we fixup the link count of the inode the entry
	 * points to. Otherwise something like the following would result in a
	 * directory pointing to an inode with a wrong link count that does not
	 * account for this dir entry:
	 *
	 * mkdir testdir
	 * touch testdir/foo
	 * touch testdir/bar
	 * sync
	 *
	 * ln testdir/bar testdir/bar_link
	 * ln testdir/foo testdir/foo_link
	 * xfs_io -c "fsync" testdir/bar
	 *
	 * <power failure>
	 *
	 * mount fs, log replay happens
	 *
	 * File foo would remain with a link count of 1 when it has two entries
	 * pointing to it in the directory testdir. This would make it impossible
	 * to ever delete the parent directory as it would result in stale
	 * dentries that can never be deleted.
	 */
	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
		struct btrfs_path *fixup_path;
		struct btrfs_key di_key;

		fixup_path = btrfs_alloc_path();
		if (!fixup_path)
			return -ENOMEM;

		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
		btrfs_free_path(fixup_path);
	}

	return ret;
}

/*
 * directory replay has two parts.  There are the standard directory
 * items in the log copied from the subvolume, and range items
 * created in the log while the subvolume was logged.
 *
 * The range items tell us which parts of the key space the log
 * is authoritative for.  During replay, if a key in the subvolume
 * directory is in a logged range item, but not actually in the log
 * that means it was deleted from the directory before the fsync
 * and should be removed.
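 *
 * find_dir_range() below locates the logged index key range that contains
 * *start_ret, or the next such range after it: it returns 0 and sets
 * *start_ret and *end_ret on success, or 1 if there is no such range.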
2021 */ 2022 static noinline int find_dir_range(struct btrfs_root *root, 2023 struct btrfs_path *path, 2024 u64 dirid, 2025 u64 *start_ret, u64 *end_ret) 2026 { 2027 struct btrfs_key key; 2028 u64 found_end; 2029 struct btrfs_dir_log_item *item; 2030 int ret; 2031 int nritems; 2032 2033 if (*start_ret == (u64)-1) 2034 return 1; 2035 2036 key.objectid = dirid; 2037 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2038 key.offset = *start_ret; 2039 2040 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2041 if (ret < 0) 2042 goto out; 2043 if (ret > 0) { 2044 if (path->slots[0] == 0) 2045 goto out; 2046 path->slots[0]--; 2047 } 2048 if (ret != 0) 2049 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2050 2051 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2052 ret = 1; 2053 goto next; 2054 } 2055 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2056 struct btrfs_dir_log_item); 2057 found_end = btrfs_dir_log_end(path->nodes[0], item); 2058 2059 if (*start_ret >= key.offset && *start_ret <= found_end) { 2060 ret = 0; 2061 *start_ret = key.offset; 2062 *end_ret = found_end; 2063 goto out; 2064 } 2065 ret = 1; 2066 next: 2067 /* check the next slot in the tree to see if it is a valid item */ 2068 nritems = btrfs_header_nritems(path->nodes[0]); 2069 path->slots[0]++; 2070 if (path->slots[0] >= nritems) { 2071 ret = btrfs_next_leaf(root, path); 2072 if (ret) 2073 goto out; 2074 } 2075 2076 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2077 2078 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2079 ret = 1; 2080 goto out; 2081 } 2082 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2083 struct btrfs_dir_log_item); 2084 found_end = btrfs_dir_log_end(path->nodes[0], item); 2085 *start_ret = key.offset; 2086 *end_ret = found_end; 2087 ret = 0; 2088 out: 2089 btrfs_release_path(path); 2090 return ret; 2091 } 2092 2093 /* 2094 * this looks for a given directory item in the log. If the directory 2095 * item is not in the log, the item is removed and the inode it points 2096 * to is unlinked 2097 */ 2098 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2099 struct btrfs_root *log, 2100 struct btrfs_path *path, 2101 struct btrfs_path *log_path, 2102 struct inode *dir, 2103 struct btrfs_key *dir_key) 2104 { 2105 struct btrfs_root *root = BTRFS_I(dir)->root; 2106 int ret; 2107 struct extent_buffer *eb; 2108 int slot; 2109 struct btrfs_dir_item *di; 2110 struct fscrypt_str name; 2111 struct inode *inode = NULL; 2112 struct btrfs_key location; 2113 2114 /* 2115 * Currently we only log dir index keys. Even if we replay a log created 2116 * by an older kernel that logged both dir index and dir item keys, all 2117 * we need to do is process the dir index keys, we (and our caller) can 2118 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 2119 */ 2120 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2121 2122 eb = path->nodes[0]; 2123 slot = path->slots[0]; 2124 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2125 ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); 2126 if (ret) 2127 goto out; 2128 2129 if (log) { 2130 struct btrfs_dir_item *log_di; 2131 2132 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2133 dir_key->objectid, 2134 dir_key->offset, &name, 0); 2135 if (IS_ERR(log_di)) { 2136 ret = PTR_ERR(log_di); 2137 goto out; 2138 } else if (log_di) { 2139 /* The dentry exists in the log, we have nothing to do. 
*/ 2140 ret = 0; 2141 goto out; 2142 } 2143 } 2144 2145 btrfs_dir_item_key_to_cpu(eb, di, &location); 2146 btrfs_release_path(path); 2147 btrfs_release_path(log_path); 2148 inode = read_one_inode(root, location.objectid); 2149 if (!inode) { 2150 ret = -EIO; 2151 goto out; 2152 } 2153 2154 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2155 if (ret) 2156 goto out; 2157 2158 inc_nlink(inode); 2159 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2160 &name); 2161 /* 2162 * Unlike dir item keys, dir index keys can only have one name (entry) in 2163 * them, as there are no key collisions since each key has a unique offset 2164 * (an index number), so we're done. 2165 */ 2166 out: 2167 btrfs_release_path(path); 2168 btrfs_release_path(log_path); 2169 kfree(name.name); 2170 iput(inode); 2171 return ret; 2172 } 2173 2174 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2175 struct btrfs_root *root, 2176 struct btrfs_root *log, 2177 struct btrfs_path *path, 2178 const u64 ino) 2179 { 2180 struct btrfs_key search_key; 2181 struct btrfs_path *log_path; 2182 int i; 2183 int nritems; 2184 int ret; 2185 2186 log_path = btrfs_alloc_path(); 2187 if (!log_path) 2188 return -ENOMEM; 2189 2190 search_key.objectid = ino; 2191 search_key.type = BTRFS_XATTR_ITEM_KEY; 2192 search_key.offset = 0; 2193 again: 2194 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2195 if (ret < 0) 2196 goto out; 2197 process_leaf: 2198 nritems = btrfs_header_nritems(path->nodes[0]); 2199 for (i = path->slots[0]; i < nritems; i++) { 2200 struct btrfs_key key; 2201 struct btrfs_dir_item *di; 2202 struct btrfs_dir_item *log_di; 2203 u32 total_size; 2204 u32 cur; 2205 2206 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2207 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2208 ret = 0; 2209 goto out; 2210 } 2211 2212 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2213 total_size = btrfs_item_size(path->nodes[0], i); 2214 cur = 0; 2215 while (cur < total_size) { 2216 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2217 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2218 u32 this_len = sizeof(*di) + name_len + data_len; 2219 char *name; 2220 2221 name = kmalloc(name_len, GFP_NOFS); 2222 if (!name) { 2223 ret = -ENOMEM; 2224 goto out; 2225 } 2226 read_extent_buffer(path->nodes[0], name, 2227 (unsigned long)(di + 1), name_len); 2228 2229 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2230 name, name_len, 0); 2231 btrfs_release_path(log_path); 2232 if (!log_di) { 2233 /* Doesn't exist in log tree, so delete it. 
*/ 2234 btrfs_release_path(path); 2235 di = btrfs_lookup_xattr(trans, root, path, ino, 2236 name, name_len, -1); 2237 kfree(name); 2238 if (IS_ERR(di)) { 2239 ret = PTR_ERR(di); 2240 goto out; 2241 } 2242 ASSERT(di); 2243 ret = btrfs_delete_one_dir_name(trans, root, 2244 path, di); 2245 if (ret) 2246 goto out; 2247 btrfs_release_path(path); 2248 search_key = key; 2249 goto again; 2250 } 2251 kfree(name); 2252 if (IS_ERR(log_di)) { 2253 ret = PTR_ERR(log_di); 2254 goto out; 2255 } 2256 cur += this_len; 2257 di = (struct btrfs_dir_item *)((char *)di + this_len); 2258 } 2259 } 2260 ret = btrfs_next_leaf(root, path); 2261 if (ret > 0) 2262 ret = 0; 2263 else if (ret == 0) 2264 goto process_leaf; 2265 out: 2266 btrfs_free_path(log_path); 2267 btrfs_release_path(path); 2268 return ret; 2269 } 2270 2271 2272 /* 2273 * deletion replay happens before we copy any new directory items 2274 * out of the log or out of backreferences from inodes. It 2275 * scans the log to find ranges of keys that log is authoritative for, 2276 * and then scans the directory to find items in those ranges that are 2277 * not present in the log. 2278 * 2279 * Anything we don't find in the log is unlinked and removed from the 2280 * directory. 2281 */ 2282 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2283 struct btrfs_root *root, 2284 struct btrfs_root *log, 2285 struct btrfs_path *path, 2286 u64 dirid, int del_all) 2287 { 2288 u64 range_start; 2289 u64 range_end; 2290 int ret = 0; 2291 struct btrfs_key dir_key; 2292 struct btrfs_key found_key; 2293 struct btrfs_path *log_path; 2294 struct inode *dir; 2295 2296 dir_key.objectid = dirid; 2297 dir_key.type = BTRFS_DIR_INDEX_KEY; 2298 log_path = btrfs_alloc_path(); 2299 if (!log_path) 2300 return -ENOMEM; 2301 2302 dir = read_one_inode(root, dirid); 2303 /* it isn't an error if the inode isn't there, that can happen 2304 * because we replay the deletes before we copy in the inode item 2305 * from the log 2306 */ 2307 if (!dir) { 2308 btrfs_free_path(log_path); 2309 return 0; 2310 } 2311 2312 range_start = 0; 2313 range_end = 0; 2314 while (1) { 2315 if (del_all) 2316 range_end = (u64)-1; 2317 else { 2318 ret = find_dir_range(log, path, dirid, 2319 &range_start, &range_end); 2320 if (ret < 0) 2321 goto out; 2322 else if (ret > 0) 2323 break; 2324 } 2325 2326 dir_key.offset = range_start; 2327 while (1) { 2328 int nritems; 2329 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2330 0, 0); 2331 if (ret < 0) 2332 goto out; 2333 2334 nritems = btrfs_header_nritems(path->nodes[0]); 2335 if (path->slots[0] >= nritems) { 2336 ret = btrfs_next_leaf(root, path); 2337 if (ret == 1) 2338 break; 2339 else if (ret < 0) 2340 goto out; 2341 } 2342 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2343 path->slots[0]); 2344 if (found_key.objectid != dirid || 2345 found_key.type != dir_key.type) { 2346 ret = 0; 2347 goto out; 2348 } 2349 2350 if (found_key.offset > range_end) 2351 break; 2352 2353 ret = check_item_in_log(trans, log, path, 2354 log_path, dir, 2355 &found_key); 2356 if (ret) 2357 goto out; 2358 if (found_key.offset == (u64)-1) 2359 break; 2360 dir_key.offset = found_key.offset + 1; 2361 } 2362 btrfs_release_path(path); 2363 if (range_end == (u64)-1) 2364 break; 2365 range_start = range_end + 1; 2366 } 2367 ret = 0; 2368 out: 2369 btrfs_release_path(path); 2370 btrfs_free_path(log_path); 2371 iput(dir); 2372 return ret; 2373 } 2374 2375 /* 2376 * the process_func used to replay items from the log tree. This 2377 * gets called in two different stages. 
The first stage just looks 2378 * for inodes and makes sure they are all copied into the subvolume. 2379 * 2380 * The second stage copies all the other item types from the log into 2381 * the subvolume. The two stage approach is slower, but gets rid of 2382 * lots of complexity around inodes referencing other inodes that exist 2383 * only in the log (references come from either directory items or inode 2384 * back refs). 2385 */ 2386 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2387 struct walk_control *wc, u64 gen, int level) 2388 { 2389 int nritems; 2390 struct btrfs_tree_parent_check check = { 2391 .transid = gen, 2392 .level = level 2393 }; 2394 struct btrfs_path *path; 2395 struct btrfs_root *root = wc->replay_dest; 2396 struct btrfs_key key; 2397 int i; 2398 int ret; 2399 2400 ret = btrfs_read_extent_buffer(eb, &check); 2401 if (ret) 2402 return ret; 2403 2404 level = btrfs_header_level(eb); 2405 2406 if (level != 0) 2407 return 0; 2408 2409 path = btrfs_alloc_path(); 2410 if (!path) 2411 return -ENOMEM; 2412 2413 nritems = btrfs_header_nritems(eb); 2414 for (i = 0; i < nritems; i++) { 2415 btrfs_item_key_to_cpu(eb, &key, i); 2416 2417 /* inode keys are done during the first stage */ 2418 if (key.type == BTRFS_INODE_ITEM_KEY && 2419 wc->stage == LOG_WALK_REPLAY_INODES) { 2420 struct btrfs_inode_item *inode_item; 2421 u32 mode; 2422 2423 inode_item = btrfs_item_ptr(eb, i, 2424 struct btrfs_inode_item); 2425 /* 2426 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2427 * and never got linked before the fsync, skip it, as 2428 * replaying it is pointless since it would be deleted 2429 * later. We skip logging tmpfiles, but it's always 2430 * possible we are replaying a log created with a kernel 2431 * that used to log tmpfiles. 2432 */ 2433 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2434 wc->ignore_cur_inode = true; 2435 continue; 2436 } else { 2437 wc->ignore_cur_inode = false; 2438 } 2439 ret = replay_xattr_deletes(wc->trans, root, log, 2440 path, key.objectid); 2441 if (ret) 2442 break; 2443 mode = btrfs_inode_mode(eb, inode_item); 2444 if (S_ISDIR(mode)) { 2445 ret = replay_dir_deletes(wc->trans, 2446 root, log, path, key.objectid, 0); 2447 if (ret) 2448 break; 2449 } 2450 ret = overwrite_item(wc->trans, root, path, 2451 eb, i, &key); 2452 if (ret) 2453 break; 2454 2455 /* 2456 * Before replaying extents, truncate the inode to its 2457 * size. We need to do it now and not after log replay 2458 * because before an fsync we can have prealloc extents 2459 * added beyond the inode's i_size. If we did it after, 2460 * through orphan cleanup for example, we would drop 2461 * those prealloc extents just after replaying them. 2462 */ 2463 if (S_ISREG(mode)) { 2464 struct btrfs_drop_extents_args drop_args = { 0 }; 2465 struct inode *inode; 2466 u64 from; 2467 2468 inode = read_one_inode(root, key.objectid); 2469 if (!inode) { 2470 ret = -EIO; 2471 break; 2472 } 2473 from = ALIGN(i_size_read(inode), 2474 root->fs_info->sectorsize); 2475 drop_args.start = from; 2476 drop_args.end = (u64)-1; 2477 drop_args.drop_cache = true; 2478 ret = btrfs_drop_extents(wc->trans, root, 2479 BTRFS_I(inode), 2480 &drop_args); 2481 if (!ret) { 2482 inode_sub_bytes(inode, 2483 drop_args.bytes_found); 2484 /* Update the inode's nbytes. 
*/ 2485 ret = btrfs_update_inode(wc->trans, 2486 root, BTRFS_I(inode)); 2487 } 2488 iput(inode); 2489 if (ret) 2490 break; 2491 } 2492 2493 ret = link_to_fixup_dir(wc->trans, root, 2494 path, key.objectid); 2495 if (ret) 2496 break; 2497 } 2498 2499 if (wc->ignore_cur_inode) 2500 continue; 2501 2502 if (key.type == BTRFS_DIR_INDEX_KEY && 2503 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2504 ret = replay_one_dir_item(wc->trans, root, path, 2505 eb, i, &key); 2506 if (ret) 2507 break; 2508 } 2509 2510 if (wc->stage < LOG_WALK_REPLAY_ALL) 2511 continue; 2512 2513 /* these keys are simply copied */ 2514 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2515 ret = overwrite_item(wc->trans, root, path, 2516 eb, i, &key); 2517 if (ret) 2518 break; 2519 } else if (key.type == BTRFS_INODE_REF_KEY || 2520 key.type == BTRFS_INODE_EXTREF_KEY) { 2521 ret = add_inode_ref(wc->trans, root, log, path, 2522 eb, i, &key); 2523 if (ret && ret != -ENOENT) 2524 break; 2525 ret = 0; 2526 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2527 ret = replay_one_extent(wc->trans, root, path, 2528 eb, i, &key); 2529 if (ret) 2530 break; 2531 } 2532 /* 2533 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2534 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2535 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2536 * older kernel with such keys, ignore them. 2537 */ 2538 } 2539 btrfs_free_path(path); 2540 return ret; 2541 } 2542 2543 /* 2544 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2545 */ 2546 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2547 { 2548 struct btrfs_block_group *cache; 2549 2550 cache = btrfs_lookup_block_group(fs_info, start); 2551 if (!cache) { 2552 btrfs_err(fs_info, "unable to find block group for %llu", start); 2553 return; 2554 } 2555 2556 spin_lock(&cache->space_info->lock); 2557 spin_lock(&cache->lock); 2558 cache->reserved -= fs_info->nodesize; 2559 cache->space_info->bytes_reserved -= fs_info->nodesize; 2560 spin_unlock(&cache->lock); 2561 spin_unlock(&cache->space_info->lock); 2562 2563 btrfs_put_block_group(cache); 2564 } 2565 2566 static int clean_log_buffer(struct btrfs_trans_handle *trans, 2567 struct extent_buffer *eb) 2568 { 2569 int ret; 2570 2571 btrfs_tree_lock(eb); 2572 btrfs_clear_buffer_dirty(trans, eb); 2573 wait_on_extent_buffer_writeback(eb); 2574 btrfs_tree_unlock(eb); 2575 2576 if (trans) { 2577 ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); 2578 if (ret) 2579 return ret; 2580 btrfs_redirty_list_add(trans->transaction, eb); 2581 } else { 2582 unaccount_log_buffer(eb->fs_info, eb->start); 2583 } 2584 2585 return 0; 2586 } 2587 2588 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2589 struct btrfs_root *root, 2590 struct btrfs_path *path, int *level, 2591 struct walk_control *wc) 2592 { 2593 struct btrfs_fs_info *fs_info = root->fs_info; 2594 u64 bytenr; 2595 u64 ptr_gen; 2596 struct extent_buffer *next; 2597 struct extent_buffer *cur; 2598 int ret = 0; 2599 2600 while (*level > 0) { 2601 struct btrfs_tree_parent_check check = { 0 }; 2602 2603 cur = path->nodes[*level]; 2604 2605 WARN_ON(btrfs_header_level(cur) != *level); 2606 2607 if (path->slots[*level] >= 2608 btrfs_header_nritems(cur)) 2609 break; 2610 2611 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2612 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2613 check.transid = ptr_gen; 2614 check.level = *level - 1; 2615 check.has_first_key = true; 2616 btrfs_node_key_to_cpu(cur, 
&check.first_key, path->slots[*level]); 2617 2618 next = btrfs_find_create_tree_block(fs_info, bytenr, 2619 btrfs_header_owner(cur), 2620 *level - 1); 2621 if (IS_ERR(next)) 2622 return PTR_ERR(next); 2623 2624 if (*level == 1) { 2625 ret = wc->process_func(root, next, wc, ptr_gen, 2626 *level - 1); 2627 if (ret) { 2628 free_extent_buffer(next); 2629 return ret; 2630 } 2631 2632 path->slots[*level]++; 2633 if (wc->free) { 2634 ret = btrfs_read_extent_buffer(next, &check); 2635 if (ret) { 2636 free_extent_buffer(next); 2637 return ret; 2638 } 2639 2640 ret = clean_log_buffer(trans, next); 2641 if (ret) { 2642 free_extent_buffer(next); 2643 return ret; 2644 } 2645 } 2646 free_extent_buffer(next); 2647 continue; 2648 } 2649 ret = btrfs_read_extent_buffer(next, &check); 2650 if (ret) { 2651 free_extent_buffer(next); 2652 return ret; 2653 } 2654 2655 if (path->nodes[*level-1]) 2656 free_extent_buffer(path->nodes[*level-1]); 2657 path->nodes[*level-1] = next; 2658 *level = btrfs_header_level(next); 2659 path->slots[*level] = 0; 2660 cond_resched(); 2661 } 2662 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2663 2664 cond_resched(); 2665 return 0; 2666 } 2667 2668 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2669 struct btrfs_root *root, 2670 struct btrfs_path *path, int *level, 2671 struct walk_control *wc) 2672 { 2673 int i; 2674 int slot; 2675 int ret; 2676 2677 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2678 slot = path->slots[i]; 2679 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2680 path->slots[i]++; 2681 *level = i; 2682 WARN_ON(*level == 0); 2683 return 0; 2684 } else { 2685 ret = wc->process_func(root, path->nodes[*level], wc, 2686 btrfs_header_generation(path->nodes[*level]), 2687 *level); 2688 if (ret) 2689 return ret; 2690 2691 if (wc->free) { 2692 ret = clean_log_buffer(trans, path->nodes[*level]); 2693 if (ret) 2694 return ret; 2695 } 2696 free_extent_buffer(path->nodes[*level]); 2697 path->nodes[*level] = NULL; 2698 *level = i + 1; 2699 } 2700 } 2701 return 1; 2702 } 2703 2704 /* 2705 * drop the reference count on the tree rooted at 'snap'. This traverses 2706 * the tree freeing any blocks that have a ref count of zero after being 2707 * decremented. 2708 */ 2709 static int walk_log_tree(struct btrfs_trans_handle *trans, 2710 struct btrfs_root *log, struct walk_control *wc) 2711 { 2712 int ret = 0; 2713 int wret; 2714 int level; 2715 struct btrfs_path *path; 2716 int orig_level; 2717 2718 path = btrfs_alloc_path(); 2719 if (!path) 2720 return -ENOMEM; 2721 2722 level = btrfs_header_level(log->node); 2723 orig_level = level; 2724 path->nodes[level] = log->node; 2725 atomic_inc(&log->node->refs); 2726 path->slots[level] = 0; 2727 2728 while (1) { 2729 wret = walk_down_log_tree(trans, log, path, &level, wc); 2730 if (wret > 0) 2731 break; 2732 if (wret < 0) { 2733 ret = wret; 2734 goto out; 2735 } 2736 2737 wret = walk_up_log_tree(trans, log, path, &level, wc); 2738 if (wret > 0) 2739 break; 2740 if (wret < 0) { 2741 ret = wret; 2742 goto out; 2743 } 2744 } 2745 2746 /* was the root node processed? 
if not, catch it here */ 2747 if (path->nodes[orig_level]) { 2748 ret = wc->process_func(log, path->nodes[orig_level], wc, 2749 btrfs_header_generation(path->nodes[orig_level]), 2750 orig_level); 2751 if (ret) 2752 goto out; 2753 if (wc->free) 2754 ret = clean_log_buffer(trans, path->nodes[orig_level]); 2755 } 2756
2757 out: 2758 btrfs_free_path(path); 2759 return ret; 2760 } 2761
2762 /* 2763 * helper function to update the item for a given subvolume's log root 2764 * in the tree of log roots 2765 */ 2766 static int update_log_root(struct btrfs_trans_handle *trans, 2767 struct btrfs_root *log, 2768 struct btrfs_root_item *root_item) 2769 { 2770 struct btrfs_fs_info *fs_info = log->fs_info; 2771 int ret; 2772
2773 if (log->log_transid == 1) { 2774 /* insert root item on the first sync */ 2775 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2776 &log->root_key, root_item); 2777 } else { 2778 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2779 &log->root_key, root_item); 2780 } 2781 return ret; 2782 } 2783
2784 static void wait_log_commit(struct btrfs_root *root, int transid) 2785 { 2786 DEFINE_WAIT(wait); 2787 int index = transid % 2; 2788
2789 /* 2790 * we only allow two pending log transactions at a time, 2791 * so we know that if ours is more than 2 older than the 2792 * current transaction, we're done 2793 */ 2794 for (;;) { 2795 prepare_to_wait(&root->log_commit_wait[index], 2796 &wait, TASK_UNINTERRUPTIBLE); 2797
2798 if (!(root->log_transid_committed < transid && 2799 atomic_read(&root->log_commit[index]))) 2800 break; 2801
2802 mutex_unlock(&root->log_mutex); 2803 schedule(); 2804 mutex_lock(&root->log_mutex); 2805 } 2806 finish_wait(&root->log_commit_wait[index], &wait); 2807 } 2808
2809 static void wait_for_writer(struct btrfs_root *root) 2810 { 2811 DEFINE_WAIT(wait); 2812
2813 for (;;) { 2814 prepare_to_wait(&root->log_writer_wait, &wait, 2815 TASK_UNINTERRUPTIBLE); 2816 if (!atomic_read(&root->log_writers)) 2817 break; 2818
2819 mutex_unlock(&root->log_mutex); 2820 schedule(); 2821 mutex_lock(&root->log_mutex); 2822 } 2823 finish_wait(&root->log_writer_wait, &wait); 2824 } 2825
2826 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2827 struct btrfs_log_ctx *ctx) 2828 { 2829 mutex_lock(&root->log_mutex); 2830 list_del_init(&ctx->list); 2831 mutex_unlock(&root->log_mutex); 2832 } 2833
2834 /* 2835 * Invoked in log mutex context, or when we are sure there is no other task 2836 * which can access the list. 2837 */ 2838 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2839 int index, int error) 2840 { 2841 struct btrfs_log_ctx *ctx; 2842 struct btrfs_log_ctx *safe; 2843
2844 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2845 list_del_init(&ctx->list); 2846 ctx->log_ret = error; 2847 } 2848 } 2849
2850 /* 2851 * btrfs_sync_log sends a given tree log down to the disk and 2852 * updates the super blocks to record it. When this call is done, 2853 * you know that any inodes previously logged are safely on disk only 2854 * if it returns 0. 2855 * 2856 * Any other return value means you need to call btrfs_commit_transaction. 2857 * Some of the edge cases for fsyncing directories that have had unlinks 2858 * or renames done in the past mean that sometimes the only safe 2859 * fsync is to commit the whole FS. When btrfs_sync_log returns BTRFS_LOG_FORCE_COMMIT, 2860 * that has happened.
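 *
 * For illustration only (a sketch of the expected caller pattern, not a
 * verbatim copy of any call site in this tree), an fsync path is expected to
 * fall back to a full transaction commit on any non-zero return value:
 *
 *	ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_end_transaction(trans);
 *	else
 *		ret = btrfs_commit_transaction(trans);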
2861 */ 2862 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2863 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2864 { 2865 int index1; 2866 int index2; 2867 int mark; 2868 int ret; 2869 struct btrfs_fs_info *fs_info = root->fs_info; 2870 struct btrfs_root *log = root->log_root; 2871 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2872 struct btrfs_root_item new_root_item; 2873 int log_transid = 0; 2874 struct btrfs_log_ctx root_log_ctx; 2875 struct blk_plug plug; 2876 u64 log_root_start; 2877 u64 log_root_level; 2878 2879 mutex_lock(&root->log_mutex); 2880 log_transid = ctx->log_transid; 2881 if (root->log_transid_committed >= log_transid) { 2882 mutex_unlock(&root->log_mutex); 2883 return ctx->log_ret; 2884 } 2885 2886 index1 = log_transid % 2; 2887 if (atomic_read(&root->log_commit[index1])) { 2888 wait_log_commit(root, log_transid); 2889 mutex_unlock(&root->log_mutex); 2890 return ctx->log_ret; 2891 } 2892 ASSERT(log_transid == root->log_transid); 2893 atomic_set(&root->log_commit[index1], 1); 2894 2895 /* wait for previous tree log sync to complete */ 2896 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2897 wait_log_commit(root, log_transid - 1); 2898 2899 while (1) { 2900 int batch = atomic_read(&root->log_batch); 2901 /* when we're on an ssd, just kick the log commit out */ 2902 if (!btrfs_test_opt(fs_info, SSD) && 2903 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2904 mutex_unlock(&root->log_mutex); 2905 schedule_timeout_uninterruptible(1); 2906 mutex_lock(&root->log_mutex); 2907 } 2908 wait_for_writer(root); 2909 if (batch == atomic_read(&root->log_batch)) 2910 break; 2911 } 2912 2913 /* bail out if we need to do a full commit */ 2914 if (btrfs_need_log_full_commit(trans)) { 2915 ret = BTRFS_LOG_FORCE_COMMIT; 2916 mutex_unlock(&root->log_mutex); 2917 goto out; 2918 } 2919 2920 if (log_transid % 2 == 0) 2921 mark = EXTENT_DIRTY; 2922 else 2923 mark = EXTENT_NEW; 2924 2925 /* we start IO on all the marked extents here, but we don't actually 2926 * wait for them until later. 2927 */ 2928 blk_start_plug(&plug); 2929 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 2930 /* 2931 * -EAGAIN happens when someone, e.g., a concurrent transaction 2932 * commit, writes a dirty extent in this tree-log commit. This 2933 * concurrent write will create a hole writing out the extents, 2934 * and we cannot proceed on a zoned filesystem, requiring 2935 * sequential writing. While we can bail out to a full commit 2936 * here, but we can continue hoping the concurrent writing fills 2937 * the hole. 2938 */ 2939 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 2940 ret = 0; 2941 if (ret) { 2942 blk_finish_plug(&plug); 2943 btrfs_set_log_full_commit(trans); 2944 mutex_unlock(&root->log_mutex); 2945 goto out; 2946 } 2947 2948 /* 2949 * We _must_ update under the root->log_mutex in order to make sure we 2950 * have a consistent view of the log root we are trying to commit at 2951 * this moment. 2952 * 2953 * We _must_ copy this into a local copy, because we are not holding the 2954 * log_root_tree->log_mutex yet. This is important because when we 2955 * commit the log_root_tree we must have a consistent view of the 2956 * log_root_tree when we update the super block to point at the 2957 * log_root_tree bytenr. If we update the log_root_tree here we'll race 2958 * with the commit and possibly point at the new block which we may not 2959 * have written out. 
2960 */ 2961 btrfs_set_root_node(&log->root_item, log->node); 2962 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 2963 2964 root->log_transid++; 2965 log->log_transid = root->log_transid; 2966 root->log_start_pid = 0; 2967 /* 2968 * IO has been started, blocks of the log tree have WRITTEN flag set 2969 * in their headers. new modifications of the log will be written to 2970 * new positions. so it's safe to allow log writers to go in. 2971 */ 2972 mutex_unlock(&root->log_mutex); 2973 2974 if (btrfs_is_zoned(fs_info)) { 2975 mutex_lock(&fs_info->tree_root->log_mutex); 2976 if (!log_root_tree->node) { 2977 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 2978 if (ret) { 2979 mutex_unlock(&fs_info->tree_root->log_mutex); 2980 blk_finish_plug(&plug); 2981 goto out; 2982 } 2983 } 2984 mutex_unlock(&fs_info->tree_root->log_mutex); 2985 } 2986 2987 btrfs_init_log_ctx(&root_log_ctx, NULL); 2988 2989 mutex_lock(&log_root_tree->log_mutex); 2990 2991 index2 = log_root_tree->log_transid % 2; 2992 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 2993 root_log_ctx.log_transid = log_root_tree->log_transid; 2994 2995 /* 2996 * Now we are safe to update the log_root_tree because we're under the 2997 * log_mutex, and we're a current writer so we're holding the commit 2998 * open until we drop the log_mutex. 2999 */ 3000 ret = update_log_root(trans, log, &new_root_item); 3001 if (ret) { 3002 if (!list_empty(&root_log_ctx.list)) 3003 list_del_init(&root_log_ctx.list); 3004 3005 blk_finish_plug(&plug); 3006 btrfs_set_log_full_commit(trans); 3007 if (ret != -ENOSPC) 3008 btrfs_err(fs_info, 3009 "failed to update log for root %llu ret %d", 3010 root->root_key.objectid, ret); 3011 btrfs_wait_tree_log_extents(log, mark); 3012 mutex_unlock(&log_root_tree->log_mutex); 3013 goto out; 3014 } 3015 3016 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3017 blk_finish_plug(&plug); 3018 list_del_init(&root_log_ctx.list); 3019 mutex_unlock(&log_root_tree->log_mutex); 3020 ret = root_log_ctx.log_ret; 3021 goto out; 3022 } 3023 3024 index2 = root_log_ctx.log_transid % 2; 3025 if (atomic_read(&log_root_tree->log_commit[index2])) { 3026 blk_finish_plug(&plug); 3027 ret = btrfs_wait_tree_log_extents(log, mark); 3028 wait_log_commit(log_root_tree, 3029 root_log_ctx.log_transid); 3030 mutex_unlock(&log_root_tree->log_mutex); 3031 if (!ret) 3032 ret = root_log_ctx.log_ret; 3033 goto out; 3034 } 3035 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3036 atomic_set(&log_root_tree->log_commit[index2], 1); 3037 3038 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3039 wait_log_commit(log_root_tree, 3040 root_log_ctx.log_transid - 1); 3041 } 3042 3043 /* 3044 * now that we've moved on to the tree of log tree roots, 3045 * check the full commit flag again 3046 */ 3047 if (btrfs_need_log_full_commit(trans)) { 3048 blk_finish_plug(&plug); 3049 btrfs_wait_tree_log_extents(log, mark); 3050 mutex_unlock(&log_root_tree->log_mutex); 3051 ret = BTRFS_LOG_FORCE_COMMIT; 3052 goto out_wake_log_root; 3053 } 3054 3055 ret = btrfs_write_marked_extents(fs_info, 3056 &log_root_tree->dirty_log_pages, 3057 EXTENT_DIRTY | EXTENT_NEW); 3058 blk_finish_plug(&plug); 3059 /* 3060 * As described above, -EAGAIN indicates a hole in the extents. We 3061 * cannot wait for these write outs since the waiting cause a 3062 * deadlock. Bail out to the full commit instead. 
3063 */ 3064 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3065 btrfs_set_log_full_commit(trans); 3066 btrfs_wait_tree_log_extents(log, mark); 3067 mutex_unlock(&log_root_tree->log_mutex); 3068 goto out_wake_log_root; 3069 } else if (ret) { 3070 btrfs_set_log_full_commit(trans); 3071 mutex_unlock(&log_root_tree->log_mutex); 3072 goto out_wake_log_root; 3073 } 3074 ret = btrfs_wait_tree_log_extents(log, mark); 3075 if (!ret) 3076 ret = btrfs_wait_tree_log_extents(log_root_tree, 3077 EXTENT_NEW | EXTENT_DIRTY); 3078 if (ret) { 3079 btrfs_set_log_full_commit(trans); 3080 mutex_unlock(&log_root_tree->log_mutex); 3081 goto out_wake_log_root; 3082 } 3083
3084 log_root_start = log_root_tree->node->start; 3085 log_root_level = btrfs_header_level(log_root_tree->node); 3086 log_root_tree->log_transid++; 3087 mutex_unlock(&log_root_tree->log_mutex); 3088
3089 /* 3090 * Here we are guaranteed that nobody is going to write the superblock 3091 * for the current transaction before us, and that we do not write 3092 * our superblock before the previous transaction finishes its commit 3093 * and writes its superblock, because: 3094 * 3095 * 1) We are holding a handle on the current transaction, so nobody 3096 * can commit it until we release the handle; 3097 * 3098 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3099 * if the previous transaction is still committing, and hasn't yet 3100 * written its superblock, we wait for it to do it, because a 3101 * transaction commit acquires the tree_log_mutex when the commit 3102 * begins and releases it only after writing its superblock. 3103 */ 3104 mutex_lock(&fs_info->tree_log_mutex); 3105
3106 /* 3107 * The previous transaction writeout phase could have failed, and thus 3108 * marked the fs in an error state. We must not commit here, as we 3109 * could have updated our generation in the super_for_commit and 3110 * writing the super here would result in transid mismatches. If there 3111 * is an error here just bail. 3112 */ 3113 if (BTRFS_FS_ERROR(fs_info)) { 3114 ret = -EIO; 3115 btrfs_set_log_full_commit(trans); 3116 btrfs_abort_transaction(trans, ret); 3117 mutex_unlock(&fs_info->tree_log_mutex); 3118 goto out_wake_log_root; 3119 } 3120
3121 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3122 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3123 ret = write_all_supers(fs_info, 1); 3124 mutex_unlock(&fs_info->tree_log_mutex); 3125 if (ret) { 3126 btrfs_set_log_full_commit(trans); 3127 btrfs_abort_transaction(trans, ret); 3128 goto out_wake_log_root; 3129 } 3130
3131 /* 3132 * We know there can only be one task here, since we have not yet set 3133 * root->log_commit[index1] to 0 and any task attempting to sync the 3134 * log must wait for the previous log transaction to commit if it's 3135 * still in progress or wait for the current log transaction commit if 3136 * someone else already started it. We use <= and not < because the 3137 * first log transaction has an ID of 0.
3138 */ 3139 ASSERT(root->last_log_commit <= log_transid); 3140 root->last_log_commit = log_transid; 3141 3142 out_wake_log_root: 3143 mutex_lock(&log_root_tree->log_mutex); 3144 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3145 3146 log_root_tree->log_transid_committed++; 3147 atomic_set(&log_root_tree->log_commit[index2], 0); 3148 mutex_unlock(&log_root_tree->log_mutex); 3149 3150 /* 3151 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3152 * all the updates above are seen by the woken threads. It might not be 3153 * necessary, but proving that seems to be hard. 3154 */ 3155 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3156 out: 3157 mutex_lock(&root->log_mutex); 3158 btrfs_remove_all_log_ctxs(root, index1, ret); 3159 root->log_transid_committed++; 3160 atomic_set(&root->log_commit[index1], 0); 3161 mutex_unlock(&root->log_mutex); 3162 3163 /* 3164 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3165 * all the updates above are seen by the woken threads. It might not be 3166 * necessary, but proving that seems to be hard. 3167 */ 3168 cond_wake_up(&root->log_commit_wait[index1]); 3169 return ret; 3170 } 3171 3172 static void free_log_tree(struct btrfs_trans_handle *trans, 3173 struct btrfs_root *log) 3174 { 3175 int ret; 3176 struct walk_control wc = { 3177 .free = 1, 3178 .process_func = process_one_buffer 3179 }; 3180 3181 if (log->node) { 3182 ret = walk_log_tree(trans, log, &wc); 3183 if (ret) { 3184 /* 3185 * We weren't able to traverse the entire log tree, the 3186 * typical scenario is getting an -EIO when reading an 3187 * extent buffer of the tree, due to a previous writeback 3188 * failure of it. 3189 */ 3190 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3191 &log->fs_info->fs_state); 3192 3193 /* 3194 * Some extent buffers of the log tree may still be dirty 3195 * and not yet written back to storage, because we may 3196 * have updates to a log tree without syncing a log tree, 3197 * such as during rename and link operations. So flush 3198 * them out and wait for their writeback to complete, so 3199 * that we properly cleanup their state and pages. 3200 */ 3201 btrfs_write_marked_extents(log->fs_info, 3202 &log->dirty_log_pages, 3203 EXTENT_DIRTY | EXTENT_NEW); 3204 btrfs_wait_tree_log_extents(log, 3205 EXTENT_DIRTY | EXTENT_NEW); 3206 3207 if (trans) 3208 btrfs_abort_transaction(trans, ret); 3209 else 3210 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3211 } 3212 } 3213 3214 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3215 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3216 extent_io_tree_release(&log->log_csum_range); 3217 3218 btrfs_put_root(log); 3219 } 3220 3221 /* 3222 * free all the extents used by the tree log. This should be called 3223 * at commit time of the full transaction 3224 */ 3225 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3226 { 3227 if (root->log_root) { 3228 free_log_tree(trans, root->log_root); 3229 root->log_root = NULL; 3230 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3231 } 3232 return 0; 3233 } 3234 3235 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3236 struct btrfs_fs_info *fs_info) 3237 { 3238 if (fs_info->log_root_tree) { 3239 free_log_tree(trans, fs_info->log_root_tree); 3240 fs_info->log_root_tree = NULL; 3241 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3242 } 3243 return 0; 3244 } 3245 3246 /* 3247 * Check if an inode was logged in the current transaction. 
This correctly deals 3248 * with the case where the inode was logged but has a logged_trans of 0, which 3249 * happens if the inode is evicted and loaded again, as logged_trans is an in 3250 * memory only field (not persisted). 3251 * 3252 * Returns 1 if the inode was logged before in the current transaction, 0 if it was not, 3253 * and < 0 on error. 3254 */ 3255 static int inode_logged(const struct btrfs_trans_handle *trans, 3256 struct btrfs_inode *inode, 3257 struct btrfs_path *path_in) 3258 { 3259 struct btrfs_path *path = path_in; 3260 struct btrfs_key key; 3261 int ret; 3262
3263 if (inode->logged_trans == trans->transid) 3264 return 1; 3265
3266 /* 3267 * If logged_trans is not 0, then we know the inode was not logged 3268 * in this transaction, so we can return false right away. 3269 */ 3270 if (inode->logged_trans > 0) 3271 return 0; 3272
3273 /* 3274 * If no log tree was created for this root in this transaction, then 3275 * the inode can not have been logged in this transaction. In that case 3276 * set logged_trans to anything greater than 0 and less than the current 3277 * transaction's ID, to avoid the search below in a future call in case 3278 * a log tree gets created after this. 3279 */ 3280 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3281 inode->logged_trans = trans->transid - 1; 3282 return 0; 3283 } 3284
3285 /* 3286 * We have a log tree and the inode's logged_trans is 0. We can't tell 3287 * for sure if the inode was logged before in this transaction by looking 3288 * only at logged_trans. We could be pessimistic and assume it was, but 3289 * that can lead to unnecessarily logging an inode during rename and link 3290 * operations, and then further updating the log in followup rename and 3291 * link operations, especially if it's a directory, which adds latency 3292 * visible to applications doing a series of rename or link operations. 3293 * 3294 * A logged_trans of 0 here can mean several things: 3295 * 3296 * 1) The inode was never logged since the filesystem was mounted, and may 3297 * or may not have been evicted and loaded again; 3298 * 3299 * 2) The inode was logged in a previous transaction, then evicted and 3300 * then loaded again; 3301 * 3302 * 3) The inode was logged in the current transaction, then evicted and 3303 * then loaded again. 3304 * 3305 * For cases 1) and 2) we don't want to return true, but we need to detect 3306 * case 3) and return true. So we do a search in the log root for the inode 3307 * item. 3308 */ 3309 key.objectid = btrfs_ino(inode); 3310 key.type = BTRFS_INODE_ITEM_KEY; 3311 key.offset = 0; 3312
3313 if (!path) { 3314 path = btrfs_alloc_path(); 3315 if (!path) 3316 return -ENOMEM; 3317 } 3318
3319 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3320
3321 if (path_in) 3322 btrfs_release_path(path); 3323 else 3324 btrfs_free_path(path); 3325
3326 /* 3327 * Logging an inode always results in logging its inode item. So if we 3328 * did not find the item we know the inode was not logged for sure. 3329 */ 3330 if (ret < 0) { 3331 return ret; 3332 } else if (ret > 0) { 3333 /* 3334 * Set logged_trans to a value greater than 0 and less than the 3335 * current transaction to avoid doing the search in future calls. 3336 */ 3337 inode->logged_trans = trans->transid - 1; 3338 return 0; 3339 } 3340
3341 /* 3342 * The inode was previously logged and then evicted, set logged_trans to 3343 * the current transaction's ID, to avoid future tree searches as long as 3344 * the inode is not evicted again.
3345 */ 3346 inode->logged_trans = trans->transid; 3347
3348 /* 3349 * If it's a directory, then we must set last_dir_index_offset to the 3350 * maximum possible value, so that the next attempt to log the inode does 3351 * not skip checking if dir index keys found in modified subvolume tree 3352 * leaves have been logged before, otherwise it would result in attempts 3353 * to insert duplicate dir index keys in the log tree. This must be done 3354 * because last_dir_index_offset is an in-memory only field, not persisted 3355 * in the inode item or any other on-disk structure, so its value is lost 3356 * once the inode is evicted. 3357 */ 3358 if (S_ISDIR(inode->vfs_inode.i_mode)) 3359 inode->last_dir_index_offset = (u64)-1; 3360
3361 return 1; 3362 } 3363
3364 /* 3365 * Delete a directory entry from the log if it exists. 3366 * 3367 * Returns < 0 on error 3368 * 1 if the entry does not exist 3369 * 0 if the entry existed and was successfully deleted 3370 */ 3371 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3372 struct btrfs_root *log, 3373 struct btrfs_path *path, 3374 u64 dir_ino, 3375 const struct fscrypt_str *name, 3376 u64 index) 3377 { 3378 struct btrfs_dir_item *di; 3379
3380 /* 3381 * We only log dir index items of a directory, so we don't need to look 3382 * for dir item keys. 3383 */ 3384 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3385 index, name, -1); 3386 if (IS_ERR(di)) 3387 return PTR_ERR(di); 3388 else if (!di) 3389 return 1; 3390
3391 /* 3392 * We do not need to update the size field of the directory's 3393 * inode item because on log replay we update the field to reflect 3394 * all existing entries in the directory (see overwrite_item()). 3395 */ 3396 return btrfs_delete_one_dir_name(trans, log, path, di); 3397 } 3398
3399 /* 3400 * If both a file and directory are logged, and unlinks or renames are 3401 * mixed in, we have a few interesting corners: 3402 * 3403 * create file X in dir Y 3404 * link file X to X.link in dir Y 3405 * fsync file X 3406 * unlink file X but leave X.link 3407 * fsync dir Y 3408 * 3409 * After a crash we would expect only X.link to exist. But file X 3410 * didn't get fsync'd again so the log has back refs for X and X.link. 3411 * 3412 * We solve this by removing directory entries and inode backrefs from the 3413 * log when a file that was logged in the current transaction is 3414 * unlinked. Any later fsync will include the updated log entries, and 3415 * we'll be able to reconstruct the proper directory items from backrefs. 3416 * 3417 * This optimization allows us to avoid relogging the entire inode 3418 * or the entire directory.
3419 */ 3420 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3421 struct btrfs_root *root, 3422 const struct fscrypt_str *name, 3423 struct btrfs_inode *dir, u64 index) 3424 { 3425 struct btrfs_path *path; 3426 int ret; 3427 3428 ret = inode_logged(trans, dir, NULL); 3429 if (ret == 0) 3430 return; 3431 else if (ret < 0) { 3432 btrfs_set_log_full_commit(trans); 3433 return; 3434 } 3435 3436 ret = join_running_log_trans(root); 3437 if (ret) 3438 return; 3439 3440 mutex_lock(&dir->log_mutex); 3441 3442 path = btrfs_alloc_path(); 3443 if (!path) { 3444 ret = -ENOMEM; 3445 goto out_unlock; 3446 } 3447 3448 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3449 name, index); 3450 btrfs_free_path(path); 3451 out_unlock: 3452 mutex_unlock(&dir->log_mutex); 3453 if (ret < 0) 3454 btrfs_set_log_full_commit(trans); 3455 btrfs_end_log_trans(root); 3456 } 3457 3458 /* see comments for btrfs_del_dir_entries_in_log */ 3459 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3460 struct btrfs_root *root, 3461 const struct fscrypt_str *name, 3462 struct btrfs_inode *inode, u64 dirid) 3463 { 3464 struct btrfs_root *log; 3465 u64 index; 3466 int ret; 3467 3468 ret = inode_logged(trans, inode, NULL); 3469 if (ret == 0) 3470 return; 3471 else if (ret < 0) { 3472 btrfs_set_log_full_commit(trans); 3473 return; 3474 } 3475 3476 ret = join_running_log_trans(root); 3477 if (ret) 3478 return; 3479 log = root->log_root; 3480 mutex_lock(&inode->log_mutex); 3481 3482 ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), 3483 dirid, &index); 3484 mutex_unlock(&inode->log_mutex); 3485 if (ret < 0 && ret != -ENOENT) 3486 btrfs_set_log_full_commit(trans); 3487 btrfs_end_log_trans(root); 3488 } 3489 3490 /* 3491 * creates a range item in the log for 'dirid'. first_offset and 3492 * last_offset tell us which parts of the key space the log should 3493 * be considered authoritative for. 3494 */ 3495 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3496 struct btrfs_root *log, 3497 struct btrfs_path *path, 3498 u64 dirid, 3499 u64 first_offset, u64 last_offset) 3500 { 3501 int ret; 3502 struct btrfs_key key; 3503 struct btrfs_dir_log_item *item; 3504 3505 key.objectid = dirid; 3506 key.offset = first_offset; 3507 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3508 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3509 /* 3510 * -EEXIST is fine and can happen sporadically when we are logging a 3511 * directory and have concurrent insertions in the subvolume's tree for 3512 * items from other inodes and that result in pushing off some dir items 3513 * from one leaf to another in order to accommodate for the new items. 3514 * This results in logging the same dir index range key. 3515 */ 3516 if (ret && ret != -EEXIST) 3517 return ret; 3518 3519 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3520 struct btrfs_dir_log_item); 3521 if (ret == -EEXIST) { 3522 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); 3523 3524 /* 3525 * btrfs_del_dir_entries_in_log() might have been called during 3526 * an unlink between the initial insertion of this key and the 3527 * current update, or we might be logging a single entry deletion 3528 * during a rename, so set the new last_offset to the max value. 
3529 */ 3530 last_offset = max(last_offset, curr_end); 3531 } 3532 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3533 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 3534 btrfs_release_path(path); 3535 return 0; 3536 } 3537 3538 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3539 struct btrfs_inode *inode, 3540 struct extent_buffer *src, 3541 struct btrfs_path *dst_path, 3542 int start_slot, 3543 int count) 3544 { 3545 struct btrfs_root *log = inode->root->log_root; 3546 char *ins_data = NULL; 3547 struct btrfs_item_batch batch; 3548 struct extent_buffer *dst; 3549 unsigned long src_offset; 3550 unsigned long dst_offset; 3551 u64 last_index; 3552 struct btrfs_key key; 3553 u32 item_size; 3554 int ret; 3555 int i; 3556 3557 ASSERT(count > 0); 3558 batch.nr = count; 3559 3560 if (count == 1) { 3561 btrfs_item_key_to_cpu(src, &key, start_slot); 3562 item_size = btrfs_item_size(src, start_slot); 3563 batch.keys = &key; 3564 batch.data_sizes = &item_size; 3565 batch.total_data_size = item_size; 3566 } else { 3567 struct btrfs_key *ins_keys; 3568 u32 *ins_sizes; 3569 3570 ins_data = kmalloc(count * sizeof(u32) + 3571 count * sizeof(struct btrfs_key), GFP_NOFS); 3572 if (!ins_data) 3573 return -ENOMEM; 3574 3575 ins_sizes = (u32 *)ins_data; 3576 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3577 batch.keys = ins_keys; 3578 batch.data_sizes = ins_sizes; 3579 batch.total_data_size = 0; 3580 3581 for (i = 0; i < count; i++) { 3582 const int slot = start_slot + i; 3583 3584 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3585 ins_sizes[i] = btrfs_item_size(src, slot); 3586 batch.total_data_size += ins_sizes[i]; 3587 } 3588 } 3589 3590 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3591 if (ret) 3592 goto out; 3593 3594 dst = dst_path->nodes[0]; 3595 /* 3596 * Copy all the items in bulk, in a single copy operation. Item data is 3597 * organized such that it's placed at the end of a leaf and from right 3598 * to left. For example, the data for the second item ends at an offset 3599 * that matches the offset where the data for the first item starts, the 3600 * data for the third item ends at an offset that matches the offset 3601 * where the data of the second items starts, and so on. 3602 * Therefore our source and destination start offsets for copy match the 3603 * offsets of the last items (highest slots). 3604 */ 3605 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3606 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3607 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3608 btrfs_release_path(dst_path); 3609 3610 last_index = batch.keys[count - 1].offset; 3611 ASSERT(last_index > inode->last_dir_index_offset); 3612 3613 /* 3614 * If for some unexpected reason the last item's index is not greater 3615 * than the last index we logged, warn and force a transaction commit. 
3616 */ 3617 if (WARN_ON(last_index <= inode->last_dir_index_offset)) 3618 ret = BTRFS_LOG_FORCE_COMMIT; 3619 else 3620 inode->last_dir_index_offset = last_index; 3621 3622 if (btrfs_get_first_dir_index_to_log(inode) == 0) 3623 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); 3624 out: 3625 kfree(ins_data); 3626 3627 return ret; 3628 } 3629 3630 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3631 struct btrfs_inode *inode, 3632 struct btrfs_path *path, 3633 struct btrfs_path *dst_path, 3634 struct btrfs_log_ctx *ctx, 3635 u64 *last_old_dentry_offset) 3636 { 3637 struct btrfs_root *log = inode->root->log_root; 3638 struct extent_buffer *src; 3639 const int nritems = btrfs_header_nritems(path->nodes[0]); 3640 const u64 ino = btrfs_ino(inode); 3641 bool last_found = false; 3642 int batch_start = 0; 3643 int batch_size = 0; 3644 int i; 3645 3646 /* 3647 * We need to clone the leaf, release the read lock on it, and use the 3648 * clone before modifying the log tree. See the comment at copy_items() 3649 * about why we need to do this. 3650 */ 3651 src = btrfs_clone_extent_buffer(path->nodes[0]); 3652 if (!src) 3653 return -ENOMEM; 3654 3655 i = path->slots[0]; 3656 btrfs_release_path(path); 3657 path->nodes[0] = src; 3658 path->slots[0] = i; 3659 3660 for (; i < nritems; i++) { 3661 struct btrfs_dir_item *di; 3662 struct btrfs_key key; 3663 int ret; 3664 3665 btrfs_item_key_to_cpu(src, &key, i); 3666 3667 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3668 last_found = true; 3669 break; 3670 } 3671 3672 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3673 3674 /* 3675 * Skip ranges of items that consist only of dir item keys created 3676 * in past transactions. However if we find a gap, we must log a 3677 * dir index range item for that gap, so that index keys in that 3678 * gap are deleted during log replay. 3679 */ 3680 if (btrfs_dir_transid(src, di) < trans->transid) { 3681 if (key.offset > *last_old_dentry_offset + 1) { 3682 ret = insert_dir_log_key(trans, log, dst_path, 3683 ino, *last_old_dentry_offset + 1, 3684 key.offset - 1); 3685 if (ret < 0) 3686 return ret; 3687 } 3688 3689 *last_old_dentry_offset = key.offset; 3690 continue; 3691 } 3692 3693 /* If we logged this dir index item before, we can skip it. */ 3694 if (key.offset <= inode->last_dir_index_offset) 3695 continue; 3696 3697 /* 3698 * We must make sure that when we log a directory entry, the 3699 * corresponding inode, after log replay, has a matching link 3700 * count. For example: 3701 * 3702 * touch foo 3703 * mkdir mydir 3704 * sync 3705 * ln foo mydir/bar 3706 * xfs_io -c "fsync" mydir 3707 * <crash> 3708 * <mount fs and log replay> 3709 * 3710 * Would result in a fsync log that when replayed, our file inode 3711 * would have a link count of 1, but we get two directory entries 3712 * pointing to the same inode. After removing one of the names, 3713 * it would not be possible to remove the other name, which 3714 * resulted always in stale file handle errors, and would not be 3715 * possible to rmdir the parent directory, since its i_size could 3716 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3717 * resulting in -ENOTEMPTY errors. 
3718 */ 3719 if (!ctx->log_new_dentries) { 3720 struct btrfs_key di_key; 3721 3722 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3723 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3724 ctx->log_new_dentries = true; 3725 } 3726 3727 if (batch_size == 0) 3728 batch_start = i; 3729 batch_size++; 3730 } 3731 3732 if (batch_size > 0) { 3733 int ret; 3734 3735 ret = flush_dir_items_batch(trans, inode, src, dst_path, 3736 batch_start, batch_size); 3737 if (ret < 0) 3738 return ret; 3739 } 3740 3741 return last_found ? 1 : 0; 3742 } 3743 3744 /* 3745 * log all the items included in the current transaction for a given 3746 * directory. This also creates the range items in the log tree required 3747 * to replay anything deleted before the fsync 3748 */ 3749 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3750 struct btrfs_inode *inode, 3751 struct btrfs_path *path, 3752 struct btrfs_path *dst_path, 3753 struct btrfs_log_ctx *ctx, 3754 u64 min_offset, u64 *last_offset_ret) 3755 { 3756 struct btrfs_key min_key; 3757 struct btrfs_root *root = inode->root; 3758 struct btrfs_root *log = root->log_root; 3759 int ret; 3760 u64 last_old_dentry_offset = min_offset - 1; 3761 u64 last_offset = (u64)-1; 3762 u64 ino = btrfs_ino(inode); 3763 3764 min_key.objectid = ino; 3765 min_key.type = BTRFS_DIR_INDEX_KEY; 3766 min_key.offset = min_offset; 3767 3768 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3769 3770 /* 3771 * we didn't find anything from this transaction, see if there 3772 * is anything at all 3773 */ 3774 if (ret != 0 || min_key.objectid != ino || 3775 min_key.type != BTRFS_DIR_INDEX_KEY) { 3776 min_key.objectid = ino; 3777 min_key.type = BTRFS_DIR_INDEX_KEY; 3778 min_key.offset = (u64)-1; 3779 btrfs_release_path(path); 3780 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3781 if (ret < 0) { 3782 btrfs_release_path(path); 3783 return ret; 3784 } 3785 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3786 3787 /* if ret == 0 there are items for this type, 3788 * create a range to tell us the last key of this type. 3789 * otherwise, there are no items in this directory after 3790 * *min_offset, and we create a range to indicate that. 3791 */ 3792 if (ret == 0) { 3793 struct btrfs_key tmp; 3794 3795 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3796 path->slots[0]); 3797 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3798 last_old_dentry_offset = tmp.offset; 3799 } else if (ret > 0) { 3800 ret = 0; 3801 } 3802 3803 goto done; 3804 } 3805 3806 /* go backward to find any previous key */ 3807 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3808 if (ret == 0) { 3809 struct btrfs_key tmp; 3810 3811 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3812 /* 3813 * The dir index key before the first one we found that needs to 3814 * be logged might be in a previous leaf, and there might be a 3815 * gap between these keys, meaning that we had deletions that 3816 * happened. So the key range item we log (key type 3817 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 3818 * previous key's offset plus 1, so that those deletes are replayed. 3819 */ 3820 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3821 last_old_dentry_offset = tmp.offset; 3822 } else if (ret < 0) { 3823 goto done; 3824 } 3825 3826 btrfs_release_path(path); 3827 3828 /* 3829 * Find the first key from this transaction again or the one we were at 3830 * in the loop below in case we had to reschedule. 
We may be logging the 3831 * directory without holding its VFS lock, which happens when logging new 3832 * dentries (through log_new_dir_dentries()) or in some cases when we 3833 * need to log the parent directory of an inode. This means a dir index 3834 * key might be deleted from the inode's root, and therefore we may not 3835 * find it anymore. If we can't find it, just move to the next key. We 3836 * can not bail out and ignore, because if we do that we will simply 3837 * not log dir index keys that come after the one that was just deleted 3838 * and we can end up logging a dir index range that ends at (u64)-1 3839 * (@last_offset is initialized to that), resulting in removing dir 3840 * entries we should not remove at log replay time. 3841 */ 3842 search: 3843 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3844 if (ret > 0) { 3845 ret = btrfs_next_item(root, path); 3846 if (ret > 0) { 3847 /* There are no more keys in the inode's root. */ 3848 ret = 0; 3849 goto done; 3850 } 3851 } 3852 if (ret < 0) 3853 goto done; 3854
3855 /* 3856 * we have a block from this transaction, log every item in it 3857 * from our directory 3858 */ 3859 while (1) { 3860 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 3861 &last_old_dentry_offset); 3862 if (ret != 0) { 3863 if (ret > 0) 3864 ret = 0; 3865 goto done; 3866 } 3867 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 3868
3869 /* 3870 * look ahead to the next item and see if it is also 3871 * from this directory and from this transaction 3872 */ 3873 ret = btrfs_next_leaf(root, path); 3874 if (ret) { 3875 if (ret == 1) { 3876 last_offset = (u64)-1; 3877 ret = 0; 3878 } 3879 goto done; 3880 } 3881 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 3882 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 3883 last_offset = (u64)-1; 3884 goto done; 3885 } 3886 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 3887 /* 3888 * The next leaf was not changed in the current transaction 3889 * and has at least one dir index key. 3890 * We check for the next key because there might have been 3891 * one or more deletions between the last key we logged and 3892 * that next key. So the key range item we log (key type 3893 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 3894 * offset minus 1, so that those deletes are replayed. 3895 */ 3896 last_offset = min_key.offset - 1; 3897 goto done; 3898 } 3899 if (need_resched()) { 3900 btrfs_release_path(path); 3901 cond_resched(); 3902 goto search; 3903 } 3904 } 3905 done: 3906 btrfs_release_path(path); 3907 btrfs_release_path(dst_path); 3908
3909 if (ret == 0) { 3910 *last_offset_ret = last_offset; 3911 /* 3912 * If the leaf was changed in the current transaction but 3913 * all its dir items are from a past transaction, the last item 3914 * in the leaf is a dir item and there's no gap between that last 3915 * dir item and the first one on the next leaf (which did not 3916 * change in the current transaction), then we don't need to log 3917 * a range, as last_old_dentry_offset is equal to last_offset. 3918 */ 3919 ASSERT(last_old_dentry_offset <= last_offset); 3920 if (last_old_dentry_offset < last_offset) 3921 ret = insert_dir_log_key(trans, log, path, ino, 3922 last_old_dentry_offset + 1, 3923 last_offset); 3924 } 3925
3926 return ret; 3927 } 3928
3929 /* 3930 * If the inode was logged before and it was evicted, then its 3931 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index 3932 * key offset.
If that's the case, search for it and update the inode. This 3933 * is to avoid lookups in the log tree every time we try to insert a dir index 3934 * key from a leaf changed in the current transaction, and to allow us to always 3935 * do batch insertions of dir index keys. 3936 */ 3937 static int update_last_dir_index_offset(struct btrfs_inode *inode, 3938 struct btrfs_path *path, 3939 const struct btrfs_log_ctx *ctx) 3940 { 3941 const u64 ino = btrfs_ino(inode); 3942 struct btrfs_key key; 3943 int ret; 3944 3945 lockdep_assert_held(&inode->log_mutex); 3946 3947 if (inode->last_dir_index_offset != (u64)-1) 3948 return 0; 3949 3950 if (!ctx->logged_before) { 3951 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 3952 return 0; 3953 } 3954 3955 key.objectid = ino; 3956 key.type = BTRFS_DIR_INDEX_KEY; 3957 key.offset = (u64)-1; 3958 3959 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3960 /* 3961 * An error happened or we actually have an index key with an offset 3962 * value of (u64)-1. Bail out, we're done. 3963 */ 3964 if (ret <= 0) 3965 goto out; 3966 3967 ret = 0; 3968 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 3969 3970 /* 3971 * No dir index items, bail out and leave last_dir_index_offset with 3972 * the value right before the first valid index value. 3973 */ 3974 if (path->slots[0] == 0) 3975 goto out; 3976 3977 /* 3978 * btrfs_search_slot() left us at one slot beyond the slot with the last 3979 * index key, or beyond the last key of the directory that is not an 3980 * index key. If we have an index key before, set last_dir_index_offset 3981 * to its offset value, otherwise leave it with a value right before the 3982 * first valid index value, as it means we have an empty directory. 3983 */ 3984 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 3985 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY) 3986 inode->last_dir_index_offset = key.offset; 3987 3988 out: 3989 btrfs_release_path(path); 3990 3991 return ret; 3992 } 3993 3994 /* 3995 * logging directories is very similar to logging inodes, We find all the items 3996 * from the current transaction and write them to the log. 3997 * 3998 * The recovery code scans the directory in the subvolume, and if it finds a 3999 * key in the range logged that is not present in the log tree, then it means 4000 * that dir entry was unlinked during the transaction. 4001 * 4002 * In order for that scan to work, we must include one key smaller than 4003 * the smallest logged by this transaction and one key larger than the largest 4004 * key logged by this transaction. 4005 */ 4006 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4007 struct btrfs_inode *inode, 4008 struct btrfs_path *path, 4009 struct btrfs_path *dst_path, 4010 struct btrfs_log_ctx *ctx) 4011 { 4012 u64 min_key; 4013 u64 max_key; 4014 int ret; 4015 4016 ret = update_last_dir_index_offset(inode, path, ctx); 4017 if (ret) 4018 return ret; 4019 4020 min_key = BTRFS_DIR_START_INDEX; 4021 max_key = 0; 4022 4023 while (1) { 4024 ret = log_dir_items(trans, inode, path, dst_path, 4025 ctx, min_key, &max_key); 4026 if (ret) 4027 return ret; 4028 if (max_key == (u64)-1) 4029 break; 4030 min_key = max_key + 1; 4031 } 4032 4033 return 0; 4034 } 4035 4036 /* 4037 * a helper function to drop items from the log before we relog an 4038 * inode. max_key_type indicates the highest item type to remove. 
4039 * This cannot be run for file data extents because it does not 4040 * free the extents they point to. 4041 */ 4042 static int drop_inode_items(struct btrfs_trans_handle *trans, 4043 struct btrfs_root *log, 4044 struct btrfs_path *path, 4045 struct btrfs_inode *inode, 4046 int max_key_type) 4047 { 4048 int ret; 4049 struct btrfs_key key; 4050 struct btrfs_key found_key; 4051 int start_slot; 4052 4053 key.objectid = btrfs_ino(inode); 4054 key.type = max_key_type; 4055 key.offset = (u64)-1; 4056 4057 while (1) { 4058 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4059 if (ret < 0) { 4060 break; 4061 } else if (ret > 0) { 4062 if (path->slots[0] == 0) 4063 break; 4064 path->slots[0]--; 4065 } 4066 4067 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4068 path->slots[0]); 4069 4070 if (found_key.objectid != key.objectid) 4071 break; 4072 4073 found_key.offset = 0; 4074 found_key.type = 0; 4075 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); 4076 if (ret < 0) 4077 break; 4078 4079 ret = btrfs_del_items(trans, log, path, start_slot, 4080 path->slots[0] - start_slot + 1); 4081 /* 4082 * If start slot isn't 0 then we don't need to re-search, we've 4083 * found the last guy with the objectid in this tree. 4084 */ 4085 if (ret || start_slot != 0) 4086 break; 4087 btrfs_release_path(path); 4088 } 4089 btrfs_release_path(path); 4090 if (ret > 0) 4091 ret = 0; 4092 return ret; 4093 } 4094 4095 static int truncate_inode_items(struct btrfs_trans_handle *trans, 4096 struct btrfs_root *log_root, 4097 struct btrfs_inode *inode, 4098 u64 new_size, u32 min_type) 4099 { 4100 struct btrfs_truncate_control control = { 4101 .new_size = new_size, 4102 .ino = btrfs_ino(inode), 4103 .min_type = min_type, 4104 .skip_ref_updates = true, 4105 }; 4106 4107 return btrfs_truncate_inode_items(trans, log_root, &control); 4108 } 4109 4110 static void fill_inode_item(struct btrfs_trans_handle *trans, 4111 struct extent_buffer *leaf, 4112 struct btrfs_inode_item *item, 4113 struct inode *inode, int log_inode_only, 4114 u64 logged_isize) 4115 { 4116 struct btrfs_map_token token; 4117 u64 flags; 4118 4119 btrfs_init_map_token(&token, leaf); 4120 4121 if (log_inode_only) { 4122 /* set the generation to zero so the recover code 4123 * can tell the difference between an logging 4124 * just to say 'this inode exists' and a logging 4125 * to say 'update this inode with these values' 4126 */ 4127 btrfs_set_token_inode_generation(&token, item, 0); 4128 btrfs_set_token_inode_size(&token, item, logged_isize); 4129 } else { 4130 btrfs_set_token_inode_generation(&token, item, 4131 BTRFS_I(inode)->generation); 4132 btrfs_set_token_inode_size(&token, item, inode->i_size); 4133 } 4134 4135 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 4136 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 4137 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 4138 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 4139 4140 btrfs_set_token_timespec_sec(&token, &item->atime, 4141 inode->i_atime.tv_sec); 4142 btrfs_set_token_timespec_nsec(&token, &item->atime, 4143 inode->i_atime.tv_nsec); 4144 4145 btrfs_set_token_timespec_sec(&token, &item->mtime, 4146 inode->i_mtime.tv_sec); 4147 btrfs_set_token_timespec_nsec(&token, &item->mtime, 4148 inode->i_mtime.tv_nsec); 4149 4150 btrfs_set_token_timespec_sec(&token, &item->ctime, 4151 inode_get_ctime(inode).tv_sec); 4152 btrfs_set_token_timespec_nsec(&token, &item->ctime, 4153 inode_get_ctime(inode).tv_nsec); 4154 4155 /* 4156 * We do not need 
to set the nbytes field, in fact during a fast fsync 4157 * its value may not even be correct, since a fast fsync does not wait 4158 * for ordered extent completion, which is where we update nbytes, it 4159 * only waits for writeback to complete. During log replay as we find 4160 * file extent items and replay them, we adjust the nbytes field of the 4161 * inode item in subvolume tree as needed (see overwrite_item()). 4162 */ 4163 4164 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4165 btrfs_set_token_inode_transid(&token, item, trans->transid); 4166 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4167 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4168 BTRFS_I(inode)->ro_flags); 4169 btrfs_set_token_inode_flags(&token, item, flags); 4170 btrfs_set_token_inode_block_group(&token, item, 0); 4171 } 4172 4173 static int log_inode_item(struct btrfs_trans_handle *trans, 4174 struct btrfs_root *log, struct btrfs_path *path, 4175 struct btrfs_inode *inode, bool inode_item_dropped) 4176 { 4177 struct btrfs_inode_item *inode_item; 4178 int ret; 4179 4180 /* 4181 * If we are doing a fast fsync and the inode was logged before in the 4182 * current transaction, then we know the inode was previously logged and 4183 * it exists in the log tree. For performance reasons, in this case use 4184 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4185 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4186 * contention in case there are concurrent fsyncs for other inodes of the 4187 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4188 * already exists can also result in unnecessarily splitting a leaf. 4189 */ 4190 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4191 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4192 ASSERT(ret <= 0); 4193 if (ret > 0) 4194 ret = -ENOENT; 4195 } else { 4196 /* 4197 * This means it is the first fsync in the current transaction, 4198 * so the inode item is not in the log and we need to insert it. 4199 * We can never get -EEXIST because we are only called for a fast 4200 * fsync and in case an inode eviction happens after the inode was 4201 * logged before in the current transaction, when we load again 4202 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4203 * flags and set ->logged_trans to 0. 4204 */ 4205 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4206 sizeof(*inode_item)); 4207 ASSERT(ret != -EEXIST); 4208 } 4209 if (ret) 4210 return ret; 4211 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4212 struct btrfs_inode_item); 4213 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4214 0, 0); 4215 btrfs_release_path(path); 4216 return 0; 4217 } 4218 4219 static int log_csums(struct btrfs_trans_handle *trans, 4220 struct btrfs_inode *inode, 4221 struct btrfs_root *log_root, 4222 struct btrfs_ordered_sum *sums) 4223 { 4224 const u64 lock_end = sums->logical + sums->len - 1; 4225 struct extent_state *cached_state = NULL; 4226 int ret; 4227 4228 /* 4229 * If this inode was not used for reflink operations in the current 4230 * transaction with new extents, then do the fast path, no need to 4231 * worry about logging checksum items with overlapping ranges. 4232 */ 4233 if (inode->last_reflink_trans < trans->transid) 4234 return btrfs_csum_file_blocks(trans, log_root, sums); 4235 4236 /* 4237 * Serialize logging for checksums. 
This is to avoid racing with the 4238 * same checksum being logged by another task that is logging another 4239 * file which happens to refer to the same extent as well. Such races 4240 * can leave checksum items in the log with overlapping ranges. 4241 */ 4242 ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, 4243 &cached_state); 4244 if (ret) 4245 return ret; 4246 /* 4247 * Due to extent cloning, we might have logged a csum item that covers a 4248 * subrange of a cloned extent, and later we can end up logging a csum 4249 * item for a larger subrange of the same extent or the entire range. 4250 * This would leave csum items in the log tree that cover the same range 4251 * and break the searches for checksums in the log tree, resulting in 4252 * some checksums missing in the fs/subvolume tree. So just delete (or 4253 * trim and adjust) any existing csum items in the log for this range. 4254 */ 4255 ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); 4256 if (!ret) 4257 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4258 4259 unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, 4260 &cached_state); 4261 4262 return ret; 4263 } 4264 4265 static noinline int copy_items(struct btrfs_trans_handle *trans, 4266 struct btrfs_inode *inode, 4267 struct btrfs_path *dst_path, 4268 struct btrfs_path *src_path, 4269 int start_slot, int nr, int inode_only, 4270 u64 logged_isize) 4271 { 4272 struct btrfs_root *log = inode->root->log_root; 4273 struct btrfs_file_extent_item *extent; 4274 struct extent_buffer *src; 4275 int ret = 0; 4276 struct btrfs_key *ins_keys; 4277 u32 *ins_sizes; 4278 struct btrfs_item_batch batch; 4279 char *ins_data; 4280 int i; 4281 int dst_index; 4282 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4283 const u64 i_size = i_size_read(&inode->vfs_inode); 4284 4285 /* 4286 * To keep lockdep happy and avoid deadlocks, clone the source leaf and 4287 * use the clone. This is because otherwise we would be changing the log 4288 * tree, to insert items from the subvolume tree or insert csum items, 4289 * while holding a read lock on a leaf from the subvolume tree, which 4290 * creates a nasty lock dependency when COWing log tree nodes/leaves: 4291 * 4292 * 1) Modifying the log tree triggers an extent buffer allocation while 4293 * holding a write lock on a parent extent buffer from the log tree. 4294 * Allocating the pages for an extent buffer, or the extent buffer 4295 * struct, can trigger inode eviction and finally the inode eviction 4296 * will trigger a release/remove of a delayed node, which requires 4297 * taking the delayed node's mutex; 4298 * 4299 * 2) Allocating a metadata extent for a log tree can trigger the async 4300 * reclaim thread and make us wait for it to release enough space and 4301 * unblock our reservation ticket. The reclaim thread can start 4302 * flushing delayed items, and that in turn results in the need to 4303 * lock delayed node mutexes and in the need to write lock extent 4304 * buffers of a subvolume tree - all this while holding a write lock 4305 * on the parent extent buffer in the log tree. 4306 * 4307 * So one task in scenario 1) running in parallel with another task in 4308 * scenario 2) could lead to a deadlock, one wanting to lock a delayed 4309 * node mutex while having a read lock on a leaf from the subvolume, 4310 * while the other is holding the delayed node's mutex and wants to 4311 * write lock the same subvolume leaf for flushing delayed items. 
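 *
 * Cloning the source leaf and releasing the original path below avoids
 * that: the clone is a private copy no other task can lock, so while we
 * modify the log tree we no longer hold any lock on a subvolume tree leaf.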
4312 */ 4313 src = btrfs_clone_extent_buffer(src_path->nodes[0]); 4314 if (!src) 4315 return -ENOMEM; 4316 4317 i = src_path->slots[0]; 4318 btrfs_release_path(src_path); 4319 src_path->nodes[0] = src; 4320 src_path->slots[0] = i; 4321 4322 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4323 nr * sizeof(u32), GFP_NOFS); 4324 if (!ins_data) 4325 return -ENOMEM; 4326 4327 ins_sizes = (u32 *)ins_data; 4328 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4329 batch.keys = ins_keys; 4330 batch.data_sizes = ins_sizes; 4331 batch.total_data_size = 0; 4332 batch.nr = 0; 4333 4334 dst_index = 0; 4335 for (i = 0; i < nr; i++) { 4336 const int src_slot = start_slot + i; 4337 struct btrfs_root *csum_root; 4338 struct btrfs_ordered_sum *sums; 4339 struct btrfs_ordered_sum *sums_next; 4340 LIST_HEAD(ordered_sums); 4341 u64 disk_bytenr; 4342 u64 disk_num_bytes; 4343 u64 extent_offset; 4344 u64 extent_num_bytes; 4345 bool is_old_extent; 4346 4347 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4348 4349 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4350 goto add_to_batch; 4351 4352 extent = btrfs_item_ptr(src, src_slot, 4353 struct btrfs_file_extent_item); 4354 4355 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4356 trans->transid); 4357 4358 /* 4359 * Don't copy extents from past generations. That would make us 4360 * log a lot more metadata for common cases like doing only a 4361 * few random writes into a file and then fsync it for the first 4362 * time or after the full sync flag is set on the inode. We can 4363 * get leaves full of extent items, most of which are from past 4364 * generations, so we can skip them - as long as the inode has 4365 * not been the target of a reflink operation in this transaction, 4366 * as in that case it might have had file extent items with old 4367 * generations copied into it. We also must always log prealloc 4368 * extents that start at or beyond eof, otherwise we would lose 4369 * them on log replay. 4370 */ 4371 if (is_old_extent && 4372 ins_keys[dst_index].offset < i_size && 4373 inode->last_reflink_trans < trans->transid) 4374 continue; 4375 4376 if (skip_csum) 4377 goto add_to_batch; 4378 4379 /* Only regular extents have checksums. */ 4380 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4381 goto add_to_batch; 4382 4383 /* 4384 * If it's an extent created in a past transaction, then its 4385 * checksums are already accessible from the committed csum tree, 4386 * no need to log them. 4387 */ 4388 if (is_old_extent) 4389 goto add_to_batch; 4390 4391 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4392 /* If it's an explicit hole, there are no checksums. 
*/ 4393 if (disk_bytenr == 0) 4394 goto add_to_batch; 4395 4396 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4397 4398 if (btrfs_file_extent_compression(src, extent)) { 4399 extent_offset = 0; 4400 extent_num_bytes = disk_num_bytes; 4401 } else { 4402 extent_offset = btrfs_file_extent_offset(src, extent); 4403 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4404 } 4405 4406 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4407 disk_bytenr += extent_offset; 4408 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, 4409 disk_bytenr + extent_num_bytes - 1, 4410 &ordered_sums, 0, false); 4411 if (ret) 4412 goto out; 4413 4414 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4415 if (!ret) 4416 ret = log_csums(trans, inode, log, sums); 4417 list_del(&sums->list); 4418 kfree(sums); 4419 } 4420 if (ret) 4421 goto out; 4422 4423 add_to_batch: 4424 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4425 batch.total_data_size += ins_sizes[dst_index]; 4426 batch.nr++; 4427 dst_index++; 4428 } 4429 4430 /* 4431 * We have a leaf full of old extent items that don't need to be logged, 4432 * so we don't need to do anything. 4433 */ 4434 if (batch.nr == 0) 4435 goto out; 4436 4437 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4438 if (ret) 4439 goto out; 4440 4441 dst_index = 0; 4442 for (i = 0; i < nr; i++) { 4443 const int src_slot = start_slot + i; 4444 const int dst_slot = dst_path->slots[0] + dst_index; 4445 struct btrfs_key key; 4446 unsigned long src_offset; 4447 unsigned long dst_offset; 4448 4449 /* 4450 * We're done, all the remaining items in the source leaf 4451 * correspond to old file extent items. 4452 */ 4453 if (dst_index >= batch.nr) 4454 break; 4455 4456 btrfs_item_key_to_cpu(src, &key, src_slot); 4457 4458 if (key.type != BTRFS_EXTENT_DATA_KEY) 4459 goto copy_item; 4460 4461 extent = btrfs_item_ptr(src, src_slot, 4462 struct btrfs_file_extent_item); 4463 4464 /* See the comment in the previous loop, same logic. 
*/ 4465 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4466 key.offset < i_size && 4467 inode->last_reflink_trans < trans->transid) 4468 continue; 4469 4470 copy_item: 4471 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4472 src_offset = btrfs_item_ptr_offset(src, src_slot); 4473 4474 if (key.type == BTRFS_INODE_ITEM_KEY) { 4475 struct btrfs_inode_item *inode_item; 4476 4477 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4478 struct btrfs_inode_item); 4479 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4480 &inode->vfs_inode, 4481 inode_only == LOG_INODE_EXISTS, 4482 logged_isize); 4483 } else { 4484 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4485 src_offset, ins_sizes[dst_index]); 4486 } 4487 4488 dst_index++; 4489 } 4490 4491 btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); 4492 btrfs_release_path(dst_path); 4493 out: 4494 kfree(ins_data); 4495 4496 return ret; 4497 } 4498 4499 static int extent_cmp(void *priv, const struct list_head *a, 4500 const struct list_head *b) 4501 { 4502 const struct extent_map *em1, *em2; 4503 4504 em1 = list_entry(a, struct extent_map, list); 4505 em2 = list_entry(b, struct extent_map, list); 4506 4507 if (em1->start < em2->start) 4508 return -1; 4509 else if (em1->start > em2->start) 4510 return 1; 4511 return 0; 4512 } 4513 4514 static int log_extent_csums(struct btrfs_trans_handle *trans, 4515 struct btrfs_inode *inode, 4516 struct btrfs_root *log_root, 4517 const struct extent_map *em, 4518 struct btrfs_log_ctx *ctx) 4519 { 4520 struct btrfs_ordered_extent *ordered; 4521 struct btrfs_root *csum_root; 4522 u64 csum_offset; 4523 u64 csum_len; 4524 u64 mod_start = em->mod_start; 4525 u64 mod_len = em->mod_len; 4526 LIST_HEAD(ordered_sums); 4527 int ret = 0; 4528 4529 if (inode->flags & BTRFS_INODE_NODATASUM || 4530 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4531 em->block_start == EXTENT_MAP_HOLE) 4532 return 0; 4533 4534 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4535 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4536 const u64 mod_end = mod_start + mod_len; 4537 struct btrfs_ordered_sum *sums; 4538 4539 if (mod_len == 0) 4540 break; 4541 4542 if (ordered_end <= mod_start) 4543 continue; 4544 if (mod_end <= ordered->file_offset) 4545 break; 4546 4547 /* 4548 * We are going to copy all the csums on this ordered extent, so 4549 * go ahead and adjust mod_start and mod_len in case this ordered 4550 * extent has already been logged. 4551 */ 4552 if (ordered->file_offset > mod_start) { 4553 if (ordered_end >= mod_end) 4554 mod_len = ordered->file_offset - mod_start; 4555 /* 4556 * If we have this case 4557 * 4558 * |--------- logged extent ---------| 4559 * |----- ordered extent ----| 4560 * 4561 * Just don't mess with mod_start and mod_len, we'll 4562 * just end up logging more csums than we need and it 4563 * will be ok. 4564 */ 4565 } else { 4566 if (ordered_end < mod_end) { 4567 mod_len = mod_end - ordered_end; 4568 mod_start = ordered_end; 4569 } else { 4570 mod_len = 0; 4571 } 4572 } 4573 4574 /* 4575 * To keep us from looping for the above case of an ordered 4576 * extent that falls inside of the logged extent. 4577 */ 4578 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4579 continue; 4580 4581 list_for_each_entry(sums, &ordered->list, list) { 4582 ret = log_csums(trans, inode, log_root, sums); 4583 if (ret) 4584 return ret; 4585 } 4586 } 4587 4588 /* We're done, found all csums in the ordered extents. 
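 * (If mod_len is still not zero at this point, part of the logged range
 * was not covered by any ordered extent, so fall through and look up the
 * remaining csums in the csum tree.)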
*/ 4589 if (mod_len == 0) 4590 return 0; 4591 4592 /* If we're compressed we have to save the entire range of csums. */ 4593 if (em->compress_type) { 4594 csum_offset = 0; 4595 csum_len = max(em->block_len, em->orig_block_len); 4596 } else { 4597 csum_offset = mod_start - em->start; 4598 csum_len = mod_len; 4599 } 4600 4601 /* block start is already adjusted for the file extent offset. */ 4602 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4603 ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, 4604 em->block_start + csum_offset + 4605 csum_len - 1, &ordered_sums, 0, false); 4606 if (ret) 4607 return ret; 4608 4609 while (!list_empty(&ordered_sums)) { 4610 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4611 struct btrfs_ordered_sum, 4612 list); 4613 if (!ret) 4614 ret = log_csums(trans, inode, log_root, sums); 4615 list_del(&sums->list); 4616 kfree(sums); 4617 } 4618 4619 return ret; 4620 } 4621 4622 static int log_one_extent(struct btrfs_trans_handle *trans, 4623 struct btrfs_inode *inode, 4624 const struct extent_map *em, 4625 struct btrfs_path *path, 4626 struct btrfs_log_ctx *ctx) 4627 { 4628 struct btrfs_drop_extents_args drop_args = { 0 }; 4629 struct btrfs_root *log = inode->root->log_root; 4630 struct btrfs_file_extent_item fi = { 0 }; 4631 struct extent_buffer *leaf; 4632 struct btrfs_key key; 4633 u64 extent_offset = em->start - em->orig_start; 4634 u64 block_len; 4635 int ret; 4636 4637 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4638 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4639 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4640 else 4641 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4642 4643 block_len = max(em->block_len, em->orig_block_len); 4644 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4645 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4646 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4647 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4648 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4649 extent_offset); 4650 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4651 } 4652 4653 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4654 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4655 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4656 btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4657 4658 ret = log_extent_csums(trans, inode, log, em, ctx); 4659 if (ret) 4660 return ret; 4661 4662 /* 4663 * If this is the first time we are logging the inode in the current 4664 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4665 * because it does a deletion search, which always acquires write locks 4666 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4667 * but also adds significant contention in a log tree, since log trees 4668 * are small, with a root at level 2 or 3 at most, due to their short 4669 * life span. 
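 *
 * So we only call btrfs_drop_extents() below when ctx->logged_before is
 * set, i.e. when the inode may already have extent items in the log tree
 * from a previous logging in the current transaction.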
4670 */ 4671 if (ctx->logged_before) { 4672 drop_args.path = path; 4673 drop_args.start = em->start; 4674 drop_args.end = em->start + em->len; 4675 drop_args.replace_extent = true; 4676 drop_args.extent_item_size = sizeof(fi); 4677 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4678 if (ret) 4679 return ret; 4680 } 4681 4682 if (!drop_args.extent_inserted) { 4683 key.objectid = btrfs_ino(inode); 4684 key.type = BTRFS_EXTENT_DATA_KEY; 4685 key.offset = em->start; 4686 4687 ret = btrfs_insert_empty_item(trans, log, path, &key, 4688 sizeof(fi)); 4689 if (ret) 4690 return ret; 4691 } 4692 leaf = path->nodes[0]; 4693 write_extent_buffer(leaf, &fi, 4694 btrfs_item_ptr_offset(leaf, path->slots[0]), 4695 sizeof(fi)); 4696 btrfs_mark_buffer_dirty(trans, leaf); 4697 4698 btrfs_release_path(path); 4699 4700 return ret; 4701 } 4702 4703 /* 4704 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4705 * lose them after doing a full/fast fsync and replaying the log. We scan the 4706 * subvolume's root instead of iterating the inode's extent map tree because 4707 * otherwise we can log incorrect extent items based on extent map conversion. 4708 * That can happen due to the fact that extent maps are merged when they 4709 * are not in the extent map tree's list of modified extents. 4710 */ 4711 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4712 struct btrfs_inode *inode, 4713 struct btrfs_path *path) 4714 { 4715 struct btrfs_root *root = inode->root; 4716 struct btrfs_key key; 4717 const u64 i_size = i_size_read(&inode->vfs_inode); 4718 const u64 ino = btrfs_ino(inode); 4719 struct btrfs_path *dst_path = NULL; 4720 bool dropped_extents = false; 4721 u64 truncate_offset = i_size; 4722 struct extent_buffer *leaf; 4723 int slot; 4724 int ins_nr = 0; 4725 int start_slot = 0; 4726 int ret; 4727 4728 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4729 return 0; 4730 4731 key.objectid = ino; 4732 key.type = BTRFS_EXTENT_DATA_KEY; 4733 key.offset = i_size; 4734 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4735 if (ret < 0) 4736 goto out; 4737 4738 /* 4739 * We must check if there is a prealloc extent that starts before the 4740 * i_size and crosses the i_size boundary. This is to ensure later we 4741 * truncate down to the end of that extent and not to the i_size, as 4742 * otherwise we end up losing part of the prealloc extent after a log 4743 * replay and with an implicit hole if there is another prealloc extent 4744 * that starts at an offset beyond i_size. 
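 *
 * For example (illustrative numbers, not from a real workload): with an
 * i_size of 8K, a prealloc extent covering [4K, 16K) and another one at
 * [32K, 64K), we must truncate the log at 16K (the end of the first
 * extent) and not at 8K, otherwise after a log replay the [8K, 16K) part
 * of the first extent is lost and left as an implicit hole.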
4745 */ 4746 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4747 if (ret < 0) 4748 goto out; 4749 4750 if (ret == 0) { 4751 struct btrfs_file_extent_item *ei; 4752 4753 leaf = path->nodes[0]; 4754 slot = path->slots[0]; 4755 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4756 4757 if (btrfs_file_extent_type(leaf, ei) == 4758 BTRFS_FILE_EXTENT_PREALLOC) { 4759 u64 extent_end; 4760 4761 btrfs_item_key_to_cpu(leaf, &key, slot); 4762 extent_end = key.offset + 4763 btrfs_file_extent_num_bytes(leaf, ei); 4764 4765 if (extent_end > i_size) 4766 truncate_offset = extent_end; 4767 } 4768 } else { 4769 ret = 0; 4770 } 4771 4772 while (true) { 4773 leaf = path->nodes[0]; 4774 slot = path->slots[0]; 4775 4776 if (slot >= btrfs_header_nritems(leaf)) { 4777 if (ins_nr > 0) { 4778 ret = copy_items(trans, inode, dst_path, path, 4779 start_slot, ins_nr, 1, 0); 4780 if (ret < 0) 4781 goto out; 4782 ins_nr = 0; 4783 } 4784 ret = btrfs_next_leaf(root, path); 4785 if (ret < 0) 4786 goto out; 4787 if (ret > 0) { 4788 ret = 0; 4789 break; 4790 } 4791 continue; 4792 } 4793 4794 btrfs_item_key_to_cpu(leaf, &key, slot); 4795 if (key.objectid > ino) 4796 break; 4797 if (WARN_ON_ONCE(key.objectid < ino) || 4798 key.type < BTRFS_EXTENT_DATA_KEY || 4799 key.offset < i_size) { 4800 path->slots[0]++; 4801 continue; 4802 } 4803 /* 4804 * Avoid overlapping items in the log tree. The first time we 4805 * get here, get rid of everything from a past fsync. After 4806 * that, if the current extent starts before the end of the last 4807 * extent we copied, truncate the last one. This can happen if 4808 * an ordered extent completion modifies the subvolume tree 4809 * while btrfs_next_leaf() has the tree unlocked. 4810 */ 4811 if (!dropped_extents || key.offset < truncate_offset) { 4812 ret = truncate_inode_items(trans, root->log_root, inode, 4813 min(key.offset, truncate_offset), 4814 BTRFS_EXTENT_DATA_KEY); 4815 if (ret) 4816 goto out; 4817 dropped_extents = true; 4818 } 4819 truncate_offset = btrfs_file_extent_end(path); 4820 if (ins_nr == 0) 4821 start_slot = slot; 4822 ins_nr++; 4823 path->slots[0]++; 4824 if (!dst_path) { 4825 dst_path = btrfs_alloc_path(); 4826 if (!dst_path) { 4827 ret = -ENOMEM; 4828 goto out; 4829 } 4830 } 4831 } 4832 if (ins_nr > 0) 4833 ret = copy_items(trans, inode, dst_path, path, 4834 start_slot, ins_nr, 1, 0); 4835 out: 4836 btrfs_release_path(path); 4837 btrfs_free_path(dst_path); 4838 return ret; 4839 } 4840 4841 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4842 struct btrfs_inode *inode, 4843 struct btrfs_path *path, 4844 struct btrfs_log_ctx *ctx) 4845 { 4846 struct btrfs_ordered_extent *ordered; 4847 struct btrfs_ordered_extent *tmp; 4848 struct extent_map *em, *n; 4849 LIST_HEAD(extents); 4850 struct extent_map_tree *tree = &inode->extent_tree; 4851 int ret = 0; 4852 int num = 0; 4853 4854 write_lock(&tree->lock); 4855 4856 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4857 list_del_init(&em->list); 4858 /* 4859 * Just an arbitrary number, this can be really CPU intensive 4860 * once we start getting a lot of extents, and really once we 4861 * have a bunch of extents we just want to commit since it will 4862 * be faster. 4863 */ 4864 if (++num > 32768) { 4865 list_del_init(&tree->modified_extents); 4866 ret = -EFBIG; 4867 goto process; 4868 } 4869 4870 if (em->generation < trans->transid) 4871 continue; 4872 4873 /* We log prealloc extents beyond eof later. 
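 * (Those are handled by btrfs_log_prealloc_extents(), called below after
 * the list of modified extent maps has been processed.)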
*/
4874 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4875 em->start >= i_size_read(&inode->vfs_inode))
4876 continue;
4877
4878 /* Need a ref to keep it from getting evicted from cache */
4879 refcount_inc(&em->refs);
4880 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4881 list_add_tail(&em->list, &extents);
4882 num++;
4883 }
4884
4885 list_sort(NULL, &extents, extent_cmp);
4886 process:
4887 while (!list_empty(&extents)) {
4888 em = list_entry(extents.next, struct extent_map, list);
4889
4890 list_del_init(&em->list);
4891
4892 /*
4893 * If we had an error we just need to delete everybody from our
4894 * private list.
4895 */
4896 if (ret) {
4897 clear_em_logging(tree, em);
4898 free_extent_map(em);
4899 continue;
4900 }
4901
4902 write_unlock(&tree->lock);
4903
4904 ret = log_one_extent(trans, inode, em, path, ctx);
4905 write_lock(&tree->lock);
4906 clear_em_logging(tree, em);
4907 free_extent_map(em);
4908 }
4909 WARN_ON(!list_empty(&extents));
4910 write_unlock(&tree->lock);
4911
4912 if (!ret)
4913 ret = btrfs_log_prealloc_extents(trans, inode, path);
4914 if (ret)
4915 return ret;
4916
4917 /*
4918 * We have logged all extents successfully, now make sure the commit of
4919 * the current transaction waits for the ordered extents to complete
4920 * before it commits and wipes out the log trees, otherwise we would
4921 * lose data if an ordered extent completes after the transaction
4922 * commits and a power failure happens after the transaction commit.
4923 */
4924 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4925 list_del_init(&ordered->log_list);
4926 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4927
4928 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4929 spin_lock_irq(&inode->ordered_tree.lock);
4930 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4931 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4932 atomic_inc(&trans->transaction->pending_ordered);
4933 }
4934 spin_unlock_irq(&inode->ordered_tree.lock);
4935 }
4936 btrfs_put_ordered_extent(ordered);
4937 }
4938
4939 return 0;
4940 }
4941
4942 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4943 struct btrfs_path *path, u64 *size_ret)
4944 {
4945 struct btrfs_key key;
4946 int ret;
4947
4948 key.objectid = btrfs_ino(inode);
4949 key.type = BTRFS_INODE_ITEM_KEY;
4950 key.offset = 0;
4951
4952 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4953 if (ret < 0) {
4954 return ret;
4955 } else if (ret > 0) {
4956 *size_ret = 0;
4957 } else {
4958 struct btrfs_inode_item *item;
4959
4960 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4961 struct btrfs_inode_item);
4962 *size_ret = btrfs_inode_size(path->nodes[0], item);
4963 /*
4964 * If the in-memory inode's i_size is smaller than the inode
4965 * size stored in the btree, return the inode's i_size, so
4966 * that we get a correct inode size after replaying the log
4967 * when before a power failure we had a shrinking truncate
4968 * followed by addition of a new name (rename / new hard link).
4969 * Otherwise return the inode size from the btree, to avoid
4970 * data loss when replaying a log due to previously doing a
4971 * write that expands the inode's size and logging a new name
4972 * immediately after.
4973 */
4974 if (*size_ret > inode->vfs_inode.i_size)
4975 *size_ret = inode->vfs_inode.i_size;
4976 }
4977
4978 btrfs_release_path(path);
4979 return 0;
4980 }
4981
4982 /*
4983 * At the moment we always log all xattrs.
This is to figure out at log replay 4984 * time which xattrs must have their deletion replayed. If a xattr is missing 4985 * in the log tree and exists in the fs/subvol tree, we delete it. This is 4986 * because if a xattr is deleted, the inode is fsynced and a power failure 4987 * happens, causing the log to be replayed the next time the fs is mounted, 4988 * we want the xattr to not exist anymore (same behaviour as other filesystems 4989 * with a journal, ext3/4, xfs, f2fs, etc). 4990 */ 4991 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 4992 struct btrfs_inode *inode, 4993 struct btrfs_path *path, 4994 struct btrfs_path *dst_path) 4995 { 4996 struct btrfs_root *root = inode->root; 4997 int ret; 4998 struct btrfs_key key; 4999 const u64 ino = btrfs_ino(inode); 5000 int ins_nr = 0; 5001 int start_slot = 0; 5002 bool found_xattrs = false; 5003 5004 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5005 return 0; 5006 5007 key.objectid = ino; 5008 key.type = BTRFS_XATTR_ITEM_KEY; 5009 key.offset = 0; 5010 5011 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5012 if (ret < 0) 5013 return ret; 5014 5015 while (true) { 5016 int slot = path->slots[0]; 5017 struct extent_buffer *leaf = path->nodes[0]; 5018 int nritems = btrfs_header_nritems(leaf); 5019 5020 if (slot >= nritems) { 5021 if (ins_nr > 0) { 5022 ret = copy_items(trans, inode, dst_path, path, 5023 start_slot, ins_nr, 1, 0); 5024 if (ret < 0) 5025 return ret; 5026 ins_nr = 0; 5027 } 5028 ret = btrfs_next_leaf(root, path); 5029 if (ret < 0) 5030 return ret; 5031 else if (ret > 0) 5032 break; 5033 continue; 5034 } 5035 5036 btrfs_item_key_to_cpu(leaf, &key, slot); 5037 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5038 break; 5039 5040 if (ins_nr == 0) 5041 start_slot = slot; 5042 ins_nr++; 5043 path->slots[0]++; 5044 found_xattrs = true; 5045 cond_resched(); 5046 } 5047 if (ins_nr > 0) { 5048 ret = copy_items(trans, inode, dst_path, path, 5049 start_slot, ins_nr, 1, 0); 5050 if (ret < 0) 5051 return ret; 5052 } 5053 5054 if (!found_xattrs) 5055 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5056 5057 return 0; 5058 } 5059 5060 /* 5061 * When using the NO_HOLES feature if we punched a hole that causes the 5062 * deletion of entire leafs or all the extent items of the first leaf (the one 5063 * that contains the inode item and references) we may end up not processing 5064 * any extents, because there are no leafs with a generation matching the 5065 * current transaction that have extent items for our inode. So we need to find 5066 * if any holes exist and then log them. We also need to log holes after any 5067 * truncate operation that changes the inode's size. 
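 *
 * The loop below walks the inode's file extent items in the subvolume tree
 * and, for every gap between the end of one extent and the start of the
 * next one (and between the last extent and i_size), inserts an explicit
 * hole file extent item in the log tree.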
5068 */ 5069 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5070 struct btrfs_inode *inode, 5071 struct btrfs_path *path) 5072 { 5073 struct btrfs_root *root = inode->root; 5074 struct btrfs_fs_info *fs_info = root->fs_info; 5075 struct btrfs_key key; 5076 const u64 ino = btrfs_ino(inode); 5077 const u64 i_size = i_size_read(&inode->vfs_inode); 5078 u64 prev_extent_end = 0; 5079 int ret; 5080 5081 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5082 return 0; 5083 5084 key.objectid = ino; 5085 key.type = BTRFS_EXTENT_DATA_KEY; 5086 key.offset = 0; 5087 5088 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5089 if (ret < 0) 5090 return ret; 5091 5092 while (true) { 5093 struct extent_buffer *leaf = path->nodes[0]; 5094 5095 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5096 ret = btrfs_next_leaf(root, path); 5097 if (ret < 0) 5098 return ret; 5099 if (ret > 0) { 5100 ret = 0; 5101 break; 5102 } 5103 leaf = path->nodes[0]; 5104 } 5105 5106 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5107 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5108 break; 5109 5110 /* We have a hole, log it. */ 5111 if (prev_extent_end < key.offset) { 5112 const u64 hole_len = key.offset - prev_extent_end; 5113 5114 /* 5115 * Release the path to avoid deadlocks with other code 5116 * paths that search the root while holding locks on 5117 * leafs from the log root. 5118 */ 5119 btrfs_release_path(path); 5120 ret = btrfs_insert_hole_extent(trans, root->log_root, 5121 ino, prev_extent_end, 5122 hole_len); 5123 if (ret < 0) 5124 return ret; 5125 5126 /* 5127 * Search for the same key again in the root. Since it's 5128 * an extent item and we are holding the inode lock, the 5129 * key must still exist. If it doesn't just emit warning 5130 * and return an error to fall back to a transaction 5131 * commit. 5132 */ 5133 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5134 if (ret < 0) 5135 return ret; 5136 if (WARN_ON(ret > 0)) 5137 return -ENOENT; 5138 leaf = path->nodes[0]; 5139 } 5140 5141 prev_extent_end = btrfs_file_extent_end(path); 5142 path->slots[0]++; 5143 cond_resched(); 5144 } 5145 5146 if (prev_extent_end < i_size) { 5147 u64 hole_len; 5148 5149 btrfs_release_path(path); 5150 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5151 ret = btrfs_insert_hole_extent(trans, root->log_root, ino, 5152 prev_extent_end, hole_len); 5153 if (ret < 0) 5154 return ret; 5155 } 5156 5157 return 0; 5158 } 5159 5160 /* 5161 * When we are logging a new inode X, check if it doesn't have a reference that 5162 * matches the reference from some other inode Y created in a past transaction 5163 * and that was renamed in the current transaction. If we don't do this, then at 5164 * log replay time we can lose inode Y (and all its files if it's a directory): 5165 * 5166 * mkdir /mnt/x 5167 * echo "hello world" > /mnt/x/foobar 5168 * sync 5169 * mv /mnt/x /mnt/y 5170 * mkdir /mnt/x # or touch /mnt/x 5171 * xfs_io -c fsync /mnt/x 5172 * <power fail> 5173 * mount fs, trigger log replay 5174 * 5175 * After the log replay procedure, we would lose the first directory and all its 5176 * files (file foobar). 
5177 * For the case where inode Y is not a directory we simply end up losing it: 5178 * 5179 * echo "123" > /mnt/foo 5180 * sync 5181 * mv /mnt/foo /mnt/bar 5182 * echo "abc" > /mnt/foo 5183 * xfs_io -c fsync /mnt/foo 5184 * <power fail> 5185 * 5186 * We also need this for cases where a snapshot entry is replaced by some other 5187 * entry (file or directory) otherwise we end up with an unreplayable log due to 5188 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5189 * if it were a regular entry: 5190 * 5191 * mkdir /mnt/x 5192 * btrfs subvolume snapshot /mnt /mnt/x/snap 5193 * btrfs subvolume delete /mnt/x/snap 5194 * rmdir /mnt/x 5195 * mkdir /mnt/x 5196 * fsync /mnt/x or fsync some new file inside it 5197 * <power fail> 5198 * 5199 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5200 * the same transaction. 5201 */ 5202 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5203 const int slot, 5204 const struct btrfs_key *key, 5205 struct btrfs_inode *inode, 5206 u64 *other_ino, u64 *other_parent) 5207 { 5208 int ret; 5209 struct btrfs_path *search_path; 5210 char *name = NULL; 5211 u32 name_len = 0; 5212 u32 item_size = btrfs_item_size(eb, slot); 5213 u32 cur_offset = 0; 5214 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5215 5216 search_path = btrfs_alloc_path(); 5217 if (!search_path) 5218 return -ENOMEM; 5219 search_path->search_commit_root = 1; 5220 search_path->skip_locking = 1; 5221 5222 while (cur_offset < item_size) { 5223 u64 parent; 5224 u32 this_name_len; 5225 u32 this_len; 5226 unsigned long name_ptr; 5227 struct btrfs_dir_item *di; 5228 struct fscrypt_str name_str; 5229 5230 if (key->type == BTRFS_INODE_REF_KEY) { 5231 struct btrfs_inode_ref *iref; 5232 5233 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5234 parent = key->offset; 5235 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5236 name_ptr = (unsigned long)(iref + 1); 5237 this_len = sizeof(*iref) + this_name_len; 5238 } else { 5239 struct btrfs_inode_extref *extref; 5240 5241 extref = (struct btrfs_inode_extref *)(ptr + 5242 cur_offset); 5243 parent = btrfs_inode_extref_parent(eb, extref); 5244 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5245 name_ptr = (unsigned long)&extref->name; 5246 this_len = sizeof(*extref) + this_name_len; 5247 } 5248 5249 if (this_name_len > name_len) { 5250 char *new_name; 5251 5252 new_name = krealloc(name, this_name_len, GFP_NOFS); 5253 if (!new_name) { 5254 ret = -ENOMEM; 5255 goto out; 5256 } 5257 name_len = this_name_len; 5258 name = new_name; 5259 } 5260 5261 read_extent_buffer(eb, name, name_ptr, this_name_len); 5262 5263 name_str.name = name; 5264 name_str.len = this_name_len; 5265 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5266 parent, &name_str, 0); 5267 if (di && !IS_ERR(di)) { 5268 struct btrfs_key di_key; 5269 5270 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5271 di, &di_key); 5272 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5273 if (di_key.objectid != key->objectid) { 5274 ret = 1; 5275 *other_ino = di_key.objectid; 5276 *other_parent = parent; 5277 } else { 5278 ret = 0; 5279 } 5280 } else { 5281 ret = -EAGAIN; 5282 } 5283 goto out; 5284 } else if (IS_ERR(di)) { 5285 ret = PTR_ERR(di); 5286 goto out; 5287 } 5288 btrfs_release_path(search_path); 5289 5290 cur_offset += this_len; 5291 } 5292 ret = 0; 5293 out: 5294 btrfs_free_path(search_path); 5295 kfree(name); 5296 return ret; 5297 } 5298 5299 /* 5300 * Check if we need to log an inode. 
This is used in contexts where while
5301 * logging an inode we need to log another inode (either that it exists or in
5302 * full mode). This is used instead of btrfs_inode_in_log() because the latter
5303 * requires the inode to be in the log and have the log transaction committed,
5304 * while here we do not care if the log transaction was already committed - our
5305 * caller will commit the log later - and we want to avoid logging an inode
5306 * multiple times when multiple tasks have joined the same log transaction.
5307 */
5308 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5309 struct btrfs_inode *inode)
5310 {
5311 /*
5312 * If a directory was not modified, no dentries added or removed, we can
5313 * and should avoid logging it.
5314 */
5315 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5316 return false;
5317
5318 /*
5319 * If this inode does not have new/updated/deleted xattrs since the last
5320 * time it was logged and is flagged as logged in the current transaction,
5321 * we can skip logging it. As for new/deleted names, those are updated in
5322 * the log by link/unlink/rename operations.
5323 * In case the inode was logged and then evicted and reloaded, its
5324 * logged_trans will be 0, in which case we have to fully log it since
5325 * logged_trans is a transient field, not persisted.
5326 */
5327 if (inode_logged(trans, inode, NULL) == 1 &&
5328 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5329 return false;
5330
5331 return true;
5332 }
5333
5334 struct btrfs_dir_list {
5335 u64 ino;
5336 struct list_head list;
5337 };
5338
5339 /*
5340 * Log the inodes of the new dentries of a directory.
5341 * See process_dir_items_leaf() for details about why it is needed.
5342 * This is a recursive operation - if an existing dentry corresponds to a
5343 * directory, that directory's new entries are logged too (same behaviour as
5344 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5345 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5346 * complains about the following circular lock dependency / possible deadlock:
5347 *
5348 * CPU0 CPU1
5349 * ---- ----
5350 * lock(&type->i_mutex_dir_key#3/2);
5351 * lock(sb_internal#2);
5352 * lock(&type->i_mutex_dir_key#3/2);
5353 * lock(&sb->s_type->i_mutex_key#14);
5354 *
5355 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5356 * sb_start_intwrite() in btrfs_start_transaction().
5357 * Not acquiring the VFS lock of the inodes is still safe because:
5358 *
5359 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5360 * that while logging the inode new references (names) are added or removed
5361 * from the inode, leaving the logged inode item with a link count that does
5362 * not match the number of logged inode reference items. This is fine because
5363 * at log replay time we compute the real number of links and correct the
5364 * link count in the inode item (see replay_one_buffer() and
5365 * link_to_fixup_dir());
5366 *
5367 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 5368 * while logging the inode's items new index items (key type 5369 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 5370 * has a size that doesn't match the sum of the lengths of all the logged 5371 * names - this is ok, not a problem, because at log replay time we set the 5372 * directory's i_size to the correct value (see replay_one_name() and 5373 * overwrite_item()). 5374 */ 5375 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5376 struct btrfs_inode *start_inode, 5377 struct btrfs_log_ctx *ctx) 5378 { 5379 struct btrfs_root *root = start_inode->root; 5380 struct btrfs_fs_info *fs_info = root->fs_info; 5381 struct btrfs_path *path; 5382 LIST_HEAD(dir_list); 5383 struct btrfs_dir_list *dir_elem; 5384 u64 ino = btrfs_ino(start_inode); 5385 struct btrfs_inode *curr_inode = start_inode; 5386 int ret = 0; 5387 5388 /* 5389 * If we are logging a new name, as part of a link or rename operation, 5390 * don't bother logging new dentries, as we just want to log the names 5391 * of an inode and that any new parents exist. 5392 */ 5393 if (ctx->logging_new_name) 5394 return 0; 5395 5396 path = btrfs_alloc_path(); 5397 if (!path) 5398 return -ENOMEM; 5399 5400 /* Pairs with btrfs_add_delayed_iput below. */ 5401 ihold(&curr_inode->vfs_inode); 5402 5403 while (true) { 5404 struct inode *vfs_inode; 5405 struct btrfs_key key; 5406 struct btrfs_key found_key; 5407 u64 next_index; 5408 bool continue_curr_inode = true; 5409 int iter_ret; 5410 5411 key.objectid = ino; 5412 key.type = BTRFS_DIR_INDEX_KEY; 5413 key.offset = btrfs_get_first_dir_index_to_log(curr_inode); 5414 next_index = key.offset; 5415 again: 5416 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { 5417 struct extent_buffer *leaf = path->nodes[0]; 5418 struct btrfs_dir_item *di; 5419 struct btrfs_key di_key; 5420 struct inode *di_inode; 5421 int log_mode = LOG_INODE_EXISTS; 5422 int type; 5423 5424 if (found_key.objectid != ino || 5425 found_key.type != BTRFS_DIR_INDEX_KEY) { 5426 continue_curr_inode = false; 5427 break; 5428 } 5429 5430 next_index = found_key.offset + 1; 5431 5432 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 5433 type = btrfs_dir_ftype(leaf, di); 5434 if (btrfs_dir_transid(leaf, di) < trans->transid) 5435 continue; 5436 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5437 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5438 continue; 5439 5440 btrfs_release_path(path); 5441 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5442 if (IS_ERR(di_inode)) { 5443 ret = PTR_ERR(di_inode); 5444 goto out; 5445 } 5446 5447 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 5448 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 5449 break; 5450 } 5451 5452 ctx->log_new_dentries = false; 5453 if (type == BTRFS_FT_DIR) 5454 log_mode = LOG_INODE_ALL; 5455 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 5456 log_mode, ctx); 5457 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 5458 if (ret) 5459 goto out; 5460 if (ctx->log_new_dentries) { 5461 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5462 if (!dir_elem) { 5463 ret = -ENOMEM; 5464 goto out; 5465 } 5466 dir_elem->ino = di_key.objectid; 5467 list_add_tail(&dir_elem->list, &dir_list); 5468 } 5469 break; 5470 } 5471 5472 btrfs_release_path(path); 5473 5474 if (iter_ret < 0) { 5475 ret = iter_ret; 5476 goto out; 5477 } else if (iter_ret > 0) { 5478 continue_curr_inode = false; 5479 } else { 5480 key = found_key; 5481 } 5482 5483 if (continue_curr_inode && key.offset < (u64)-1) { 5484 
key.offset++; 5485 goto again; 5486 } 5487 5488 btrfs_set_first_dir_index_to_log(curr_inode, next_index); 5489 5490 if (list_empty(&dir_list)) 5491 break; 5492 5493 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); 5494 ino = dir_elem->ino; 5495 list_del(&dir_elem->list); 5496 kfree(dir_elem); 5497 5498 btrfs_add_delayed_iput(curr_inode); 5499 curr_inode = NULL; 5500 5501 vfs_inode = btrfs_iget(fs_info->sb, ino, root); 5502 if (IS_ERR(vfs_inode)) { 5503 ret = PTR_ERR(vfs_inode); 5504 break; 5505 } 5506 curr_inode = BTRFS_I(vfs_inode); 5507 } 5508 out: 5509 btrfs_free_path(path); 5510 if (curr_inode) 5511 btrfs_add_delayed_iput(curr_inode); 5512 5513 if (ret) { 5514 struct btrfs_dir_list *next; 5515 5516 list_for_each_entry_safe(dir_elem, next, &dir_list, list) 5517 kfree(dir_elem); 5518 } 5519 5520 return ret; 5521 } 5522 5523 struct btrfs_ino_list { 5524 u64 ino; 5525 u64 parent; 5526 struct list_head list; 5527 }; 5528 5529 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) 5530 { 5531 struct btrfs_ino_list *curr; 5532 struct btrfs_ino_list *next; 5533 5534 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { 5535 list_del(&curr->list); 5536 kfree(curr); 5537 } 5538 } 5539 5540 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, 5541 struct btrfs_path *path) 5542 { 5543 struct btrfs_key key; 5544 int ret; 5545 5546 key.objectid = ino; 5547 key.type = BTRFS_INODE_ITEM_KEY; 5548 key.offset = 0; 5549 5550 path->search_commit_root = 1; 5551 path->skip_locking = 1; 5552 5553 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5554 if (WARN_ON_ONCE(ret > 0)) { 5555 /* 5556 * We have previously found the inode through the commit root 5557 * so this should not happen. If it does, just error out and 5558 * fallback to a transaction commit. 5559 */ 5560 ret = -ENOENT; 5561 } else if (ret == 0) { 5562 struct btrfs_inode_item *item; 5563 5564 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5565 struct btrfs_inode_item); 5566 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) 5567 ret = 1; 5568 } 5569 5570 btrfs_release_path(path); 5571 path->search_commit_root = 0; 5572 path->skip_locking = 0; 5573 5574 return ret; 5575 } 5576 5577 static int add_conflicting_inode(struct btrfs_trans_handle *trans, 5578 struct btrfs_root *root, 5579 struct btrfs_path *path, 5580 u64 ino, u64 parent, 5581 struct btrfs_log_ctx *ctx) 5582 { 5583 struct btrfs_ino_list *ino_elem; 5584 struct inode *inode; 5585 5586 /* 5587 * It's rare to have a lot of conflicting inodes, in practice it is not 5588 * common to have more than 1 or 2. We don't want to collect too many, 5589 * as we could end up logging too many inodes (even if only in 5590 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction 5591 * commits. 5592 */ 5593 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) 5594 return BTRFS_LOG_FORCE_COMMIT; 5595 5596 inode = btrfs_iget(root->fs_info->sb, ino, root); 5597 /* 5598 * If the other inode that had a conflicting dir entry was deleted in 5599 * the current transaction then we either: 5600 * 5601 * 1) Log the parent directory (later after adding it to the list) if 5602 * the inode is a directory. This is because it may be a deleted 5603 * subvolume/snapshot or it may be a regular directory that had 5604 * deleted subvolumes/snapshots (or subdirectories that had them), 5605 * and at the moment we can't deal with dropping subvolumes/snapshots 5606 * during log replay. 
So we just log the parent, which will result in 5607 * a fallback to a transaction commit if we are dealing with those 5608 * cases (last_unlink_trans will match the current transaction); 5609 * 5610 * 2) Do nothing if it's not a directory. During log replay we simply 5611 * unlink the conflicting dentry from the parent directory and then 5612 * add the dentry for our inode. Like this we can avoid logging the 5613 * parent directory (and maybe fallback to a transaction commit in 5614 * case it has a last_unlink_trans == trans->transid, due to moving 5615 * some inode from it to some other directory). 5616 */ 5617 if (IS_ERR(inode)) { 5618 int ret = PTR_ERR(inode); 5619 5620 if (ret != -ENOENT) 5621 return ret; 5622 5623 ret = conflicting_inode_is_dir(root, ino, path); 5624 /* Not a directory or we got an error. */ 5625 if (ret <= 0) 5626 return ret; 5627 5628 /* Conflicting inode is a directory, so we'll log its parent. */ 5629 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5630 if (!ino_elem) 5631 return -ENOMEM; 5632 ino_elem->ino = ino; 5633 ino_elem->parent = parent; 5634 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5635 ctx->num_conflict_inodes++; 5636 5637 return 0; 5638 } 5639 5640 /* 5641 * If the inode was already logged skip it - otherwise we can hit an 5642 * infinite loop. Example: 5643 * 5644 * From the commit root (previous transaction) we have the following 5645 * inodes: 5646 * 5647 * inode 257 a directory 5648 * inode 258 with references "zz" and "zz_link" on inode 257 5649 * inode 259 with reference "a" on inode 257 5650 * 5651 * And in the current (uncommitted) transaction we have: 5652 * 5653 * inode 257 a directory, unchanged 5654 * inode 258 with references "a" and "a2" on inode 257 5655 * inode 259 with reference "zz_link" on inode 257 5656 * inode 261 with reference "zz" on inode 257 5657 * 5658 * When logging inode 261 the following infinite loop could 5659 * happen if we don't skip already logged inodes: 5660 * 5661 * - we detect inode 258 as a conflicting inode, with inode 261 5662 * on reference "zz", and log it; 5663 * 5664 * - we detect inode 259 as a conflicting inode, with inode 258 5665 * on reference "a", and log it; 5666 * 5667 * - we detect inode 258 as a conflicting inode, with inode 259 5668 * on reference "zz_link", and log it - again! After this we 5669 * repeat the above steps forever. 5670 * 5671 * Here we can use need_log_inode() because we only need to log the 5672 * inode in LOG_INODE_EXISTS mode and rename operations update the log, 5673 * so that the log ends up with the new name and without the old name. 5674 */ 5675 if (!need_log_inode(trans, BTRFS_I(inode))) { 5676 btrfs_add_delayed_iput(BTRFS_I(inode)); 5677 return 0; 5678 } 5679 5680 btrfs_add_delayed_iput(BTRFS_I(inode)); 5681 5682 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5683 if (!ino_elem) 5684 return -ENOMEM; 5685 ino_elem->ino = ino; 5686 ino_elem->parent = parent; 5687 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5688 ctx->num_conflict_inodes++; 5689 5690 return 0; 5691 } 5692 5693 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5694 struct btrfs_root *root, 5695 struct btrfs_log_ctx *ctx) 5696 { 5697 struct btrfs_fs_info *fs_info = root->fs_info; 5698 int ret = 0; 5699 5700 /* 5701 * Conflicting inodes are logged by the first call to btrfs_log_inode(), 5702 * otherwise we could have unbounded recursion of btrfs_log_inode() 5703 * calls. This check guarantees we can have only 1 level of recursion. 
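 * Any conflicting inodes found while logging the inodes below are added to
 * ctx->conflict_inodes (through add_conflicting_inode()) and handled by
 * this same loop, instead of triggering further nested calls.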
5704 */ 5705 if (ctx->logging_conflict_inodes) 5706 return 0; 5707 5708 ctx->logging_conflict_inodes = true; 5709 5710 /* 5711 * New conflicting inodes may be found and added to the list while we 5712 * are logging a conflicting inode, so keep iterating while the list is 5713 * not empty. 5714 */ 5715 while (!list_empty(&ctx->conflict_inodes)) { 5716 struct btrfs_ino_list *curr; 5717 struct inode *inode; 5718 u64 ino; 5719 u64 parent; 5720 5721 curr = list_first_entry(&ctx->conflict_inodes, 5722 struct btrfs_ino_list, list); 5723 ino = curr->ino; 5724 parent = curr->parent; 5725 list_del(&curr->list); 5726 kfree(curr); 5727 5728 inode = btrfs_iget(fs_info->sb, ino, root); 5729 /* 5730 * If the other inode that had a conflicting dir entry was 5731 * deleted in the current transaction, we need to log its parent 5732 * directory. See the comment at add_conflicting_inode(). 5733 */ 5734 if (IS_ERR(inode)) { 5735 ret = PTR_ERR(inode); 5736 if (ret != -ENOENT) 5737 break; 5738 5739 inode = btrfs_iget(fs_info->sb, parent, root); 5740 if (IS_ERR(inode)) { 5741 ret = PTR_ERR(inode); 5742 break; 5743 } 5744 5745 /* 5746 * Always log the directory, we cannot make this 5747 * conditional on need_log_inode() because the directory 5748 * might have been logged in LOG_INODE_EXISTS mode or 5749 * the dir index of the conflicting inode is not in a 5750 * dir index key range logged for the directory. So we 5751 * must make sure the deletion is recorded. 5752 */ 5753 ret = btrfs_log_inode(trans, BTRFS_I(inode), 5754 LOG_INODE_ALL, ctx); 5755 btrfs_add_delayed_iput(BTRFS_I(inode)); 5756 if (ret) 5757 break; 5758 continue; 5759 } 5760 5761 /* 5762 * Here we can use need_log_inode() because we only need to log 5763 * the inode in LOG_INODE_EXISTS mode and rename operations 5764 * update the log, so that the log ends up with the new name and 5765 * without the old name. 5766 * 5767 * We did this check at add_conflicting_inode(), but here we do 5768 * it again because if some other task logged the inode after 5769 * that, we can avoid doing it again. 5770 */ 5771 if (!need_log_inode(trans, BTRFS_I(inode))) { 5772 btrfs_add_delayed_iput(BTRFS_I(inode)); 5773 continue; 5774 } 5775 5776 /* 5777 * We are safe logging the other inode without acquiring its 5778 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5779 * are safe against concurrent renames of the other inode as 5780 * well because during a rename we pin the log and update the 5781 * log with the new name before we unpin it. 
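* (See btrfs_log_new_name(): it pins the log with join_running_log_trans() and only unpins it after both the old directory and the renamed inode have been updated in the log.)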
5782 */ 5783 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); 5784 btrfs_add_delayed_iput(BTRFS_I(inode)); 5785 if (ret) 5786 break; 5787 } 5788 5789 ctx->logging_conflict_inodes = false; 5790 if (ret) 5791 free_conflicting_inodes(ctx); 5792 5793 return ret; 5794 } 5795 5796 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5797 struct btrfs_inode *inode, 5798 struct btrfs_key *min_key, 5799 const struct btrfs_key *max_key, 5800 struct btrfs_path *path, 5801 struct btrfs_path *dst_path, 5802 const u64 logged_isize, 5803 const int inode_only, 5804 struct btrfs_log_ctx *ctx, 5805 bool *need_log_inode_item) 5806 { 5807 const u64 i_size = i_size_read(&inode->vfs_inode); 5808 struct btrfs_root *root = inode->root; 5809 int ins_start_slot = 0; 5810 int ins_nr = 0; 5811 int ret; 5812 5813 while (1) { 5814 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5815 if (ret < 0) 5816 return ret; 5817 if (ret > 0) { 5818 ret = 0; 5819 break; 5820 } 5821 again: 5822 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5823 if (min_key->objectid != max_key->objectid) 5824 break; 5825 if (min_key->type > max_key->type) 5826 break; 5827 5828 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5829 *need_log_inode_item = false; 5830 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5831 min_key->offset >= i_size) { 5832 /* 5833 * Extents at and beyond eof are logged with 5834 * btrfs_log_prealloc_extents(). 5835 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5836 * and no keys greater than that, so bail out. 5837 */ 5838 break; 5839 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5840 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5841 (inode->generation == trans->transid || 5842 ctx->logging_conflict_inodes)) { 5843 u64 other_ino = 0; 5844 u64 other_parent = 0; 5845 5846 ret = btrfs_check_ref_name_override(path->nodes[0], 5847 path->slots[0], min_key, inode, 5848 &other_ino, &other_parent); 5849 if (ret < 0) { 5850 return ret; 5851 } else if (ret > 0 && 5852 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5853 if (ins_nr > 0) { 5854 ins_nr++; 5855 } else { 5856 ins_nr = 1; 5857 ins_start_slot = path->slots[0]; 5858 } 5859 ret = copy_items(trans, inode, dst_path, path, 5860 ins_start_slot, ins_nr, 5861 inode_only, logged_isize); 5862 if (ret < 0) 5863 return ret; 5864 ins_nr = 0; 5865 5866 btrfs_release_path(path); 5867 ret = add_conflicting_inode(trans, root, path, 5868 other_ino, 5869 other_parent, ctx); 5870 if (ret) 5871 return ret; 5872 goto next_key; 5873 } 5874 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5875 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5876 if (ins_nr == 0) 5877 goto next_slot; 5878 ret = copy_items(trans, inode, dst_path, path, 5879 ins_start_slot, 5880 ins_nr, inode_only, logged_isize); 5881 if (ret < 0) 5882 return ret; 5883 ins_nr = 0; 5884 goto next_slot; 5885 } 5886 5887 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5888 ins_nr++; 5889 goto next_slot; 5890 } else if (!ins_nr) { 5891 ins_start_slot = path->slots[0]; 5892 ins_nr = 1; 5893 goto next_slot; 5894 } 5895 5896 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5897 ins_nr, inode_only, logged_isize); 5898 if (ret < 0) 5899 return ret; 5900 ins_nr = 1; 5901 ins_start_slot = path->slots[0]; 5902 next_slot: 5903 path->slots[0]++; 5904 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5905 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5906 path->slots[0]); 5907 goto again; 5908 } 5909 if (ins_nr) { 
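/* We went past the last slot of this leaf, so copy any pending batch of contiguous items before releasing the path and continuing from the next key. */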
5910 ret = copy_items(trans, inode, dst_path, path, 5911 ins_start_slot, ins_nr, inode_only, 5912 logged_isize); 5913 if (ret < 0) 5914 return ret; 5915 ins_nr = 0; 5916 } 5917 btrfs_release_path(path); 5918 next_key: 5919 if (min_key->offset < (u64)-1) { 5920 min_key->offset++; 5921 } else if (min_key->type < max_key->type) { 5922 min_key->type++; 5923 min_key->offset = 0; 5924 } else { 5925 break; 5926 } 5927 5928 /* 5929 * We may process many leaves full of items for our inode, so 5930 * avoid monopolizing a cpu for too long by rescheduling while 5931 * not holding locks on any tree. 5932 */ 5933 cond_resched(); 5934 } 5935 if (ins_nr) { 5936 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5937 ins_nr, inode_only, logged_isize); 5938 if (ret) 5939 return ret; 5940 } 5941 5942 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5943 /* 5944 * Release the path because otherwise we might attempt to double 5945 * lock the same leaf with btrfs_log_prealloc_extents() below. 5946 */ 5947 btrfs_release_path(path); 5948 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5949 } 5950 5951 return ret; 5952 } 5953 5954 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, 5955 struct btrfs_root *log, 5956 struct btrfs_path *path, 5957 const struct btrfs_item_batch *batch, 5958 const struct btrfs_delayed_item *first_item) 5959 { 5960 const struct btrfs_delayed_item *curr = first_item; 5961 int ret; 5962 5963 ret = btrfs_insert_empty_items(trans, log, path, batch); 5964 if (ret) 5965 return ret; 5966 5967 for (int i = 0; i < batch->nr; i++) { 5968 char *data_ptr; 5969 5970 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); 5971 write_extent_buffer(path->nodes[0], &curr->data, 5972 (unsigned long)data_ptr, curr->data_len); 5973 curr = list_next_entry(curr, log_list); 5974 path->slots[0]++; 5975 } 5976 5977 btrfs_release_path(path); 5978 5979 return 0; 5980 } 5981 5982 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, 5983 struct btrfs_inode *inode, 5984 struct btrfs_path *path, 5985 const struct list_head *delayed_ins_list, 5986 struct btrfs_log_ctx *ctx) 5987 { 5988 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ 5989 const int max_batch_size = 195; 5990 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); 5991 const u64 ino = btrfs_ino(inode); 5992 struct btrfs_root *log = inode->root->log_root; 5993 struct btrfs_item_batch batch = { 5994 .nr = 0, 5995 .total_data_size = 0, 5996 }; 5997 const struct btrfs_delayed_item *first = NULL; 5998 const struct btrfs_delayed_item *curr; 5999 char *ins_data; 6000 struct btrfs_key *ins_keys; 6001 u32 *ins_sizes; 6002 u64 curr_batch_size = 0; 6003 int batch_idx = 0; 6004 int ret; 6005 6006 /* We are adding dir index items to the log tree. */ 6007 lockdep_assert_held(&inode->log_mutex); 6008 6009 /* 6010 * We collect delayed items before copying index keys from the subvolume 6011 * to the log tree. However just after we collected them, they may have 6012 * been flushed (all of them or just some of them), and therefore we 6013 * could have copied them from the subvolume tree to the log tree. 6014 * So find the first delayed item that was not yet logged (they are 6015 * sorted by index number). 6016 */ 6017 list_for_each_entry(curr, delayed_ins_list, log_list) { 6018 if (curr->index > inode->last_dir_index_offset) { 6019 first = curr; 6020 break; 6021 } 6022 } 6023 6024 /* Empty list or all delayed items were already logged. 
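* (The list is sorted by index, so every delayed item with an index at or below last_dir_index_offset was already copied into the log.)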
*/ 6025 if (!first) 6026 return 0; 6027 6028 ins_data = kmalloc(max_batch_size * sizeof(u32) + 6029 max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); 6030 if (!ins_data) 6031 return -ENOMEM; 6032 ins_sizes = (u32 *)ins_data; 6033 batch.data_sizes = ins_sizes; 6034 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); 6035 batch.keys = ins_keys; 6036 6037 curr = first; 6038 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { 6039 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); 6040 6041 if (curr_batch_size + curr_size > leaf_data_size || 6042 batch.nr == max_batch_size) { 6043 ret = insert_delayed_items_batch(trans, log, path, 6044 &batch, first); 6045 if (ret) 6046 goto out; 6047 batch_idx = 0; 6048 batch.nr = 0; 6049 batch.total_data_size = 0; 6050 curr_batch_size = 0; 6051 first = curr; 6052 } 6053 6054 ins_sizes[batch_idx] = curr->data_len; 6055 ins_keys[batch_idx].objectid = ino; 6056 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; 6057 ins_keys[batch_idx].offset = curr->index; 6058 curr_batch_size += curr_size; 6059 batch.total_data_size += curr->data_len; 6060 batch.nr++; 6061 batch_idx++; 6062 curr = list_next_entry(curr, log_list); 6063 } 6064 6065 ASSERT(batch.nr >= 1); 6066 ret = insert_delayed_items_batch(trans, log, path, &batch, first); 6067 6068 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, 6069 log_list); 6070 inode->last_dir_index_offset = curr->index; 6071 out: 6072 kfree(ins_data); 6073 6074 return ret; 6075 } 6076 6077 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, 6078 struct btrfs_inode *inode, 6079 struct btrfs_path *path, 6080 const struct list_head *delayed_del_list, 6081 struct btrfs_log_ctx *ctx) 6082 { 6083 const u64 ino = btrfs_ino(inode); 6084 const struct btrfs_delayed_item *curr; 6085 6086 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6087 log_list); 6088 6089 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6090 u64 first_dir_index = curr->index; 6091 u64 last_dir_index; 6092 const struct btrfs_delayed_item *next; 6093 int ret; 6094 6095 /* 6096 * Find a range of consecutive dir index items to delete. Like 6097 * this we log a single dir range item spanning several contiguous 6098 * dir items instead of logging one range item per dir index item. 
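* For example (illustrative): if the delayed deletions cover the dir indexes 10, 11, 12 and 15, we log one range item for [10, 12] and another for [15, 15], instead of four separate items.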
6099 */ 6100 next = list_next_entry(curr, log_list); 6101 while (!list_entry_is_head(next, delayed_del_list, log_list)) { 6102 if (next->index != curr->index + 1) 6103 break; 6104 curr = next; 6105 next = list_next_entry(next, log_list); 6106 } 6107 6108 last_dir_index = curr->index; 6109 ASSERT(last_dir_index >= first_dir_index); 6110 6111 ret = insert_dir_log_key(trans, inode->root->log_root, path, 6112 ino, first_dir_index, last_dir_index); 6113 if (ret) 6114 return ret; 6115 curr = list_next_entry(curr, log_list); 6116 } 6117 6118 return 0; 6119 } 6120 6121 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, 6122 struct btrfs_inode *inode, 6123 struct btrfs_path *path, 6124 struct btrfs_log_ctx *ctx, 6125 const struct list_head *delayed_del_list, 6126 const struct btrfs_delayed_item *first, 6127 const struct btrfs_delayed_item **last_ret) 6128 { 6129 const struct btrfs_delayed_item *next; 6130 struct extent_buffer *leaf = path->nodes[0]; 6131 const int last_slot = btrfs_header_nritems(leaf) - 1; 6132 int slot = path->slots[0] + 1; 6133 const u64 ino = btrfs_ino(inode); 6134 6135 next = list_next_entry(first, log_list); 6136 6137 while (slot < last_slot && 6138 !list_entry_is_head(next, delayed_del_list, log_list)) { 6139 struct btrfs_key key; 6140 6141 btrfs_item_key_to_cpu(leaf, &key, slot); 6142 if (key.objectid != ino || 6143 key.type != BTRFS_DIR_INDEX_KEY || 6144 key.offset != next->index) 6145 break; 6146 6147 slot++; 6148 *last_ret = next; 6149 next = list_next_entry(next, log_list); 6150 } 6151 6152 return btrfs_del_items(trans, inode->root->log_root, path, 6153 path->slots[0], slot - path->slots[0]); 6154 } 6155 6156 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, 6157 struct btrfs_inode *inode, 6158 struct btrfs_path *path, 6159 const struct list_head *delayed_del_list, 6160 struct btrfs_log_ctx *ctx) 6161 { 6162 struct btrfs_root *log = inode->root->log_root; 6163 const struct btrfs_delayed_item *curr; 6164 u64 last_range_start = 0; 6165 u64 last_range_end = 0; 6166 struct btrfs_key key; 6167 6168 key.objectid = btrfs_ino(inode); 6169 key.type = BTRFS_DIR_INDEX_KEY; 6170 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6171 log_list); 6172 6173 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6174 const struct btrfs_delayed_item *last = curr; 6175 u64 first_dir_index = curr->index; 6176 u64 last_dir_index; 6177 bool deleted_items = false; 6178 int ret; 6179 6180 key.offset = curr->index; 6181 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 6182 if (ret < 0) { 6183 return ret; 6184 } else if (ret == 0) { 6185 ret = batch_delete_dir_index_items(trans, inode, path, ctx, 6186 delayed_del_list, curr, 6187 &last); 6188 if (ret) 6189 return ret; 6190 deleted_items = true; 6191 } 6192 6193 btrfs_release_path(path); 6194 6195 /* 6196 * If we deleted items from the leaf, it means we have a range 6197 * item logging their range, so no need to add one or update an 6198 * existing one. Otherwise we have to log a dir range item. 6199 */ 6200 if (deleted_items) 6201 goto next_batch; 6202 6203 last_dir_index = last->index; 6204 ASSERT(last_dir_index >= first_dir_index); 6205 /* 6206 * If this range starts right after where the previous one ends, 6207 * then we want to reuse the previous range item and change its 6208 * end offset to the end of this range. This is just to minimize 6209 * leaf space usage, by avoiding adding a new range item. 
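* For example (illustrative): if the previous iteration logged the range [5, 8] and this batch deletes indexes 9 to 12, we end up with a single range item covering [5, 12] instead of adding a second item for [9, 12].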
6210 */ 6211 if (last_range_end != 0 && first_dir_index == last_range_end + 1) 6212 first_dir_index = last_range_start; 6213 6214 ret = insert_dir_log_key(trans, log, path, key.objectid, 6215 first_dir_index, last_dir_index); 6216 if (ret) 6217 return ret; 6218 6219 last_range_start = first_dir_index; 6220 last_range_end = last_dir_index; 6221 next_batch: 6222 curr = list_next_entry(last, log_list); 6223 } 6224 6225 return 0; 6226 } 6227 6228 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, 6229 struct btrfs_inode *inode, 6230 struct btrfs_path *path, 6231 const struct list_head *delayed_del_list, 6232 struct btrfs_log_ctx *ctx) 6233 { 6234 /* 6235 * We are deleting dir index items from the log tree or adding range 6236 * items to it. 6237 */ 6238 lockdep_assert_held(&inode->log_mutex); 6239 6240 if (list_empty(delayed_del_list)) 6241 return 0; 6242 6243 if (ctx->logged_before) 6244 return log_delayed_deletions_incremental(trans, inode, path, 6245 delayed_del_list, ctx); 6246 6247 return log_delayed_deletions_full(trans, inode, path, delayed_del_list, 6248 ctx); 6249 } 6250 6251 /* 6252 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed 6253 * items instead of the subvolume tree. 6254 */ 6255 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, 6256 struct btrfs_inode *inode, 6257 const struct list_head *delayed_ins_list, 6258 struct btrfs_log_ctx *ctx) 6259 { 6260 const bool orig_log_new_dentries = ctx->log_new_dentries; 6261 struct btrfs_fs_info *fs_info = trans->fs_info; 6262 struct btrfs_delayed_item *item; 6263 int ret = 0; 6264 6265 /* 6266 * No need for the log mutex, plus to avoid potential deadlocks or 6267 * lockdep annotations due to nesting of delayed inode mutexes and log 6268 * mutexes. 6269 */ 6270 lockdep_assert_not_held(&inode->log_mutex); 6271 6272 ASSERT(!ctx->logging_new_delayed_dentries); 6273 ctx->logging_new_delayed_dentries = true; 6274 6275 list_for_each_entry(item, delayed_ins_list, log_list) { 6276 struct btrfs_dir_item *dir_item; 6277 struct inode *di_inode; 6278 struct btrfs_key key; 6279 int log_mode = LOG_INODE_EXISTS; 6280 6281 dir_item = (struct btrfs_dir_item *)item->data; 6282 btrfs_disk_key_to_cpu(&key, &dir_item->location); 6283 6284 if (key.type == BTRFS_ROOT_ITEM_KEY) 6285 continue; 6286 6287 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); 6288 if (IS_ERR(di_inode)) { 6289 ret = PTR_ERR(di_inode); 6290 break; 6291 } 6292 6293 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6294 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 6295 continue; 6296 } 6297 6298 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) 6299 log_mode = LOG_INODE_ALL; 6300 6301 ctx->log_new_dentries = false; 6302 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); 6303 6304 if (!ret && ctx->log_new_dentries) 6305 ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); 6306 6307 btrfs_add_delayed_iput(BTRFS_I(di_inode)); 6308 6309 if (ret) 6310 break; 6311 } 6312 6313 ctx->log_new_dentries = orig_log_new_dentries; 6314 ctx->logging_new_delayed_dentries = false; 6315 6316 return ret; 6317 } 6318 6319 /* log a single inode in the tree log. 6320 * At least one parent directory for this inode must exist in the tree 6321 * or be logged already. 6322 * 6323 * Any items from this inode changed by the current transaction are copied 6324 * to the log tree. 
An extra reference is taken on any extents in this 6325 * file, allowing us to avoid a whole pile of corner cases around logging 6326 * blocks that have been removed from the tree. 6327 * 6328 * See LOG_INODE_ALL and related defines for a description of what inode_only 6329 * does. 6330 * 6331 * This handles both files and directories. 6332 */ 6333 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 6334 struct btrfs_inode *inode, 6335 int inode_only, 6336 struct btrfs_log_ctx *ctx) 6337 { 6338 struct btrfs_path *path; 6339 struct btrfs_path *dst_path; 6340 struct btrfs_key min_key; 6341 struct btrfs_key max_key; 6342 struct btrfs_root *log = inode->root->log_root; 6343 int ret; 6344 bool fast_search = false; 6345 u64 ino = btrfs_ino(inode); 6346 struct extent_map_tree *em_tree = &inode->extent_tree; 6347 u64 logged_isize = 0; 6348 bool need_log_inode_item = true; 6349 bool xattrs_logged = false; 6350 bool inode_item_dropped = true; 6351 bool full_dir_logging = false; 6352 LIST_HEAD(delayed_ins_list); 6353 LIST_HEAD(delayed_del_list); 6354 6355 path = btrfs_alloc_path(); 6356 if (!path) 6357 return -ENOMEM; 6358 dst_path = btrfs_alloc_path(); 6359 if (!dst_path) { 6360 btrfs_free_path(path); 6361 return -ENOMEM; 6362 } 6363 6364 min_key.objectid = ino; 6365 min_key.type = BTRFS_INODE_ITEM_KEY; 6366 min_key.offset = 0; 6367 6368 max_key.objectid = ino; 6369 6370 6371 /* today the code can only do partial logging of directories */ 6372 if (S_ISDIR(inode->vfs_inode.i_mode) || 6373 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6374 &inode->runtime_flags) && 6375 inode_only >= LOG_INODE_EXISTS)) 6376 max_key.type = BTRFS_XATTR_ITEM_KEY; 6377 else 6378 max_key.type = (u8)-1; 6379 max_key.offset = (u64)-1; 6380 6381 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) 6382 full_dir_logging = true; 6383 6384 /* 6385 * If we are logging a directory while we are logging dentries of the 6386 * delayed items of some other inode, then we need to flush the delayed 6387 * items of this directory and not log the delayed items directly. This 6388 * is to prevent more than one level of recursion into btrfs_log_inode() 6389 * by having something like this: 6390 * 6391 * $ mkdir -p a/b/c/d/e/f/g/h/... 6392 * $ xfs_io -c "fsync" a 6393 * 6394 * Where all directories in the path did not exist before and are 6395 * created in the current transaction. 6396 * So in such a case we directly log the delayed items of the main 6397 * directory ("a") without flushing them first, while for each of its 6398 * subdirectories we flush their delayed items before logging them. 6399 * This prevents a potential unbounded recursion like this: 6400 * 6401 * btrfs_log_inode() 6402 * log_new_delayed_dentries() 6403 * btrfs_log_inode() 6404 * log_new_delayed_dentries() 6405 * btrfs_log_inode() 6406 * log_new_delayed_dentries() 6407 * (...) 6408 * 6409 * We have thresholds for the maximum number of delayed items to have in 6410 * memory, and once they are hit, the items are flushed asynchronously. 6411 * However the limit is quite high, so lets prevent deep levels of 6412 * recursion to happen by limiting the maximum depth to be 1. 
6413 */ 6414 if (full_dir_logging && ctx->logging_new_delayed_dentries) { 6415 ret = btrfs_commit_inode_delayed_items(trans, inode); 6416 if (ret) 6417 goto out; 6418 } 6419 6420 mutex_lock(&inode->log_mutex); 6421 6422 /* 6423 * For symlinks, we must always log their content, which is stored in an 6424 * inline extent, otherwise we could end up with an empty symlink after 6425 * log replay, which is invalid on Linux (symlink(2) returns -ENOENT if 6426 * one attempts to create an empty symlink). 6427 * We don't need to worry about flushing delalloc, because the inline 6428 * extent is created when the symlink is created (we never have delalloc 6429 * for symlinks). 6430 */ 6431 if (S_ISLNK(inode->vfs_inode.i_mode)) 6432 inode_only = LOG_INODE_ALL; 6433 6434 /* 6435 * Before logging the inode item, cache the value returned by 6436 * inode_logged(), because after that we need to figure out if 6437 * the inode was previously logged in this transaction. 6438 */ 6439 ret = inode_logged(trans, inode, path); 6440 if (ret < 0) 6441 goto out_unlock; 6442 ctx->logged_before = (ret == 1); 6443 ret = 0; 6444 6445 /* 6446 * This is for cases where logging a directory could result in losing 6447 * a file after replaying the log. For example, if we move a file from a 6448 * directory A to a directory B, then fsync directory A, we have no way 6449 * to know the file was moved from A to B, so logging just A would 6450 * result in losing the file after a log replay. 6451 */ 6452 if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { 6453 ret = BTRFS_LOG_FORCE_COMMIT; 6454 goto out_unlock; 6455 } 6456 6457 /* 6458 * A brute force approach to making sure we get the most up to date 6459 * copies of everything. 6460 */ 6461 if (S_ISDIR(inode->vfs_inode.i_mode)) { 6462 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 6463 if (ctx->logged_before) 6464 ret = drop_inode_items(trans, log, path, inode, 6465 BTRFS_XATTR_ITEM_KEY); 6466 } else { 6467 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 6468 /* 6469 * Make sure the new inode item we write to the log has 6470 * the same isize as the current one (if it exists). 6471 * This is necessary to prevent data loss after log 6472 * replay, and also to prevent doing a wrong expanding 6473 * truncate - e.g. create file, write 4K into offset 6474 * 0, fsync, write 4K into offset 4096, add hard link, 6475 * fsync some other file (to sync log), power fail - if 6476 * we use the inode's current i_size, after log replay 6477 * we get an 8K file, with the last 4K extent as a hole 6478 * (zeroes), as if an expanding truncate happened, 6479 * instead of getting a file of 4K only.
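* The same scenario written as a command sequence, in the style of the other examples in this file (illustrative): * * xfs_io -f -c "pwrite 0 4K" -c "fsync" foo * xfs_io -c "pwrite 4K 4K" foo * ln foo bar * xfs_io -c "fsync" other_file * <power failure>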
6480 */ 6481 ret = logged_inode_size(log, inode, path, &logged_isize); 6482 if (ret) 6483 goto out_unlock; 6484 } 6485 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6486 &inode->runtime_flags)) { 6487 if (inode_only == LOG_INODE_EXISTS) { 6488 max_key.type = BTRFS_XATTR_ITEM_KEY; 6489 if (ctx->logged_before) 6490 ret = drop_inode_items(trans, log, path, 6491 inode, max_key.type); 6492 } else { 6493 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6494 &inode->runtime_flags); 6495 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6496 &inode->runtime_flags); 6497 if (ctx->logged_before) 6498 ret = truncate_inode_items(trans, log, 6499 inode, 0, 0); 6500 } 6501 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6502 &inode->runtime_flags) || 6503 inode_only == LOG_INODE_EXISTS) { 6504 if (inode_only == LOG_INODE_ALL) 6505 fast_search = true; 6506 max_key.type = BTRFS_XATTR_ITEM_KEY; 6507 if (ctx->logged_before) 6508 ret = drop_inode_items(trans, log, path, inode, 6509 max_key.type); 6510 } else { 6511 if (inode_only == LOG_INODE_ALL) 6512 fast_search = true; 6513 inode_item_dropped = false; 6514 goto log_extents; 6515 } 6516 6517 } 6518 if (ret) 6519 goto out_unlock; 6520 6521 /* 6522 * If we are logging a directory in full mode, collect the delayed items 6523 * before iterating the subvolume tree, so that we don't miss any new 6524 * dir index items in case they get flushed while or right after we are 6525 * iterating the subvolume tree. 6526 */ 6527 if (full_dir_logging && !ctx->logging_new_delayed_dentries) 6528 btrfs_log_get_delayed_items(inode, &delayed_ins_list, 6529 &delayed_del_list); 6530 6531 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 6532 path, dst_path, logged_isize, 6533 inode_only, ctx, 6534 &need_log_inode_item); 6535 if (ret) 6536 goto out_unlock; 6537 6538 btrfs_release_path(path); 6539 btrfs_release_path(dst_path); 6540 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 6541 if (ret) 6542 goto out_unlock; 6543 xattrs_logged = true; 6544 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 6545 btrfs_release_path(path); 6546 btrfs_release_path(dst_path); 6547 ret = btrfs_log_holes(trans, inode, path); 6548 if (ret) 6549 goto out_unlock; 6550 } 6551 log_extents: 6552 btrfs_release_path(path); 6553 btrfs_release_path(dst_path); 6554 if (need_log_inode_item) { 6555 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 6556 if (ret) 6557 goto out_unlock; 6558 /* 6559 * If we are doing a fast fsync and the inode was logged before 6560 * in this transaction, we don't need to log the xattrs because 6561 * they were logged before. If xattrs were added, changed or 6562 * deleted since the last time we logged the inode, then we have 6563 * already logged them because the inode had the runtime flag 6564 * BTRFS_INODE_COPY_EVERYTHING set. 
6565 */ 6566 if (!xattrs_logged && inode->logged_trans < trans->transid) { 6567 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 6568 if (ret) 6569 goto out_unlock; 6570 btrfs_release_path(path); 6571 } 6572 } 6573 if (fast_search) { 6574 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 6575 if (ret) 6576 goto out_unlock; 6577 } else if (inode_only == LOG_INODE_ALL) { 6578 struct extent_map *em, *n; 6579 6580 write_lock(&em_tree->lock); 6581 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 6582 list_del_init(&em->list); 6583 write_unlock(&em_tree->lock); 6584 } 6585 6586 if (full_dir_logging) { 6587 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 6588 if (ret) 6589 goto out_unlock; 6590 ret = log_delayed_insertion_items(trans, inode, path, 6591 &delayed_ins_list, ctx); 6592 if (ret) 6593 goto out_unlock; 6594 ret = log_delayed_deletion_items(trans, inode, path, 6595 &delayed_del_list, ctx); 6596 if (ret) 6597 goto out_unlock; 6598 } 6599 6600 spin_lock(&inode->lock); 6601 inode->logged_trans = trans->transid; 6602 /* 6603 * Don't update last_log_commit if we logged that an inode exists. 6604 * We do this for three reasons: 6605 * 6606 * 1) We might have had buffered writes to this inode that were 6607 * flushed and had their ordered extents completed in this 6608 * transaction, but we did not previously log the inode with 6609 * LOG_INODE_ALL. Later the inode was evicted and after that 6610 * it was loaded again and this LOG_INODE_EXISTS log operation 6611 * happened. We must make sure that if an explicit fsync against 6612 * the inode is performed later, it logs the new extents, an 6613 * updated inode item, etc, and syncs the log. The same logic 6614 * applies to direct IO writes instead of buffered writes. 6615 * 6616 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 6617 * is logged with an i_size of 0 or whatever value was logged 6618 * before. If later the i_size of the inode is increased by a 6619 * truncate operation, the log is synced through an fsync of 6620 * some other inode and then finally an explicit fsync against 6621 * this inode is made, we must make sure this fsync logs the 6622 * inode with the new i_size, the hole between old i_size and 6623 * the new i_size, and syncs the log. 6624 * 6625 * 3) If we are logging that an ancestor inode exists as part of 6626 * logging a new name from a link or rename operation, don't update 6627 * its last_log_commit - otherwise if an explicit fsync is made 6628 * against an ancestor, the fsync considers the inode in the log 6629 * and doesn't sync the log, resulting in the ancestor missing after 6630 * a power failure unless the log was synced as part of an fsync 6631 * against any other unrelated inode. 6632 */ 6633 if (inode_only != LOG_INODE_EXISTS) 6634 inode->last_log_commit = inode->last_sub_trans; 6635 spin_unlock(&inode->lock); 6636 6637 /* 6638 * Reset the last_reflink_trans so that the next fsync does not need to 6639 * go through the slower path when logging extents and their checksums. 
6640 */ 6641 if (inode_only == LOG_INODE_ALL) 6642 inode->last_reflink_trans = 0; 6643 6644 out_unlock: 6645 mutex_unlock(&inode->log_mutex); 6646 out: 6647 btrfs_free_path(path); 6648 btrfs_free_path(dst_path); 6649 6650 if (ret) 6651 free_conflicting_inodes(ctx); 6652 else 6653 ret = log_conflicting_inodes(trans, inode->root, ctx); 6654 6655 if (full_dir_logging && !ctx->logging_new_delayed_dentries) { 6656 if (!ret) 6657 ret = log_new_delayed_dentries(trans, inode, 6658 &delayed_ins_list, ctx); 6659 6660 btrfs_log_put_delayed_items(inode, &delayed_ins_list, 6661 &delayed_del_list); 6662 } 6663 6664 return ret; 6665 } 6666 6667 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6668 struct btrfs_inode *inode, 6669 struct btrfs_log_ctx *ctx) 6670 { 6671 struct btrfs_fs_info *fs_info = trans->fs_info; 6672 int ret; 6673 struct btrfs_path *path; 6674 struct btrfs_key key; 6675 struct btrfs_root *root = inode->root; 6676 const u64 ino = btrfs_ino(inode); 6677 6678 path = btrfs_alloc_path(); 6679 if (!path) 6680 return -ENOMEM; 6681 path->skip_locking = 1; 6682 path->search_commit_root = 1; 6683 6684 key.objectid = ino; 6685 key.type = BTRFS_INODE_REF_KEY; 6686 key.offset = 0; 6687 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6688 if (ret < 0) 6689 goto out; 6690 6691 while (true) { 6692 struct extent_buffer *leaf = path->nodes[0]; 6693 int slot = path->slots[0]; 6694 u32 cur_offset = 0; 6695 u32 item_size; 6696 unsigned long ptr; 6697 6698 if (slot >= btrfs_header_nritems(leaf)) { 6699 ret = btrfs_next_leaf(root, path); 6700 if (ret < 0) 6701 goto out; 6702 else if (ret > 0) 6703 break; 6704 continue; 6705 } 6706 6707 btrfs_item_key_to_cpu(leaf, &key, slot); 6708 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6709 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6710 break; 6711 6712 item_size = btrfs_item_size(leaf, slot); 6713 ptr = btrfs_item_ptr_offset(leaf, slot); 6714 while (cur_offset < item_size) { 6715 struct btrfs_key inode_key; 6716 struct inode *dir_inode; 6717 6718 inode_key.type = BTRFS_INODE_ITEM_KEY; 6719 inode_key.offset = 0; 6720 6721 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6722 struct btrfs_inode_extref *extref; 6723 6724 extref = (struct btrfs_inode_extref *) 6725 (ptr + cur_offset); 6726 inode_key.objectid = btrfs_inode_extref_parent( 6727 leaf, extref); 6728 cur_offset += sizeof(*extref); 6729 cur_offset += btrfs_inode_extref_name_len(leaf, 6730 extref); 6731 } else { 6732 inode_key.objectid = key.offset; 6733 cur_offset = item_size; 6734 } 6735 6736 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6737 root); 6738 /* 6739 * If the parent inode was deleted, return an error to 6740 * fallback to a transaction commit. This is to prevent 6741 * getting an inode that was moved from one parent A to 6742 * a parent B, got its former parent A deleted and then 6743 * it got fsync'ed, from existing at both parents after 6744 * a log replay (and the old parent still existing). 6745 * Example: 6746 * 6747 * mkdir /mnt/A 6748 * mkdir /mnt/B 6749 * touch /mnt/B/bar 6750 * sync 6751 * mv /mnt/B/bar /mnt/A/bar 6752 * mv -T /mnt/A /mnt/B 6753 * fsync /mnt/B/bar 6754 * <power fail> 6755 * 6756 * If we ignore the old parent B which got deleted, 6757 * after a log replay we would have file bar linked 6758 * at both parents and the old parent B would still 6759 * exist. 
6760 */ 6761 if (IS_ERR(dir_inode)) { 6762 ret = PTR_ERR(dir_inode); 6763 goto out; 6764 } 6765 6766 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6767 btrfs_add_delayed_iput(BTRFS_I(dir_inode)); 6768 continue; 6769 } 6770 6771 ctx->log_new_dentries = false; 6772 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6773 LOG_INODE_ALL, ctx); 6774 if (!ret && ctx->log_new_dentries) 6775 ret = log_new_dir_dentries(trans, 6776 BTRFS_I(dir_inode), ctx); 6777 btrfs_add_delayed_iput(BTRFS_I(dir_inode)); 6778 if (ret) 6779 goto out; 6780 } 6781 path->slots[0]++; 6782 } 6783 ret = 0; 6784 out: 6785 btrfs_free_path(path); 6786 return ret; 6787 } 6788 6789 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6790 struct btrfs_root *root, 6791 struct btrfs_path *path, 6792 struct btrfs_log_ctx *ctx) 6793 { 6794 struct btrfs_key found_key; 6795 6796 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6797 6798 while (true) { 6799 struct btrfs_fs_info *fs_info = root->fs_info; 6800 struct extent_buffer *leaf; 6801 int slot; 6802 struct btrfs_key search_key; 6803 struct inode *inode; 6804 u64 ino; 6805 int ret = 0; 6806 6807 btrfs_release_path(path); 6808 6809 ino = found_key.offset; 6810 6811 search_key.objectid = found_key.offset; 6812 search_key.type = BTRFS_INODE_ITEM_KEY; 6813 search_key.offset = 0; 6814 inode = btrfs_iget(fs_info->sb, ino, root); 6815 if (IS_ERR(inode)) 6816 return PTR_ERR(inode); 6817 6818 if (BTRFS_I(inode)->generation >= trans->transid && 6819 need_log_inode(trans, BTRFS_I(inode))) 6820 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6821 LOG_INODE_EXISTS, ctx); 6822 btrfs_add_delayed_iput(BTRFS_I(inode)); 6823 if (ret) 6824 return ret; 6825 6826 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6827 break; 6828 6829 search_key.type = BTRFS_INODE_REF_KEY; 6830 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6831 if (ret < 0) 6832 return ret; 6833 6834 leaf = path->nodes[0]; 6835 slot = path->slots[0]; 6836 if (slot >= btrfs_header_nritems(leaf)) { 6837 ret = btrfs_next_leaf(root, path); 6838 if (ret < 0) 6839 return ret; 6840 else if (ret > 0) 6841 return -ENOENT; 6842 leaf = path->nodes[0]; 6843 slot = path->slots[0]; 6844 } 6845 6846 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6847 if (found_key.objectid != search_key.objectid || 6848 found_key.type != BTRFS_INODE_REF_KEY) 6849 return -ENOENT; 6850 } 6851 return 0; 6852 } 6853 6854 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6855 struct btrfs_inode *inode, 6856 struct dentry *parent, 6857 struct btrfs_log_ctx *ctx) 6858 { 6859 struct btrfs_root *root = inode->root; 6860 struct dentry *old_parent = NULL; 6861 struct super_block *sb = inode->vfs_inode.i_sb; 6862 int ret = 0; 6863 6864 while (true) { 6865 if (!parent || d_really_is_negative(parent) || 6866 sb != parent->d_sb) 6867 break; 6868 6869 inode = BTRFS_I(d_inode(parent)); 6870 if (root != inode->root) 6871 break; 6872 6873 if (inode->generation >= trans->transid && 6874 need_log_inode(trans, inode)) { 6875 ret = btrfs_log_inode(trans, inode, 6876 LOG_INODE_EXISTS, ctx); 6877 if (ret) 6878 break; 6879 } 6880 if (IS_ROOT(parent)) 6881 break; 6882 6883 parent = dget_parent(parent); 6884 dput(old_parent); 6885 old_parent = parent; 6886 } 6887 dput(old_parent); 6888 6889 return ret; 6890 } 6891 6892 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6893 struct btrfs_inode *inode, 6894 struct dentry *parent, 6895 struct btrfs_log_ctx *ctx) 6896 { 6897 struct btrfs_root *root = inode->root; 6898 
const u64 ino = btrfs_ino(inode); 6899 struct btrfs_path *path; 6900 struct btrfs_key search_key; 6901 int ret; 6902 6903 /* 6904 * For a single hard link case, go through a fast path that does not 6905 * need to iterate the fs/subvolume tree. 6906 */ 6907 if (inode->vfs_inode.i_nlink < 2) 6908 return log_new_ancestors_fast(trans, inode, parent, ctx); 6909 6910 path = btrfs_alloc_path(); 6911 if (!path) 6912 return -ENOMEM; 6913 6914 search_key.objectid = ino; 6915 search_key.type = BTRFS_INODE_REF_KEY; 6916 search_key.offset = 0; 6917 again: 6918 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6919 if (ret < 0) 6920 goto out; 6921 if (ret == 0) 6922 path->slots[0]++; 6923 6924 while (true) { 6925 struct extent_buffer *leaf = path->nodes[0]; 6926 int slot = path->slots[0]; 6927 struct btrfs_key found_key; 6928 6929 if (slot >= btrfs_header_nritems(leaf)) { 6930 ret = btrfs_next_leaf(root, path); 6931 if (ret < 0) 6932 goto out; 6933 else if (ret > 0) 6934 break; 6935 continue; 6936 } 6937 6938 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6939 if (found_key.objectid != ino || 6940 found_key.type > BTRFS_INODE_EXTREF_KEY) 6941 break; 6942 6943 /* 6944 * Don't deal with extended references because they are rare 6945 * cases and too complex to deal with (we would need to keep 6946 * track of which subitem we are processing for each item in 6947 * this loop, etc). So just return some error to fallback to 6948 * a transaction commit. 6949 */ 6950 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6951 ret = -EMLINK; 6952 goto out; 6953 } 6954 6955 /* 6956 * Logging ancestors needs to do more searches on the fs/subvol 6957 * tree, so it releases the path as needed to avoid deadlocks. 6958 * Keep track of the last inode ref key and resume from that key 6959 * after logging all new ancestors for the current hard link. 6960 */ 6961 memcpy(&search_key, &found_key, sizeof(search_key)); 6962 6963 ret = log_new_ancestors(trans, root, path, ctx); 6964 if (ret) 6965 goto out; 6966 btrfs_release_path(path); 6967 goto again; 6968 } 6969 ret = 0; 6970 out: 6971 btrfs_free_path(path); 6972 return ret; 6973 } 6974 6975 /* 6976 * helper function around btrfs_log_inode to make sure newly created 6977 * parent directories also end up in the log. A minimal inode and backref 6978 * only logging is done of any parent directories that are older than 6979 * the last committed transaction 6980 */ 6981 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6982 struct btrfs_inode *inode, 6983 struct dentry *parent, 6984 int inode_only, 6985 struct btrfs_log_ctx *ctx) 6986 { 6987 struct btrfs_root *root = inode->root; 6988 struct btrfs_fs_info *fs_info = root->fs_info; 6989 int ret = 0; 6990 bool log_dentries = false; 6991 6992 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6993 ret = BTRFS_LOG_FORCE_COMMIT; 6994 goto end_no_trans; 6995 } 6996 6997 if (btrfs_root_refs(&root->root_item) == 0) { 6998 ret = BTRFS_LOG_FORCE_COMMIT; 6999 goto end_no_trans; 7000 } 7001 7002 /* 7003 * Skip already logged inodes or inodes corresponding to tmpfiles 7004 * (since logging them is pointless, a link count of 0 means they 7005 * will never be accessible). 
7006 */ 7007 if ((btrfs_inode_in_log(inode, trans->transid) && 7008 list_empty(&ctx->ordered_extents)) || 7009 inode->vfs_inode.i_nlink == 0) { 7010 ret = BTRFS_NO_LOG_SYNC; 7011 goto end_no_trans; 7012 } 7013 7014 ret = start_log_trans(trans, root, ctx); 7015 if (ret) 7016 goto end_no_trans; 7017 7018 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 7019 if (ret) 7020 goto end_trans; 7021 7022 /* 7023 * for regular files, if its inode is already on disk, we don't 7024 * have to worry about the parents at all. This is because 7025 * we can use the last_unlink_trans field to record renames 7026 * and other fun in this file. 7027 */ 7028 if (S_ISREG(inode->vfs_inode.i_mode) && 7029 inode->generation < trans->transid && 7030 inode->last_unlink_trans < trans->transid) { 7031 ret = 0; 7032 goto end_trans; 7033 } 7034 7035 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 7036 log_dentries = true; 7037 7038 /* 7039 * On unlink we must make sure all our current and old parent directory 7040 * inodes are fully logged. This is to prevent leaving dangling 7041 * directory index entries in directories that were our parents but are 7042 * not anymore. Not doing this results in old parent directory being 7043 * impossible to delete after log replay (rmdir will always fail with 7044 * error -ENOTEMPTY). 7045 * 7046 * Example 1: 7047 * 7048 * mkdir testdir 7049 * touch testdir/foo 7050 * ln testdir/foo testdir/bar 7051 * sync 7052 * unlink testdir/bar 7053 * xfs_io -c fsync testdir/foo 7054 * <power failure> 7055 * mount fs, triggers log replay 7056 * 7057 * If we don't log the parent directory (testdir), after log replay the 7058 * directory still has an entry pointing to the file inode using the bar 7059 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 7060 * the file inode has a link count of 1. 7061 * 7062 * Example 2: 7063 * 7064 * mkdir testdir 7065 * touch foo 7066 * ln foo testdir/foo2 7067 * ln foo testdir/foo3 7068 * sync 7069 * unlink testdir/foo3 7070 * xfs_io -c fsync foo 7071 * <power failure> 7072 * mount fs, triggers log replay 7073 * 7074 * Similar as the first example, after log replay the parent directory 7075 * testdir still has an entry pointing to the inode file with name foo3 7076 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 7077 * and has a link count of 2. 7078 */ 7079 if (inode->last_unlink_trans >= trans->transid) { 7080 ret = btrfs_log_all_parents(trans, inode, ctx); 7081 if (ret) 7082 goto end_trans; 7083 } 7084 7085 ret = log_all_new_ancestors(trans, inode, parent, ctx); 7086 if (ret) 7087 goto end_trans; 7088 7089 if (log_dentries) 7090 ret = log_new_dir_dentries(trans, inode, ctx); 7091 else 7092 ret = 0; 7093 end_trans: 7094 if (ret < 0) { 7095 btrfs_set_log_full_commit(trans); 7096 ret = BTRFS_LOG_FORCE_COMMIT; 7097 } 7098 7099 if (ret) 7100 btrfs_remove_log_ctx(root, ctx); 7101 btrfs_end_log_trans(root); 7102 end_no_trans: 7103 return ret; 7104 } 7105 7106 /* 7107 * it is not safe to log dentry if the chunk root has added new 7108 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 7109 * If this returns 1, you must commit the transaction to safely get your 7110 * data on disk. 
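* (Callers, such as the fsync path, are expected to fall back to a full transaction commit when that happens.)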
7111 */ 7112 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 7113 struct dentry *dentry, 7114 struct btrfs_log_ctx *ctx) 7115 { 7116 struct dentry *parent = dget_parent(dentry); 7117 int ret; 7118 7119 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 7120 LOG_INODE_ALL, ctx); 7121 dput(parent); 7122 7123 return ret; 7124 } 7125 7126 /* 7127 * Should be called during mount to recover and replay any log trees 7128 * from the FS. 7129 */ 7130 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 7131 { 7132 int ret; 7133 struct btrfs_path *path; 7134 struct btrfs_trans_handle *trans; 7135 struct btrfs_key key; 7136 struct btrfs_key found_key; 7137 struct btrfs_root *log; 7138 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 7139 struct walk_control wc = { 7140 .process_func = process_one_buffer, 7141 .stage = LOG_WALK_PIN_ONLY, 7142 }; 7143 7144 path = btrfs_alloc_path(); 7145 if (!path) 7146 return -ENOMEM; 7147 7148 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7149 7150 trans = btrfs_start_transaction(fs_info->tree_root, 0); 7151 if (IS_ERR(trans)) { 7152 ret = PTR_ERR(trans); 7153 goto error; 7154 } 7155 7156 wc.trans = trans; 7157 wc.pin = 1; 7158 7159 ret = walk_log_tree(trans, log_root_tree, &wc); 7160 if (ret) { 7161 btrfs_abort_transaction(trans, ret); 7162 goto error; 7163 } 7164 7165 again: 7166 key.objectid = BTRFS_TREE_LOG_OBJECTID; 7167 key.offset = (u64)-1; 7168 key.type = BTRFS_ROOT_ITEM_KEY; 7169 7170 while (1) { 7171 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 7172 7173 if (ret < 0) { 7174 btrfs_abort_transaction(trans, ret); 7175 goto error; 7176 } 7177 if (ret > 0) { 7178 if (path->slots[0] == 0) 7179 break; 7180 path->slots[0]--; 7181 } 7182 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 7183 path->slots[0]); 7184 btrfs_release_path(path); 7185 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 7186 break; 7187 7188 log = btrfs_read_tree_root(log_root_tree, &found_key); 7189 if (IS_ERR(log)) { 7190 ret = PTR_ERR(log); 7191 btrfs_abort_transaction(trans, ret); 7192 goto error; 7193 } 7194 7195 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 7196 true); 7197 if (IS_ERR(wc.replay_dest)) { 7198 ret = PTR_ERR(wc.replay_dest); 7199 7200 /* 7201 * We didn't find the subvol, likely because it was 7202 * deleted. This is ok, simply skip this log and go to 7203 * the next one. 7204 * 7205 * We need to exclude the root because we can't have 7206 * other log replays overwriting this log as we'll read 7207 * it back in a few more times. This will keep our 7208 * block from being modified, and we'll just bail for 7209 * each subsequent pass.
7210 */ 7211 if (ret == -ENOENT) 7212 ret = btrfs_pin_extent_for_log_replay(trans, 7213 log->node->start, 7214 log->node->len); 7215 btrfs_put_root(log); 7216 7217 if (!ret) 7218 goto next; 7219 btrfs_abort_transaction(trans, ret); 7220 goto error; 7221 } 7222 7223 wc.replay_dest->log_root = log; 7224 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 7225 if (ret) 7226 /* The loop needs to continue due to the root refs */ 7227 btrfs_abort_transaction(trans, ret); 7228 else 7229 ret = walk_log_tree(trans, log, &wc); 7230 7231 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7232 ret = fixup_inode_link_counts(trans, wc.replay_dest, 7233 path); 7234 if (ret) 7235 btrfs_abort_transaction(trans, ret); 7236 } 7237 7238 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7239 struct btrfs_root *root = wc.replay_dest; 7240 7241 btrfs_release_path(path); 7242 7243 /* 7244 * We have just replayed everything, and the highest 7245 * objectid of fs roots probably has changed in case 7246 * some inode_item's got replayed. 7247 * 7248 * root->objectid_mutex is not acquired as log replay 7249 * could only happen during mount. 7250 */ 7251 ret = btrfs_init_root_free_objectid(root); 7252 if (ret) 7253 btrfs_abort_transaction(trans, ret); 7254 } 7255 7256 wc.replay_dest->log_root = NULL; 7257 btrfs_put_root(wc.replay_dest); 7258 btrfs_put_root(log); 7259 7260 if (ret) 7261 goto error; 7262 next: 7263 if (found_key.offset == 0) 7264 break; 7265 key.offset = found_key.offset - 1; 7266 } 7267 btrfs_release_path(path); 7268 7269 /* step one is to pin it all, step two is to replay just inodes */ 7270 if (wc.pin) { 7271 wc.pin = 0; 7272 wc.process_func = replay_one_buffer; 7273 wc.stage = LOG_WALK_REPLAY_INODES; 7274 goto again; 7275 } 7276 /* step three is to replay everything */ 7277 if (wc.stage < LOG_WALK_REPLAY_ALL) { 7278 wc.stage++; 7279 goto again; 7280 } 7281 7282 btrfs_free_path(path); 7283 7284 /* step 4: commit the transaction, which also unpins the blocks */ 7285 ret = btrfs_commit_transaction(trans); 7286 if (ret) 7287 return ret; 7288 7289 log_root_tree->log_root = NULL; 7290 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7291 btrfs_put_root(log_root_tree); 7292 7293 return 0; 7294 error: 7295 if (wc.trans) 7296 btrfs_end_transaction(wc.trans); 7297 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7298 btrfs_free_path(path); 7299 return ret; 7300 } 7301 7302 /* 7303 * there are some corner cases where we want to force a full 7304 * commit instead of allowing a directory to be logged. 7305 * 7306 * They revolve around files there were unlinked from the directory, and 7307 * this function updates the parent directory so that a full commit is 7308 * properly done if it is fsync'd later after the unlinks are done. 7309 * 7310 * Must be called before the unlink operations (updates to the subvolume tree, 7311 * inodes, etc) are done. 7312 */ 7313 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 7314 struct btrfs_inode *dir, struct btrfs_inode *inode, 7315 bool for_rename) 7316 { 7317 /* 7318 * when we're logging a file, if it hasn't been renamed 7319 * or unlinked, and its inode is fully committed on disk, 7320 * we don't have to worry about walking up the directory chain 7321 * to log its parents. 7322 * 7323 * So, we use the last_unlink_trans field to put this transid 7324 * into the file. When the file is logged we check it and 7325 * don't log the parents if the file is fully on disk. 
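* (These are the last_unlink_trans checks done in btrfs_log_inode_parent(), which decide whether the parent directories need to be logged via btrfs_log_all_parents().)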
7326 */ 7327 mutex_lock(&inode->log_mutex); 7328 inode->last_unlink_trans = trans->transid; 7329 mutex_unlock(&inode->log_mutex); 7330 7331 if (!for_rename) 7332 return; 7333 7334 /* 7335 * If this directory was already logged, any new names will be logged 7336 * with btrfs_log_new_name() and old names will be deleted from the log 7337 * tree with btrfs_del_dir_entries_in_log() or with 7338 * btrfs_del_inode_ref_in_log(). 7339 */ 7340 if (inode_logged(trans, dir, NULL) == 1) 7341 return; 7342 7343 /* 7344 * If the inode we're about to unlink was logged before, the log will be 7345 * properly updated with the new name with btrfs_log_new_name() and the 7346 * old name removed with btrfs_del_dir_entries_in_log() or with 7347 * btrfs_del_inode_ref_in_log(). 7348 */ 7349 if (inode_logged(trans, inode, NULL) == 1) 7350 return; 7351 7352 /* 7353 * When renaming files across directories, if the directory 7354 * we're unlinking from gets fsync'd later on, there's 7355 * no way to find the destination directory later and fsync it 7356 * properly. So, we have to be conservative and force commits 7357 * so the new name gets discovered. 7358 */ 7359 mutex_lock(&dir->log_mutex); 7360 dir->last_unlink_trans = trans->transid; 7361 mutex_unlock(&dir->log_mutex); 7362 } 7363 7364 /* 7365 * Make sure that if someone attempts to fsync the parent directory of a deleted 7366 * snapshot, it ends up triggering a transaction commit. This is to guarantee 7367 * that after replaying the log tree of the parent directory's root we will not 7368 * see the snapshot anymore and at log replay time we will not see any log tree 7369 * corresponding to the deleted snapshot's root, which could lead to replaying 7370 * it after replaying the log tree of the parent directory (which would replay 7371 * the snapshot delete operation). 7372 * 7373 * Must be called before the actual snapshot destroy operation (updates to the 7374 * parent root and the tree of tree roots, etc) is done. 7375 */ 7376 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 7377 struct btrfs_inode *dir) 7378 { 7379 mutex_lock(&dir->log_mutex); 7380 dir->last_unlink_trans = trans->transid; 7381 mutex_unlock(&dir->log_mutex); 7382 } 7383 7384 /* 7385 * Update the log after adding a new name for an inode. 7386 * 7387 * @trans: Transaction handle. 7388 * @old_dentry: The dentry associated with the old name and the old 7389 * parent directory. 7390 * @old_dir: The inode of the previous parent directory for the case 7391 * of a rename. For a link operation, it must be NULL. 7392 * @old_dir_index: The index number associated with the old name, meaningful 7393 * only for rename operations (when @old_dir is not NULL). 7394 * Ignored for link operations. 7395 * @parent: The dentry associated with the directory under which the 7396 * new name is located. 7397 * 7398 * Call this after adding a new name for an inode, as a result of a link or 7399 * rename operation, and it will properly update the log to reflect the new name.
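* For example (illustrative): for a rename such as "mv dir_a/foo dir_b/foo", @old_dentry refers to the old "foo" name, @old_dir is dir_a's inode, @old_dir_index is the dir index of the old entry and @parent is dir_b's dentry. For a link operation, @old_dir must be NULL, as described above.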
7400 */ 7401 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 7402 struct dentry *old_dentry, struct btrfs_inode *old_dir, 7403 u64 old_dir_index, struct dentry *parent) 7404 { 7405 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 7406 struct btrfs_root *root = inode->root; 7407 struct btrfs_log_ctx ctx; 7408 bool log_pinned = false; 7409 int ret; 7410 7411 /* 7412 * this will force the logging code to walk the dentry chain 7413 * up for the file 7414 */ 7415 if (!S_ISDIR(inode->vfs_inode.i_mode)) 7416 inode->last_unlink_trans = trans->transid; 7417 7418 /* 7419 * if this inode hasn't been logged and directory we're renaming it 7420 * from hasn't been logged, we don't need to log it 7421 */ 7422 ret = inode_logged(trans, inode, NULL); 7423 if (ret < 0) { 7424 goto out; 7425 } else if (ret == 0) { 7426 if (!old_dir) 7427 return; 7428 /* 7429 * If the inode was not logged and we are doing a rename (old_dir is not 7430 * NULL), check if old_dir was logged - if it was not we can return and 7431 * do nothing. 7432 */ 7433 ret = inode_logged(trans, old_dir, NULL); 7434 if (ret < 0) 7435 goto out; 7436 else if (ret == 0) 7437 return; 7438 } 7439 ret = 0; 7440 7441 /* 7442 * If we are doing a rename (old_dir is not NULL) from a directory that 7443 * was previously logged, make sure that on log replay we get the old 7444 * dir entry deleted. This is needed because we will also log the new 7445 * name of the renamed inode, so we need to make sure that after log 7446 * replay we don't end up with both the new and old dir entries existing. 7447 */ 7448 if (old_dir && old_dir->logged_trans == trans->transid) { 7449 struct btrfs_root *log = old_dir->root->log_root; 7450 struct btrfs_path *path; 7451 struct fscrypt_name fname; 7452 7453 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7454 7455 ret = fscrypt_setup_filename(&old_dir->vfs_inode, 7456 &old_dentry->d_name, 0, &fname); 7457 if (ret) 7458 goto out; 7459 /* 7460 * We have two inodes to update in the log, the old directory and 7461 * the inode that got renamed, so we must pin the log to prevent 7462 * anyone from syncing the log until we have updated both inodes 7463 * in the log. 7464 */ 7465 ret = join_running_log_trans(root); 7466 /* 7467 * At least one of the inodes was logged before, so this should 7468 * not fail, but if it does, it's not serious, just bail out and 7469 * mark the log for a full commit. 7470 */ 7471 if (WARN_ON_ONCE(ret < 0)) { 7472 fscrypt_free_filename(&fname); 7473 goto out; 7474 } 7475 7476 log_pinned = true; 7477 7478 path = btrfs_alloc_path(); 7479 if (!path) { 7480 ret = -ENOMEM; 7481 fscrypt_free_filename(&fname); 7482 goto out; 7483 } 7484 7485 /* 7486 * Other concurrent task might be logging the old directory, 7487 * as it can be triggered when logging other inode that had or 7488 * still has a dentry in the old directory. We lock the old 7489 * directory's log_mutex to ensure the deletion of the old 7490 * name is persisted, because during directory logging we 7491 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of 7492 * the old name's dir index item is in the delayed items, so 7493 * it could be missed by an in progress directory logging. 7494 */ 7495 mutex_lock(&old_dir->log_mutex); 7496 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7497 &fname.disk_name, old_dir_index); 7498 if (ret > 0) { 7499 /* 7500 * The dentry does not exist in the log, so record its 7501 * deletion. 
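* (The range item spans just the old index, so log replay still deletes the old entry even though the entry itself was never copied into the log.)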
7502 */ 7503 btrfs_release_path(path); 7504 ret = insert_dir_log_key(trans, log, path, 7505 btrfs_ino(old_dir), 7506 old_dir_index, old_dir_index); 7507 } 7508 mutex_unlock(&old_dir->log_mutex); 7509 7510 btrfs_free_path(path); 7511 fscrypt_free_filename(&fname); 7512 if (ret < 0) 7513 goto out; 7514 } 7515 7516 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7517 ctx.logging_new_name = true; 7518 /* 7519 * We don't care about the return value. If we fail to log the new name 7520 * then we know the next attempt to sync the log will fallback to a full 7521 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7522 * we don't need to worry about getting a log committed that has an 7523 * inconsistent state after a rename operation. 7524 */ 7525 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7526 ASSERT(list_empty(&ctx.conflict_inodes)); 7527 out: 7528 /* 7529 * If an error happened mark the log for a full commit because it's not 7530 * consistent and up to date or we couldn't find out if one of the 7531 * inodes was logged before in this transaction. Do it before unpinning 7532 * the log, to avoid any races with someone else trying to commit it. 7533 */ 7534 if (ret < 0) 7535 btrfs_set_log_full_commit(trans); 7536 if (log_pinned) 7537 btrfs_end_log_trans(root); 7538 } 7539 7540