1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/blkdev.h> 9 #include <linux/list_sort.h> 10 #include <linux/iversion.h> 11 #include "misc.h" 12 #include "ctree.h" 13 #include "tree-log.h" 14 #include "disk-io.h" 15 #include "locking.h" 16 #include "print-tree.h" 17 #include "backref.h" 18 #include "compression.h" 19 #include "qgroup.h" 20 #include "block-group.h" 21 #include "space-info.h" 22 #include "zoned.h" 23 #include "inode-item.h" 24 25 #define MAX_CONFLICT_INODES 10 26 27 /* magic values for the inode_only field in btrfs_log_inode: 28 * 29 * LOG_INODE_ALL means to log everything 30 * LOG_INODE_EXISTS means to log just enough to recreate the inode 31 * during log replay 32 */ 33 enum { 34 LOG_INODE_ALL, 35 LOG_INODE_EXISTS, 36 }; 37 38 /* 39 * directory trouble cases 40 * 41 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 42 * log, we must force a full commit before doing an fsync of the directory 43 * where the unlink was done. 44 * ---> record transid of last unlink/rename per directory 45 * 46 * mkdir foo/some_dir 47 * normal commit 48 * rename foo/some_dir foo2/some_dir 49 * mkdir foo/some_dir 50 * fsync foo/some_dir/some_file 51 * 52 * The fsync above will unlink the original some_dir without recording 53 * it in its new location (foo2). After a crash, some_dir will be gone 54 * unless the fsync of some_file forces a full commit 55 * 56 * 2) we must log any new names for any file or dir that is in the fsync 57 * log. ---> check inode while renaming/linking. 58 * 59 * 2a) we must log any new names for any file or dir during rename 60 * when the directory they are being removed from was logged. 61 * ---> check inode and old parent dir during rename 62 * 63 * 2a is actually the more important variant. With the extra logging 64 * a crash might unlink the old name without recreating the new one 65 * 66 * 3) after a crash, we must go through any directories with a link count 67 * of zero and redo the rm -rf 68 * 69 * mkdir f1/foo 70 * normal commit 71 * rm -rf f1/foo 72 * fsync(f1) 73 * 74 * The directory f1 was fully removed from the FS, but fsync was never 75 * called on f1, only its parent dir. After a crash the rm -rf must 76 * be replayed. This must be able to recurse down the entire 77 * directory tree. The inode link count fixup code takes care of the 78 * ugly details. 79 */ 80 81 /* 82 * stages for the tree walking. The first 83 * stage (0) is to only pin down the blocks we find 84 * the second stage (1) is to make sure that all the inodes 85 * we find in the log are created in the subvolume. 
86 * 87 * The last stage is to deal with directories and links and extents 88 * and all the other fun semantics 89 */ 90 enum { 91 LOG_WALK_PIN_ONLY, 92 LOG_WALK_REPLAY_INODES, 93 LOG_WALK_REPLAY_DIR_INDEX, 94 LOG_WALK_REPLAY_ALL, 95 }; 96 97 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 98 struct btrfs_inode *inode, 99 int inode_only, 100 struct btrfs_log_ctx *ctx); 101 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 102 struct btrfs_root *root, 103 struct btrfs_path *path, u64 objectid); 104 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 105 struct btrfs_root *root, 106 struct btrfs_root *log, 107 struct btrfs_path *path, 108 u64 dirid, int del_all); 109 static void wait_log_commit(struct btrfs_root *root, int transid); 110 111 /* 112 * tree logging is a special write ahead log used to make sure that 113 * fsyncs and O_SYNCs can happen without doing full tree commits. 114 * 115 * Full tree commits are expensive because they require commonly 116 * modified blocks to be recowed, creating many dirty pages in the 117 * extent tree an 4x-6x higher write load than ext3. 118 * 119 * Instead of doing a tree commit on every fsync, we use the 120 * key ranges and transaction ids to find items for a given file or directory 121 * that have changed in this transaction. Those items are copied into 122 * a special tree (one per subvolume root), that tree is written to disk 123 * and then the fsync is considered complete. 124 * 125 * After a crash, items are copied out of the log-tree back into the 126 * subvolume tree. Any file data extents found are recorded in the extent 127 * allocation tree, and the log-tree freed. 128 * 129 * The log tree is read three times, once to pin down all the extents it is 130 * using in ram and once, once to create all the inodes logged in the tree 131 * and once to do all the other items. 132 */ 133 134 /* 135 * start a sub transaction and setup the log tree 136 * this increments the log tree writer count to make the people 137 * syncing the tree wait for us to finish 138 */ 139 static int start_log_trans(struct btrfs_trans_handle *trans, 140 struct btrfs_root *root, 141 struct btrfs_log_ctx *ctx) 142 { 143 struct btrfs_fs_info *fs_info = root->fs_info; 144 struct btrfs_root *tree_root = fs_info->tree_root; 145 const bool zoned = btrfs_is_zoned(fs_info); 146 int ret = 0; 147 bool created = false; 148 149 /* 150 * First check if the log root tree was already created. If not, create 151 * it before locking the root's log_mutex, just to keep lockdep happy. 
152 */ 153 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { 154 mutex_lock(&tree_root->log_mutex); 155 if (!fs_info->log_root_tree) { 156 ret = btrfs_init_log_root_tree(trans, fs_info); 157 if (!ret) { 158 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); 159 created = true; 160 } 161 } 162 mutex_unlock(&tree_root->log_mutex); 163 if (ret) 164 return ret; 165 } 166 167 mutex_lock(&root->log_mutex); 168 169 again: 170 if (root->log_root) { 171 int index = (root->log_transid + 1) % 2; 172 173 if (btrfs_need_log_full_commit(trans)) { 174 ret = BTRFS_LOG_FORCE_COMMIT; 175 goto out; 176 } 177 178 if (zoned && atomic_read(&root->log_commit[index])) { 179 wait_log_commit(root, root->log_transid - 1); 180 goto again; 181 } 182 183 if (!root->log_start_pid) { 184 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 185 root->log_start_pid = current->pid; 186 } else if (root->log_start_pid != current->pid) { 187 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 188 } 189 } else { 190 /* 191 * This means fs_info->log_root_tree was already created 192 * for some other FS trees. Do the full commit not to mix 193 * nodes from multiple log transactions to do sequential 194 * writing. 195 */ 196 if (zoned && !created) { 197 ret = BTRFS_LOG_FORCE_COMMIT; 198 goto out; 199 } 200 201 ret = btrfs_add_log_tree(trans, root); 202 if (ret) 203 goto out; 204 205 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 206 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 207 root->log_start_pid = current->pid; 208 } 209 210 atomic_inc(&root->log_writers); 211 if (!ctx->logging_new_name) { 212 int index = root->log_transid % 2; 213 list_add_tail(&ctx->list, &root->log_ctxs[index]); 214 ctx->log_transid = root->log_transid; 215 } 216 217 out: 218 mutex_unlock(&root->log_mutex); 219 return ret; 220 } 221 222 /* 223 * returns 0 if there was a log transaction running and we were able 224 * to join, or returns -ENOENT if there were not transactions 225 * in progress 226 */ 227 static int join_running_log_trans(struct btrfs_root *root) 228 { 229 const bool zoned = btrfs_is_zoned(root->fs_info); 230 int ret = -ENOENT; 231 232 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) 233 return ret; 234 235 mutex_lock(&root->log_mutex); 236 again: 237 if (root->log_root) { 238 int index = (root->log_transid + 1) % 2; 239 240 ret = 0; 241 if (zoned && atomic_read(&root->log_commit[index])) { 242 wait_log_commit(root, root->log_transid - 1); 243 goto again; 244 } 245 atomic_inc(&root->log_writers); 246 } 247 mutex_unlock(&root->log_mutex); 248 return ret; 249 } 250 251 /* 252 * This either makes the current running log transaction wait 253 * until you call btrfs_end_log_trans() or it makes any future 254 * log transactions wait until you call btrfs_end_log_trans() 255 */ 256 void btrfs_pin_log_trans(struct btrfs_root *root) 257 { 258 atomic_inc(&root->log_writers); 259 } 260 261 /* 262 * indicate we're done making changes to the log tree 263 * and wake up anyone waiting to do a sync 264 */ 265 void btrfs_end_log_trans(struct btrfs_root *root) 266 { 267 if (atomic_dec_and_test(&root->log_writers)) { 268 /* atomic_dec_and_test implies a barrier */ 269 cond_wake_up_nomb(&root->log_writer_wait); 270 } 271 } 272 273 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 274 { 275 filemap_fdatawait_range(buf->pages[0]->mapping, 276 buf->start, buf->start + buf->len - 1); 277 } 278 279 /* 280 * the walk control struct is used to pass state down the chain when 281 * processing the log tree. 
The stage field tells us which part 282 * of the log tree processing we are currently doing. The others 283 * are state fields used for that specific part 284 */ 285 struct walk_control { 286 /* should we free the extent on disk when done? This is used 287 * at transaction commit time while freeing a log tree 288 */ 289 int free; 290 291 /* pin only walk, we record which extents on disk belong to the 292 * log trees 293 */ 294 int pin; 295 296 /* what stage of the replay code we're currently in */ 297 int stage; 298 299 /* 300 * Ignore any items from the inode currently being processed. Needs 301 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in 302 * the LOG_WALK_REPLAY_INODES stage. 303 */ 304 bool ignore_cur_inode; 305 306 /* the root we are currently replaying */ 307 struct btrfs_root *replay_dest; 308 309 /* the trans handle for the current replay */ 310 struct btrfs_trans_handle *trans; 311 312 /* the function that gets used to process blocks we find in the 313 * tree. Note the extent_buffer might not be up to date when it is 314 * passed in, and it must be checked or read if you need the data 315 * inside it 316 */ 317 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 318 struct walk_control *wc, u64 gen, int level); 319 }; 320 321 /* 322 * process_func used to pin down extents, write them or wait on them 323 */ 324 static int process_one_buffer(struct btrfs_root *log, 325 struct extent_buffer *eb, 326 struct walk_control *wc, u64 gen, int level) 327 { 328 struct btrfs_fs_info *fs_info = log->fs_info; 329 int ret = 0; 330 331 /* 332 * If this fs is mixed then we need to be able to process the leaves to 333 * pin down any logged extents, so we have to read the block. 334 */ 335 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 336 ret = btrfs_read_extent_buffer(eb, gen, level, NULL); 337 if (ret) 338 return ret; 339 } 340 341 if (wc->pin) { 342 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, 343 eb->len); 344 if (ret) 345 return ret; 346 347 if (btrfs_buffer_uptodate(eb, gen, 0) && 348 btrfs_header_level(eb) == 0) 349 ret = btrfs_exclude_logged_extents(eb); 350 } 351 return ret; 352 } 353 354 static int do_overwrite_item(struct btrfs_trans_handle *trans, 355 struct btrfs_root *root, 356 struct btrfs_path *path, 357 struct extent_buffer *eb, int slot, 358 struct btrfs_key *key) 359 { 360 int ret; 361 u32 item_size; 362 u64 saved_i_size = 0; 363 int save_old_i_size = 0; 364 unsigned long src_ptr; 365 unsigned long dst_ptr; 366 int overwrite_root = 0; 367 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 368 369 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 370 overwrite_root = 1; 371 372 item_size = btrfs_item_size(eb, slot); 373 src_ptr = btrfs_item_ptr_offset(eb, slot); 374 375 /* Our caller must have done a search for the key for us. 
*/ 376 ASSERT(path->nodes[0] != NULL); 377 378 /* 379 * And the slot must point to the exact key or the slot where the key 380 * should be at (the first item with a key greater than 'key') 381 */ 382 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 383 struct btrfs_key found_key; 384 385 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 386 ret = btrfs_comp_cpu_keys(&found_key, key); 387 ASSERT(ret >= 0); 388 } else { 389 ret = 1; 390 } 391 392 if (ret == 0) { 393 char *src_copy; 394 char *dst_copy; 395 u32 dst_size = btrfs_item_size(path->nodes[0], 396 path->slots[0]); 397 if (dst_size != item_size) 398 goto insert; 399 400 if (item_size == 0) { 401 btrfs_release_path(path); 402 return 0; 403 } 404 dst_copy = kmalloc(item_size, GFP_NOFS); 405 src_copy = kmalloc(item_size, GFP_NOFS); 406 if (!dst_copy || !src_copy) { 407 btrfs_release_path(path); 408 kfree(dst_copy); 409 kfree(src_copy); 410 return -ENOMEM; 411 } 412 413 read_extent_buffer(eb, src_copy, src_ptr, item_size); 414 415 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 416 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 417 item_size); 418 ret = memcmp(dst_copy, src_copy, item_size); 419 420 kfree(dst_copy); 421 kfree(src_copy); 422 /* 423 * they have the same contents, just return, this saves 424 * us from cowing blocks in the destination tree and doing 425 * extra writes that may not have been done by a previous 426 * sync 427 */ 428 if (ret == 0) { 429 btrfs_release_path(path); 430 return 0; 431 } 432 433 /* 434 * We need to load the old nbytes into the inode so when we 435 * replay the extents we've logged we get the right nbytes. 436 */ 437 if (inode_item) { 438 struct btrfs_inode_item *item; 439 u64 nbytes; 440 u32 mode; 441 442 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 443 struct btrfs_inode_item); 444 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 445 item = btrfs_item_ptr(eb, slot, 446 struct btrfs_inode_item); 447 btrfs_set_inode_nbytes(eb, item, nbytes); 448 449 /* 450 * If this is a directory we need to reset the i_size to 451 * 0 so that we can set it up properly when replaying 452 * the rest of the items in this log. 453 */ 454 mode = btrfs_inode_mode(eb, item); 455 if (S_ISDIR(mode)) 456 btrfs_set_inode_size(eb, item, 0); 457 } 458 } else if (inode_item) { 459 struct btrfs_inode_item *item; 460 u32 mode; 461 462 /* 463 * New inode, set nbytes to 0 so that the nbytes comes out 464 * properly when we replay the extents. 465 */ 466 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 467 btrfs_set_inode_nbytes(eb, item, 0); 468 469 /* 470 * If this is a directory we need to reset the i_size to 0 so 471 * that we can set it up properly when replaying the rest of 472 * the items in this log. 
473 */ 474 mode = btrfs_inode_mode(eb, item); 475 if (S_ISDIR(mode)) 476 btrfs_set_inode_size(eb, item, 0); 477 } 478 insert: 479 btrfs_release_path(path); 480 /* try to insert the key into the destination tree */ 481 path->skip_release_on_error = 1; 482 ret = btrfs_insert_empty_item(trans, root, path, 483 key, item_size); 484 path->skip_release_on_error = 0; 485 486 /* make sure any existing item is the correct size */ 487 if (ret == -EEXIST || ret == -EOVERFLOW) { 488 u32 found_size; 489 found_size = btrfs_item_size(path->nodes[0], 490 path->slots[0]); 491 if (found_size > item_size) 492 btrfs_truncate_item(path, item_size, 1); 493 else if (found_size < item_size) 494 btrfs_extend_item(path, item_size - found_size); 495 } else if (ret) { 496 return ret; 497 } 498 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 499 path->slots[0]); 500 501 /* don't overwrite an existing inode if the generation number 502 * was logged as zero. This is done when the tree logging code 503 * is just logging an inode to make sure it exists after recovery. 504 * 505 * Also, don't overwrite i_size on directories during replay. 506 * log replay inserts and removes directory items based on the 507 * state of the tree found in the subvolume, and i_size is modified 508 * as it goes 509 */ 510 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 511 struct btrfs_inode_item *src_item; 512 struct btrfs_inode_item *dst_item; 513 514 src_item = (struct btrfs_inode_item *)src_ptr; 515 dst_item = (struct btrfs_inode_item *)dst_ptr; 516 517 if (btrfs_inode_generation(eb, src_item) == 0) { 518 struct extent_buffer *dst_eb = path->nodes[0]; 519 const u64 ino_size = btrfs_inode_size(eb, src_item); 520 521 /* 522 * For regular files an ino_size == 0 is used only when 523 * logging that an inode exists, as part of a directory 524 * fsync, and the inode wasn't fsynced before. In this 525 * case don't set the size of the inode in the fs/subvol 526 * tree, otherwise we would be throwing valid data away. 527 */ 528 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 529 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 530 ino_size != 0) 531 btrfs_set_inode_size(dst_eb, dst_item, ino_size); 532 goto no_copy; 533 } 534 535 if (overwrite_root && 536 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 537 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 538 save_old_i_size = 1; 539 saved_i_size = btrfs_inode_size(path->nodes[0], 540 dst_item); 541 } 542 } 543 544 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 545 src_ptr, item_size); 546 547 if (save_old_i_size) { 548 struct btrfs_inode_item *dst_item; 549 dst_item = (struct btrfs_inode_item *)dst_ptr; 550 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 551 } 552 553 /* make sure the generation is filled in */ 554 if (key->type == BTRFS_INODE_ITEM_KEY) { 555 struct btrfs_inode_item *dst_item; 556 dst_item = (struct btrfs_inode_item *)dst_ptr; 557 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 558 btrfs_set_inode_generation(path->nodes[0], dst_item, 559 trans->transid); 560 } 561 } 562 no_copy: 563 btrfs_mark_buffer_dirty(path->nodes[0]); 564 btrfs_release_path(path); 565 return 0; 566 } 567 568 /* 569 * Item overwrite used by replay and tree logging. eb, slot and key all refer 570 * to the src data we are copying out. 571 * 572 * root is the tree we are copying into, and path is a scratch 573 * path for use in this function (it should be released on entry and 574 * will be released on exit). 
575 * 576 * If the key is already in the destination tree the existing item is 577 * overwritten. If the existing item isn't big enough, it is extended. 578 * If it is too large, it is truncated. 579 * 580 * If the key isn't in the destination yet, a new item is inserted. 581 */ 582 static int overwrite_item(struct btrfs_trans_handle *trans, 583 struct btrfs_root *root, 584 struct btrfs_path *path, 585 struct extent_buffer *eb, int slot, 586 struct btrfs_key *key) 587 { 588 int ret; 589 590 /* Look for the key in the destination tree. */ 591 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 592 if (ret < 0) 593 return ret; 594 595 return do_overwrite_item(trans, root, path, eb, slot, key); 596 } 597 598 /* 599 * simple helper to read an inode off the disk from a given root 600 * This can only be called for subvolume roots and not for the log 601 */ 602 static noinline struct inode *read_one_inode(struct btrfs_root *root, 603 u64 objectid) 604 { 605 struct inode *inode; 606 607 inode = btrfs_iget(root->fs_info->sb, objectid, root); 608 if (IS_ERR(inode)) 609 inode = NULL; 610 return inode; 611 } 612 613 /* replays a single extent in 'eb' at 'slot' with 'key' into the 614 * subvolume 'root'. path is released on entry and should be released 615 * on exit. 616 * 617 * extents in the log tree have not been allocated out of the extent 618 * tree yet. So, this completes the allocation, taking a reference 619 * as required if the extent already exists or creating a new extent 620 * if it isn't in the extent allocation tree yet. 621 * 622 * The extent is inserted into the file, dropping any existing extents 623 * from the file that overlap the new one. 624 */ 625 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 626 struct btrfs_root *root, 627 struct btrfs_path *path, 628 struct extent_buffer *eb, int slot, 629 struct btrfs_key *key) 630 { 631 struct btrfs_drop_extents_args drop_args = { 0 }; 632 struct btrfs_fs_info *fs_info = root->fs_info; 633 int found_type; 634 u64 extent_end; 635 u64 start = key->offset; 636 u64 nbytes = 0; 637 struct btrfs_file_extent_item *item; 638 struct inode *inode = NULL; 639 unsigned long size; 640 int ret = 0; 641 642 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 643 found_type = btrfs_file_extent_type(eb, item); 644 645 if (found_type == BTRFS_FILE_EXTENT_REG || 646 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 647 nbytes = btrfs_file_extent_num_bytes(eb, item); 648 extent_end = start + nbytes; 649 650 /* 651 * We don't add to the inodes nbytes if we are prealloc or a 652 * hole. 653 */ 654 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 655 nbytes = 0; 656 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 657 size = btrfs_file_extent_ram_bytes(eb, item); 658 nbytes = btrfs_file_extent_ram_bytes(eb, item); 659 extent_end = ALIGN(start + size, 660 fs_info->sectorsize); 661 } else { 662 ret = 0; 663 goto out; 664 } 665 666 inode = read_one_inode(root, key->objectid); 667 if (!inode) { 668 ret = -EIO; 669 goto out; 670 } 671 672 /* 673 * first check to see if we already have this extent in the 674 * file. This must be done before the btrfs_drop_extents run 675 * so we don't try to drop this extent. 
676 */ 677 ret = btrfs_lookup_file_extent(trans, root, path, 678 btrfs_ino(BTRFS_I(inode)), start, 0); 679 680 if (ret == 0 && 681 (found_type == BTRFS_FILE_EXTENT_REG || 682 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 683 struct btrfs_file_extent_item cmp1; 684 struct btrfs_file_extent_item cmp2; 685 struct btrfs_file_extent_item *existing; 686 struct extent_buffer *leaf; 687 688 leaf = path->nodes[0]; 689 existing = btrfs_item_ptr(leaf, path->slots[0], 690 struct btrfs_file_extent_item); 691 692 read_extent_buffer(eb, &cmp1, (unsigned long)item, 693 sizeof(cmp1)); 694 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 695 sizeof(cmp2)); 696 697 /* 698 * we already have a pointer to this exact extent, 699 * we don't have to do anything 700 */ 701 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 702 btrfs_release_path(path); 703 goto out; 704 } 705 } 706 btrfs_release_path(path); 707 708 /* drop any overlapping extents */ 709 drop_args.start = start; 710 drop_args.end = extent_end; 711 drop_args.drop_cache = true; 712 ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); 713 if (ret) 714 goto out; 715 716 if (found_type == BTRFS_FILE_EXTENT_REG || 717 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 718 u64 offset; 719 unsigned long dest_offset; 720 struct btrfs_key ins; 721 722 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 723 btrfs_fs_incompat(fs_info, NO_HOLES)) 724 goto update_inode; 725 726 ret = btrfs_insert_empty_item(trans, root, path, key, 727 sizeof(*item)); 728 if (ret) 729 goto out; 730 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 731 path->slots[0]); 732 copy_extent_buffer(path->nodes[0], eb, dest_offset, 733 (unsigned long)item, sizeof(*item)); 734 735 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 736 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 737 ins.type = BTRFS_EXTENT_ITEM_KEY; 738 offset = key->offset - btrfs_file_extent_offset(eb, item); 739 740 /* 741 * Manually record dirty extent, as here we did a shallow 742 * file extent item copy and skip normal backref update, 743 * but modifying extent tree all by ourselves. 744 * So need to manually record dirty extent for qgroup, 745 * as the owner of the file extent changed from log tree 746 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 747 */ 748 ret = btrfs_qgroup_trace_extent(trans, 749 btrfs_file_extent_disk_bytenr(eb, item), 750 btrfs_file_extent_disk_num_bytes(eb, item), 751 GFP_NOFS); 752 if (ret < 0) 753 goto out; 754 755 if (ins.objectid > 0) { 756 struct btrfs_ref ref = { 0 }; 757 u64 csum_start; 758 u64 csum_end; 759 LIST_HEAD(ordered_sums); 760 761 /* 762 * is this extent already allocated in the extent 763 * allocation tree? 
If so, just add a reference 764 */ 765 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 766 ins.offset); 767 if (ret < 0) { 768 goto out; 769 } else if (ret == 0) { 770 btrfs_init_generic_ref(&ref, 771 BTRFS_ADD_DELAYED_REF, 772 ins.objectid, ins.offset, 0); 773 btrfs_init_data_ref(&ref, 774 root->root_key.objectid, 775 key->objectid, offset, 0, false); 776 ret = btrfs_inc_extent_ref(trans, &ref); 777 if (ret) 778 goto out; 779 } else { 780 /* 781 * insert the extent pointer in the extent 782 * allocation tree 783 */ 784 ret = btrfs_alloc_logged_file_extent(trans, 785 root->root_key.objectid, 786 key->objectid, offset, &ins); 787 if (ret) 788 goto out; 789 } 790 btrfs_release_path(path); 791 792 if (btrfs_file_extent_compression(eb, item)) { 793 csum_start = ins.objectid; 794 csum_end = csum_start + ins.offset; 795 } else { 796 csum_start = ins.objectid + 797 btrfs_file_extent_offset(eb, item); 798 csum_end = csum_start + 799 btrfs_file_extent_num_bytes(eb, item); 800 } 801 802 ret = btrfs_lookup_csums_range(root->log_root, 803 csum_start, csum_end - 1, 804 &ordered_sums, 0, false); 805 if (ret) 806 goto out; 807 /* 808 * Now delete all existing cums in the csum root that 809 * cover our range. We do this because we can have an 810 * extent that is completely referenced by one file 811 * extent item and partially referenced by another 812 * file extent item (like after using the clone or 813 * extent_same ioctls). In this case if we end up doing 814 * the replay of the one that partially references the 815 * extent first, and we do not do the csum deletion 816 * below, we can get 2 csum items in the csum tree that 817 * overlap each other. For example, imagine our log has 818 * the two following file extent items: 819 * 820 * key (257 EXTENT_DATA 409600) 821 * extent data disk byte 12845056 nr 102400 822 * extent data offset 20480 nr 20480 ram 102400 823 * 824 * key (257 EXTENT_DATA 819200) 825 * extent data disk byte 12845056 nr 102400 826 * extent data offset 0 nr 102400 ram 102400 827 * 828 * Where the second one fully references the 100K extent 829 * that starts at disk byte 12845056, and the log tree 830 * has a single csum item that covers the entire range 831 * of the extent: 832 * 833 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 834 * 835 * After the first file extent item is replayed, the 836 * csum tree gets the following csum item: 837 * 838 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 839 * 840 * Which covers the 20K sub-range starting at offset 20K 841 * of our extent. Now when we replay the second file 842 * extent item, if we do not delete existing csum items 843 * that cover any of its blocks, we end up getting two 844 * csum items in our csum tree that overlap each other: 845 * 846 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 847 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 848 * 849 * Which is a problem, because after this anyone trying 850 * to lookup up for the checksum of any block of our 851 * extent starting at an offset of 40K or higher, will 852 * end up looking at the second csum item only, which 853 * does not contain the checksum for any block starting 854 * at offset 40K or higher of our extent. 
855 */ 856 while (!list_empty(&ordered_sums)) { 857 struct btrfs_ordered_sum *sums; 858 struct btrfs_root *csum_root; 859 860 sums = list_entry(ordered_sums.next, 861 struct btrfs_ordered_sum, 862 list); 863 csum_root = btrfs_csum_root(fs_info, 864 sums->bytenr); 865 if (!ret) 866 ret = btrfs_del_csums(trans, csum_root, 867 sums->bytenr, 868 sums->len); 869 if (!ret) 870 ret = btrfs_csum_file_blocks(trans, 871 csum_root, 872 sums); 873 list_del(&sums->list); 874 kfree(sums); 875 } 876 if (ret) 877 goto out; 878 } else { 879 btrfs_release_path(path); 880 } 881 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 882 /* inline extents are easy, we just overwrite them */ 883 ret = overwrite_item(trans, root, path, eb, slot, key); 884 if (ret) 885 goto out; 886 } 887 888 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, 889 extent_end - start); 890 if (ret) 891 goto out; 892 893 update_inode: 894 btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); 895 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 896 out: 897 iput(inode); 898 return ret; 899 } 900 901 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, 902 struct btrfs_inode *dir, 903 struct btrfs_inode *inode, 904 const char *name, 905 int name_len) 906 { 907 int ret; 908 909 ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); 910 if (ret) 911 return ret; 912 /* 913 * Whenever we need to check if a name exists or not, we check the 914 * fs/subvolume tree. So after an unlink we must run delayed items, so 915 * that future checks for a name during log replay see that the name 916 * does not exists anymore. 917 */ 918 return btrfs_run_delayed_items(trans); 919 } 920 921 /* 922 * when cleaning up conflicts between the directory names in the 923 * subvolume, directory names in the log and directory names in the 924 * inode back references, we may have to unlink inodes from directories. 925 * 926 * This is a helper function to do the unlink of a specific directory 927 * item 928 */ 929 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 930 struct btrfs_path *path, 931 struct btrfs_inode *dir, 932 struct btrfs_dir_item *di) 933 { 934 struct btrfs_root *root = dir->root; 935 struct inode *inode; 936 char *name; 937 int name_len; 938 struct extent_buffer *leaf; 939 struct btrfs_key location; 940 int ret; 941 942 leaf = path->nodes[0]; 943 944 btrfs_dir_item_key_to_cpu(leaf, di, &location); 945 name_len = btrfs_dir_name_len(leaf, di); 946 name = kmalloc(name_len, GFP_NOFS); 947 if (!name) 948 return -ENOMEM; 949 950 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 951 btrfs_release_path(path); 952 953 inode = read_one_inode(root, location.objectid); 954 if (!inode) { 955 ret = -EIO; 956 goto out; 957 } 958 959 ret = link_to_fixup_dir(trans, root, path, location.objectid); 960 if (ret) 961 goto out; 962 963 ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, 964 name_len); 965 out: 966 kfree(name); 967 iput(inode); 968 return ret; 969 } 970 971 /* 972 * See if a given name and sequence number found in an inode back reference are 973 * already in a directory and correctly point to this inode. 974 * 975 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it 976 * exists. 
977 */ 978 static noinline int inode_in_dir(struct btrfs_root *root, 979 struct btrfs_path *path, 980 u64 dirid, u64 objectid, u64 index, 981 const char *name, int name_len) 982 { 983 struct btrfs_dir_item *di; 984 struct btrfs_key location; 985 int ret = 0; 986 987 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 988 index, name, name_len, 0); 989 if (IS_ERR(di)) { 990 ret = PTR_ERR(di); 991 goto out; 992 } else if (di) { 993 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 994 if (location.objectid != objectid) 995 goto out; 996 } else { 997 goto out; 998 } 999 1000 btrfs_release_path(path); 1001 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 1002 if (IS_ERR(di)) { 1003 ret = PTR_ERR(di); 1004 goto out; 1005 } else if (di) { 1006 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 1007 if (location.objectid == objectid) 1008 ret = 1; 1009 } 1010 out: 1011 btrfs_release_path(path); 1012 return ret; 1013 } 1014 1015 /* 1016 * helper function to check a log tree for a named back reference in 1017 * an inode. This is used to decide if a back reference that is 1018 * found in the subvolume conflicts with what we find in the log. 1019 * 1020 * inode backreferences may have multiple refs in a single item, 1021 * during replay we process one reference at a time, and we don't 1022 * want to delete valid links to a file from the subvolume if that 1023 * link is also in the log. 1024 */ 1025 static noinline int backref_in_log(struct btrfs_root *log, 1026 struct btrfs_key *key, 1027 u64 ref_objectid, 1028 const char *name, int namelen) 1029 { 1030 struct btrfs_path *path; 1031 int ret; 1032 1033 path = btrfs_alloc_path(); 1034 if (!path) 1035 return -ENOMEM; 1036 1037 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 1038 if (ret < 0) { 1039 goto out; 1040 } else if (ret == 1) { 1041 ret = 0; 1042 goto out; 1043 } 1044 1045 if (key->type == BTRFS_INODE_EXTREF_KEY) 1046 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1047 path->slots[0], 1048 ref_objectid, 1049 name, namelen); 1050 else 1051 ret = !!btrfs_find_name_in_backref(path->nodes[0], 1052 path->slots[0], 1053 name, namelen); 1054 out: 1055 btrfs_free_path(path); 1056 return ret; 1057 } 1058 1059 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 1060 struct btrfs_root *root, 1061 struct btrfs_path *path, 1062 struct btrfs_root *log_root, 1063 struct btrfs_inode *dir, 1064 struct btrfs_inode *inode, 1065 u64 inode_objectid, u64 parent_objectid, 1066 u64 ref_index, char *name, int namelen) 1067 { 1068 int ret; 1069 char *victim_name; 1070 int victim_name_len; 1071 struct extent_buffer *leaf; 1072 struct btrfs_dir_item *di; 1073 struct btrfs_key search_key; 1074 struct btrfs_inode_extref *extref; 1075 1076 again: 1077 /* Search old style refs */ 1078 search_key.objectid = inode_objectid; 1079 search_key.type = BTRFS_INODE_REF_KEY; 1080 search_key.offset = parent_objectid; 1081 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1082 if (ret == 0) { 1083 struct btrfs_inode_ref *victim_ref; 1084 unsigned long ptr; 1085 unsigned long ptr_end; 1086 1087 leaf = path->nodes[0]; 1088 1089 /* are we trying to overwrite a back ref for the root directory 1090 * if so, just jump out, we're done 1091 */ 1092 if (search_key.objectid == search_key.offset) 1093 return 1; 1094 1095 /* check all the names in this back reference to see 1096 * if they are in the log. 
if so, we allow them to stay 1097 * otherwise they must be unlinked as a conflict 1098 */ 1099 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1100 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); 1101 while (ptr < ptr_end) { 1102 victim_ref = (struct btrfs_inode_ref *)ptr; 1103 victim_name_len = btrfs_inode_ref_name_len(leaf, 1104 victim_ref); 1105 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1106 if (!victim_name) 1107 return -ENOMEM; 1108 1109 read_extent_buffer(leaf, victim_name, 1110 (unsigned long)(victim_ref + 1), 1111 victim_name_len); 1112 1113 ret = backref_in_log(log_root, &search_key, 1114 parent_objectid, victim_name, 1115 victim_name_len); 1116 if (ret < 0) { 1117 kfree(victim_name); 1118 return ret; 1119 } else if (!ret) { 1120 inc_nlink(&inode->vfs_inode); 1121 btrfs_release_path(path); 1122 1123 ret = unlink_inode_for_log_replay(trans, dir, inode, 1124 victim_name, victim_name_len); 1125 kfree(victim_name); 1126 if (ret) 1127 return ret; 1128 goto again; 1129 } 1130 kfree(victim_name); 1131 1132 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 1133 } 1134 } 1135 btrfs_release_path(path); 1136 1137 /* Same search but for extended refs */ 1138 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1139 inode_objectid, parent_objectid, 0, 1140 0); 1141 if (IS_ERR(extref)) { 1142 return PTR_ERR(extref); 1143 } else if (extref) { 1144 u32 item_size; 1145 u32 cur_offset = 0; 1146 unsigned long base; 1147 struct inode *victim_parent; 1148 1149 leaf = path->nodes[0]; 1150 1151 item_size = btrfs_item_size(leaf, path->slots[0]); 1152 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1153 1154 while (cur_offset < item_size) { 1155 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1156 1157 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1158 1159 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1160 goto next; 1161 1162 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1163 if (!victim_name) 1164 return -ENOMEM; 1165 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1166 victim_name_len); 1167 1168 search_key.objectid = inode_objectid; 1169 search_key.type = BTRFS_INODE_EXTREF_KEY; 1170 search_key.offset = btrfs_extref_hash(parent_objectid, 1171 victim_name, 1172 victim_name_len); 1173 ret = backref_in_log(log_root, &search_key, 1174 parent_objectid, victim_name, 1175 victim_name_len); 1176 if (ret < 0) { 1177 kfree(victim_name); 1178 return ret; 1179 } else if (!ret) { 1180 ret = -ENOENT; 1181 victim_parent = read_one_inode(root, 1182 parent_objectid); 1183 if (victim_parent) { 1184 inc_nlink(&inode->vfs_inode); 1185 btrfs_release_path(path); 1186 1187 ret = unlink_inode_for_log_replay(trans, 1188 BTRFS_I(victim_parent), 1189 inode, 1190 victim_name, 1191 victim_name_len); 1192 } 1193 iput(victim_parent); 1194 kfree(victim_name); 1195 if (ret) 1196 return ret; 1197 goto again; 1198 } 1199 kfree(victim_name); 1200 next: 1201 cur_offset += victim_name_len + sizeof(*extref); 1202 } 1203 } 1204 btrfs_release_path(path); 1205 1206 /* look for a conflicting sequence number */ 1207 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1208 ref_index, name, namelen, 0); 1209 if (IS_ERR(di)) { 1210 return PTR_ERR(di); 1211 } else if (di) { 1212 ret = drop_one_dir_item(trans, path, dir, di); 1213 if (ret) 1214 return ret; 1215 } 1216 btrfs_release_path(path); 1217 1218 /* look for a conflicting name */ 1219 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1220 name, namelen, 
0); 1221 if (IS_ERR(di)) { 1222 return PTR_ERR(di); 1223 } else if (di) { 1224 ret = drop_one_dir_item(trans, path, dir, di); 1225 if (ret) 1226 return ret; 1227 } 1228 btrfs_release_path(path); 1229 1230 return 0; 1231 } 1232 1233 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1234 u32 *namelen, char **name, u64 *index, 1235 u64 *parent_objectid) 1236 { 1237 struct btrfs_inode_extref *extref; 1238 1239 extref = (struct btrfs_inode_extref *)ref_ptr; 1240 1241 *namelen = btrfs_inode_extref_name_len(eb, extref); 1242 *name = kmalloc(*namelen, GFP_NOFS); 1243 if (*name == NULL) 1244 return -ENOMEM; 1245 1246 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1247 *namelen); 1248 1249 if (index) 1250 *index = btrfs_inode_extref_index(eb, extref); 1251 if (parent_objectid) 1252 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1253 1254 return 0; 1255 } 1256 1257 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1258 u32 *namelen, char **name, u64 *index) 1259 { 1260 struct btrfs_inode_ref *ref; 1261 1262 ref = (struct btrfs_inode_ref *)ref_ptr; 1263 1264 *namelen = btrfs_inode_ref_name_len(eb, ref); 1265 *name = kmalloc(*namelen, GFP_NOFS); 1266 if (*name == NULL) 1267 return -ENOMEM; 1268 1269 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1270 1271 if (index) 1272 *index = btrfs_inode_ref_index(eb, ref); 1273 1274 return 0; 1275 } 1276 1277 /* 1278 * Take an inode reference item from the log tree and iterate all names from the 1279 * inode reference item in the subvolume tree with the same key (if it exists). 1280 * For any name that is not in the inode reference item from the log tree, do a 1281 * proper unlink of that name (that is, remove its entry from the inode 1282 * reference item and both dir index keys). 
1283 */ 1284 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1285 struct btrfs_root *root, 1286 struct btrfs_path *path, 1287 struct btrfs_inode *inode, 1288 struct extent_buffer *log_eb, 1289 int log_slot, 1290 struct btrfs_key *key) 1291 { 1292 int ret; 1293 unsigned long ref_ptr; 1294 unsigned long ref_end; 1295 struct extent_buffer *eb; 1296 1297 again: 1298 btrfs_release_path(path); 1299 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1300 if (ret > 0) { 1301 ret = 0; 1302 goto out; 1303 } 1304 if (ret < 0) 1305 goto out; 1306 1307 eb = path->nodes[0]; 1308 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1309 ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); 1310 while (ref_ptr < ref_end) { 1311 char *name = NULL; 1312 int namelen; 1313 u64 parent_id; 1314 1315 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1316 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1317 NULL, &parent_id); 1318 } else { 1319 parent_id = key->offset; 1320 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1321 NULL); 1322 } 1323 if (ret) 1324 goto out; 1325 1326 if (key->type == BTRFS_INODE_EXTREF_KEY) 1327 ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, 1328 parent_id, name, 1329 namelen); 1330 else 1331 ret = !!btrfs_find_name_in_backref(log_eb, log_slot, 1332 name, namelen); 1333 1334 if (!ret) { 1335 struct inode *dir; 1336 1337 btrfs_release_path(path); 1338 dir = read_one_inode(root, parent_id); 1339 if (!dir) { 1340 ret = -ENOENT; 1341 kfree(name); 1342 goto out; 1343 } 1344 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), 1345 inode, name, namelen); 1346 kfree(name); 1347 iput(dir); 1348 if (ret) 1349 goto out; 1350 goto again; 1351 } 1352 1353 kfree(name); 1354 ref_ptr += namelen; 1355 if (key->type == BTRFS_INODE_EXTREF_KEY) 1356 ref_ptr += sizeof(struct btrfs_inode_extref); 1357 else 1358 ref_ptr += sizeof(struct btrfs_inode_ref); 1359 } 1360 ret = 0; 1361 out: 1362 btrfs_release_path(path); 1363 return ret; 1364 } 1365 1366 /* 1367 * replay one inode back reference item found in the log tree. 1368 * eb, slot and key refer to the buffer and key found in the log tree. 1369 * root is the destination we are replaying into, and path is for temp 1370 * use by this function. (it should be released on return). 1371 */ 1372 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1373 struct btrfs_root *root, 1374 struct btrfs_root *log, 1375 struct btrfs_path *path, 1376 struct extent_buffer *eb, int slot, 1377 struct btrfs_key *key) 1378 { 1379 struct inode *dir = NULL; 1380 struct inode *inode = NULL; 1381 unsigned long ref_ptr; 1382 unsigned long ref_end; 1383 char *name = NULL; 1384 int namelen; 1385 int ret; 1386 int log_ref_ver = 0; 1387 u64 parent_objectid; 1388 u64 inode_objectid; 1389 u64 ref_index = 0; 1390 int ref_struct_size; 1391 1392 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1393 ref_end = ref_ptr + btrfs_item_size(eb, slot); 1394 1395 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1396 struct btrfs_inode_extref *r; 1397 1398 ref_struct_size = sizeof(struct btrfs_inode_extref); 1399 log_ref_ver = 1; 1400 r = (struct btrfs_inode_extref *)ref_ptr; 1401 parent_objectid = btrfs_inode_extref_parent(eb, r); 1402 } else { 1403 ref_struct_size = sizeof(struct btrfs_inode_ref); 1404 parent_objectid = key->offset; 1405 } 1406 inode_objectid = key->objectid; 1407 1408 /* 1409 * it is possible that we didn't log all the parent directories 1410 * for a given inode. If we don't find the dir, just don't 1411 * copy the back ref in. 
The link count fixup code will take 1412 * care of the rest 1413 */ 1414 dir = read_one_inode(root, parent_objectid); 1415 if (!dir) { 1416 ret = -ENOENT; 1417 goto out; 1418 } 1419 1420 inode = read_one_inode(root, inode_objectid); 1421 if (!inode) { 1422 ret = -EIO; 1423 goto out; 1424 } 1425 1426 while (ref_ptr < ref_end) { 1427 if (log_ref_ver) { 1428 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1429 &ref_index, &parent_objectid); 1430 /* 1431 * parent object can change from one array 1432 * item to another. 1433 */ 1434 if (!dir) 1435 dir = read_one_inode(root, parent_objectid); 1436 if (!dir) { 1437 ret = -ENOENT; 1438 goto out; 1439 } 1440 } else { 1441 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1442 &ref_index); 1443 } 1444 if (ret) 1445 goto out; 1446 1447 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1448 btrfs_ino(BTRFS_I(inode)), ref_index, 1449 name, namelen); 1450 if (ret < 0) { 1451 goto out; 1452 } else if (ret == 0) { 1453 /* 1454 * look for a conflicting back reference in the 1455 * metadata. if we find one we have to unlink that name 1456 * of the file before we add our new link. Later on, we 1457 * overwrite any existing back reference, and we don't 1458 * want to create dangling pointers in the directory. 1459 */ 1460 ret = __add_inode_ref(trans, root, path, log, 1461 BTRFS_I(dir), BTRFS_I(inode), 1462 inode_objectid, parent_objectid, 1463 ref_index, name, namelen); 1464 if (ret) { 1465 if (ret == 1) 1466 ret = 0; 1467 goto out; 1468 } 1469 1470 /* insert our name */ 1471 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 1472 name, namelen, 0, ref_index); 1473 if (ret) 1474 goto out; 1475 1476 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1477 if (ret) 1478 goto out; 1479 } 1480 /* Else, ret == 1, we already have a perfect match, we're done. */ 1481 1482 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1483 kfree(name); 1484 name = NULL; 1485 if (log_ref_ver) { 1486 iput(dir); 1487 dir = NULL; 1488 } 1489 } 1490 1491 /* 1492 * Before we overwrite the inode reference item in the subvolume tree 1493 * with the item from the log tree, we must unlink all names from the 1494 * parent directory that are in the subvolume's tree inode reference 1495 * item, otherwise we end up with an inconsistent subvolume tree where 1496 * dir index entries exist for a name but there is no inode reference 1497 * item with the same name. 
1498 */ 1499 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1500 key); 1501 if (ret) 1502 goto out; 1503 1504 /* finally write the back reference in the inode */ 1505 ret = overwrite_item(trans, root, path, eb, slot, key); 1506 out: 1507 btrfs_release_path(path); 1508 kfree(name); 1509 iput(dir); 1510 iput(inode); 1511 return ret; 1512 } 1513 1514 static int count_inode_extrefs(struct btrfs_root *root, 1515 struct btrfs_inode *inode, struct btrfs_path *path) 1516 { 1517 int ret = 0; 1518 int name_len; 1519 unsigned int nlink = 0; 1520 u32 item_size; 1521 u32 cur_offset = 0; 1522 u64 inode_objectid = btrfs_ino(inode); 1523 u64 offset = 0; 1524 unsigned long ptr; 1525 struct btrfs_inode_extref *extref; 1526 struct extent_buffer *leaf; 1527 1528 while (1) { 1529 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1530 &extref, &offset); 1531 if (ret) 1532 break; 1533 1534 leaf = path->nodes[0]; 1535 item_size = btrfs_item_size(leaf, path->slots[0]); 1536 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1537 cur_offset = 0; 1538 1539 while (cur_offset < item_size) { 1540 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1541 name_len = btrfs_inode_extref_name_len(leaf, extref); 1542 1543 nlink++; 1544 1545 cur_offset += name_len + sizeof(*extref); 1546 } 1547 1548 offset++; 1549 btrfs_release_path(path); 1550 } 1551 btrfs_release_path(path); 1552 1553 if (ret < 0 && ret != -ENOENT) 1554 return ret; 1555 return nlink; 1556 } 1557 1558 static int count_inode_refs(struct btrfs_root *root, 1559 struct btrfs_inode *inode, struct btrfs_path *path) 1560 { 1561 int ret; 1562 struct btrfs_key key; 1563 unsigned int nlink = 0; 1564 unsigned long ptr; 1565 unsigned long ptr_end; 1566 int name_len; 1567 u64 ino = btrfs_ino(inode); 1568 1569 key.objectid = ino; 1570 key.type = BTRFS_INODE_REF_KEY; 1571 key.offset = (u64)-1; 1572 1573 while (1) { 1574 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1575 if (ret < 0) 1576 break; 1577 if (ret > 0) { 1578 if (path->slots[0] == 0) 1579 break; 1580 path->slots[0]--; 1581 } 1582 process_slot: 1583 btrfs_item_key_to_cpu(path->nodes[0], &key, 1584 path->slots[0]); 1585 if (key.objectid != ino || 1586 key.type != BTRFS_INODE_REF_KEY) 1587 break; 1588 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1589 ptr_end = ptr + btrfs_item_size(path->nodes[0], 1590 path->slots[0]); 1591 while (ptr < ptr_end) { 1592 struct btrfs_inode_ref *ref; 1593 1594 ref = (struct btrfs_inode_ref *)ptr; 1595 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1596 ref); 1597 ptr = (unsigned long)(ref + 1) + name_len; 1598 nlink++; 1599 } 1600 1601 if (key.offset == 0) 1602 break; 1603 if (path->slots[0] > 0) { 1604 path->slots[0]--; 1605 goto process_slot; 1606 } 1607 key.offset--; 1608 btrfs_release_path(path); 1609 } 1610 btrfs_release_path(path); 1611 1612 return nlink; 1613 } 1614 1615 /* 1616 * There are a few corners where the link count of the file can't 1617 * be properly maintained during replay. So, instead of adding 1618 * lots of complexity to the log code, we just scan the backrefs 1619 * for any file that has been through replay. 1620 * 1621 * The scan will update the link count on the inode to reflect the 1622 * number of back refs found. If it goes down to zero, the iput 1623 * will free the inode. 
1624 */ 1625 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1626 struct btrfs_root *root, 1627 struct inode *inode) 1628 { 1629 struct btrfs_path *path; 1630 int ret; 1631 u64 nlink = 0; 1632 u64 ino = btrfs_ino(BTRFS_I(inode)); 1633 1634 path = btrfs_alloc_path(); 1635 if (!path) 1636 return -ENOMEM; 1637 1638 ret = count_inode_refs(root, BTRFS_I(inode), path); 1639 if (ret < 0) 1640 goto out; 1641 1642 nlink = ret; 1643 1644 ret = count_inode_extrefs(root, BTRFS_I(inode), path); 1645 if (ret < 0) 1646 goto out; 1647 1648 nlink += ret; 1649 1650 ret = 0; 1651 1652 if (nlink != inode->i_nlink) { 1653 set_nlink(inode, nlink); 1654 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1655 if (ret) 1656 goto out; 1657 } 1658 BTRFS_I(inode)->index_cnt = (u64)-1; 1659 1660 if (inode->i_nlink == 0) { 1661 if (S_ISDIR(inode->i_mode)) { 1662 ret = replay_dir_deletes(trans, root, NULL, path, 1663 ino, 1); 1664 if (ret) 1665 goto out; 1666 } 1667 ret = btrfs_insert_orphan_item(trans, root, ino); 1668 if (ret == -EEXIST) 1669 ret = 0; 1670 } 1671 1672 out: 1673 btrfs_free_path(path); 1674 return ret; 1675 } 1676 1677 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1678 struct btrfs_root *root, 1679 struct btrfs_path *path) 1680 { 1681 int ret; 1682 struct btrfs_key key; 1683 struct inode *inode; 1684 1685 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1686 key.type = BTRFS_ORPHAN_ITEM_KEY; 1687 key.offset = (u64)-1; 1688 while (1) { 1689 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1690 if (ret < 0) 1691 break; 1692 1693 if (ret == 1) { 1694 ret = 0; 1695 if (path->slots[0] == 0) 1696 break; 1697 path->slots[0]--; 1698 } 1699 1700 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1701 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1702 key.type != BTRFS_ORPHAN_ITEM_KEY) 1703 break; 1704 1705 ret = btrfs_del_item(trans, root, path); 1706 if (ret) 1707 break; 1708 1709 btrfs_release_path(path); 1710 inode = read_one_inode(root, key.offset); 1711 if (!inode) { 1712 ret = -EIO; 1713 break; 1714 } 1715 1716 ret = fixup_inode_link_count(trans, root, inode); 1717 iput(inode); 1718 if (ret) 1719 break; 1720 1721 /* 1722 * fixup on a directory may create new entries, 1723 * make sure we always look for the highset possible 1724 * offset 1725 */ 1726 key.offset = (u64)-1; 1727 } 1728 btrfs_release_path(path); 1729 return ret; 1730 } 1731 1732 1733 /* 1734 * record a given inode in the fixup dir so we can check its link 1735 * count when replay is done. 
The link count is incremented here 1736 * so the inode won't go away until we check it 1737 */ 1738 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1739 struct btrfs_root *root, 1740 struct btrfs_path *path, 1741 u64 objectid) 1742 { 1743 struct btrfs_key key; 1744 int ret = 0; 1745 struct inode *inode; 1746 1747 inode = read_one_inode(root, objectid); 1748 if (!inode) 1749 return -EIO; 1750 1751 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1752 key.type = BTRFS_ORPHAN_ITEM_KEY; 1753 key.offset = objectid; 1754 1755 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1756 1757 btrfs_release_path(path); 1758 if (ret == 0) { 1759 if (!inode->i_nlink) 1760 set_nlink(inode, 1); 1761 else 1762 inc_nlink(inode); 1763 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1764 } else if (ret == -EEXIST) { 1765 ret = 0; 1766 } 1767 iput(inode); 1768 1769 return ret; 1770 } 1771 1772 /* 1773 * when replaying the log for a directory, we only insert names 1774 * for inodes that actually exist. This means an fsync on a directory 1775 * does not implicitly fsync all the new files in it 1776 */ 1777 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1778 struct btrfs_root *root, 1779 u64 dirid, u64 index, 1780 char *name, int name_len, 1781 struct btrfs_key *location) 1782 { 1783 struct inode *inode; 1784 struct inode *dir; 1785 int ret; 1786 1787 inode = read_one_inode(root, location->objectid); 1788 if (!inode) 1789 return -ENOENT; 1790 1791 dir = read_one_inode(root, dirid); 1792 if (!dir) { 1793 iput(inode); 1794 return -EIO; 1795 } 1796 1797 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1798 name_len, 1, index); 1799 1800 /* FIXME, put inode into FIXUP list */ 1801 1802 iput(inode); 1803 iput(dir); 1804 return ret; 1805 } 1806 1807 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, 1808 struct btrfs_inode *dir, 1809 struct btrfs_path *path, 1810 struct btrfs_dir_item *dst_di, 1811 const struct btrfs_key *log_key, 1812 u8 log_type, 1813 bool exists) 1814 { 1815 struct btrfs_key found_key; 1816 1817 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1818 /* The existing dentry points to the same inode, don't delete it. */ 1819 if (found_key.objectid == log_key->objectid && 1820 found_key.type == log_key->type && 1821 found_key.offset == log_key->offset && 1822 btrfs_dir_type(path->nodes[0], dst_di) == log_type) 1823 return 1; 1824 1825 /* 1826 * Don't drop the conflicting directory entry if the inode for the new 1827 * entry doesn't exist. 1828 */ 1829 if (!exists) 1830 return 0; 1831 1832 return drop_one_dir_item(trans, path, dir, dst_di); 1833 } 1834 1835 /* 1836 * take a single entry in a log directory item and replay it into 1837 * the subvolume. 1838 * 1839 * if a conflicting item exists in the subdirectory already, 1840 * the inode it points to is unlinked and put into the link count 1841 * fix up tree. 1842 * 1843 * If a name from the log points to a file or directory that does 1844 * not exist in the FS, it is skipped. fsyncs on directories 1845 * do not force down inodes inside that directory, just changes to the 1846 * names or unlinks in a directory. 1847 * 1848 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1849 * non-existing inode) and 1 if the name was replayed. 
1850 */ 1851 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1852 struct btrfs_root *root, 1853 struct btrfs_path *path, 1854 struct extent_buffer *eb, 1855 struct btrfs_dir_item *di, 1856 struct btrfs_key *key) 1857 { 1858 char *name; 1859 int name_len; 1860 struct btrfs_dir_item *dir_dst_di; 1861 struct btrfs_dir_item *index_dst_di; 1862 bool dir_dst_matches = false; 1863 bool index_dst_matches = false; 1864 struct btrfs_key log_key; 1865 struct btrfs_key search_key; 1866 struct inode *dir; 1867 u8 log_type; 1868 bool exists; 1869 int ret; 1870 bool update_size = true; 1871 bool name_added = false; 1872 1873 dir = read_one_inode(root, key->objectid); 1874 if (!dir) 1875 return -EIO; 1876 1877 name_len = btrfs_dir_name_len(eb, di); 1878 name = kmalloc(name_len, GFP_NOFS); 1879 if (!name) { 1880 ret = -ENOMEM; 1881 goto out; 1882 } 1883 1884 log_type = btrfs_dir_type(eb, di); 1885 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1886 name_len); 1887 1888 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 1889 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 1890 btrfs_release_path(path); 1891 if (ret < 0) 1892 goto out; 1893 exists = (ret == 0); 1894 ret = 0; 1895 1896 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1897 name, name_len, 1); 1898 if (IS_ERR(dir_dst_di)) { 1899 ret = PTR_ERR(dir_dst_di); 1900 goto out; 1901 } else if (dir_dst_di) { 1902 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 1903 dir_dst_di, &log_key, log_type, 1904 exists); 1905 if (ret < 0) 1906 goto out; 1907 dir_dst_matches = (ret == 1); 1908 } 1909 1910 btrfs_release_path(path); 1911 1912 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 1913 key->objectid, key->offset, 1914 name, name_len, 1); 1915 if (IS_ERR(index_dst_di)) { 1916 ret = PTR_ERR(index_dst_di); 1917 goto out; 1918 } else if (index_dst_di) { 1919 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 1920 index_dst_di, &log_key, 1921 log_type, exists); 1922 if (ret < 0) 1923 goto out; 1924 index_dst_matches = (ret == 1); 1925 } 1926 1927 btrfs_release_path(path); 1928 1929 if (dir_dst_matches && index_dst_matches) { 1930 ret = 0; 1931 update_size = false; 1932 goto out; 1933 } 1934 1935 /* 1936 * Check if the inode reference exists in the log for the given name, 1937 * inode and parent inode 1938 */ 1939 search_key.objectid = log_key.objectid; 1940 search_key.type = BTRFS_INODE_REF_KEY; 1941 search_key.offset = key->objectid; 1942 ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); 1943 if (ret < 0) { 1944 goto out; 1945 } else if (ret) { 1946 /* The dentry will be added later. */ 1947 ret = 0; 1948 update_size = false; 1949 goto out; 1950 } 1951 1952 search_key.objectid = log_key.objectid; 1953 search_key.type = BTRFS_INODE_EXTREF_KEY; 1954 search_key.offset = key->objectid; 1955 ret = backref_in_log(root->log_root, &search_key, key->objectid, name, 1956 name_len); 1957 if (ret < 0) { 1958 goto out; 1959 } else if (ret) { 1960 /* The dentry will be added later. 
*/ 1961 ret = 0; 1962 update_size = false; 1963 goto out; 1964 } 1965 btrfs_release_path(path); 1966 ret = insert_one_name(trans, root, key->objectid, key->offset, 1967 name, name_len, &log_key); 1968 if (ret && ret != -ENOENT && ret != -EEXIST) 1969 goto out; 1970 if (!ret) 1971 name_added = true; 1972 update_size = false; 1973 ret = 0; 1974 1975 out: 1976 if (!ret && update_size) { 1977 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 1978 ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); 1979 } 1980 kfree(name); 1981 iput(dir); 1982 if (!ret && name_added) 1983 ret = 1; 1984 return ret; 1985 } 1986 1987 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ 1988 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 1989 struct btrfs_root *root, 1990 struct btrfs_path *path, 1991 struct extent_buffer *eb, int slot, 1992 struct btrfs_key *key) 1993 { 1994 int ret; 1995 struct btrfs_dir_item *di; 1996 1997 /* We only log dir index keys, which only contain a single dir item. */ 1998 ASSERT(key->type == BTRFS_DIR_INDEX_KEY); 1999 2000 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2001 ret = replay_one_name(trans, root, path, eb, di, key); 2002 if (ret < 0) 2003 return ret; 2004 2005 /* 2006 * If this entry refers to a non-directory (directories cannot have a 2007 * link count > 1) and it was added in the transaction that was not 2008 * committed, make sure we fixup the link count of the inode the entry 2009 * points to. Otherwise something like the following would result in a 2010 * directory pointing to an inode with a wrong link count that does not account 2011 * for this dir entry: 2012 * 2013 * mkdir testdir 2014 * touch testdir/foo 2015 * touch testdir/bar 2016 * sync 2017 * 2018 * ln testdir/bar testdir/bar_link 2019 * ln testdir/foo testdir/foo_link 2020 * xfs_io -c "fsync" testdir/bar 2021 * 2022 * <power failure> 2023 * 2024 * mount fs, log replay happens 2025 * 2026 * File foo would remain with a link count of 1 when it has two entries 2027 * pointing to it in the directory testdir. This would make it impossible 2028 * to ever delete the parent directory as it would result in stale 2029 * dentries that can never be deleted. 2030 */ 2031 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2032 struct btrfs_path *fixup_path; 2033 struct btrfs_key di_key; 2034 2035 fixup_path = btrfs_alloc_path(); 2036 if (!fixup_path) 2037 return -ENOMEM; 2038 2039 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2040 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); 2041 btrfs_free_path(fixup_path); 2042 } 2043 2044 return ret; 2045 } 2046 2047 /* 2048 * directory replay has two parts. There are the standard directory 2049 * items in the log copied from the subvolume, and range items 2050 * created in the log while the subvolume was logged. 2051 * 2052 * The range items tell us which parts of the key space the log 2053 * is authoritative for. During replay, if a key in the subvolume 2054 * directory is in a logged range item, but not actually in the log, 2055 * that means it was deleted from the directory before the fsync 2056 * and should be removed.
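The helper below, find_dir_range(), walks the BTRFS_DIR_LOG_INDEX_KEY items to return the next such authoritative range of dir index offsets for a given directory.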
2057 */ 2058 static noinline int find_dir_range(struct btrfs_root *root, 2059 struct btrfs_path *path, 2060 u64 dirid, 2061 u64 *start_ret, u64 *end_ret) 2062 { 2063 struct btrfs_key key; 2064 u64 found_end; 2065 struct btrfs_dir_log_item *item; 2066 int ret; 2067 int nritems; 2068 2069 if (*start_ret == (u64)-1) 2070 return 1; 2071 2072 key.objectid = dirid; 2073 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2074 key.offset = *start_ret; 2075 2076 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2077 if (ret < 0) 2078 goto out; 2079 if (ret > 0) { 2080 if (path->slots[0] == 0) 2081 goto out; 2082 path->slots[0]--; 2083 } 2084 if (ret != 0) 2085 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2086 2087 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2088 ret = 1; 2089 goto next; 2090 } 2091 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2092 struct btrfs_dir_log_item); 2093 found_end = btrfs_dir_log_end(path->nodes[0], item); 2094 2095 if (*start_ret >= key.offset && *start_ret <= found_end) { 2096 ret = 0; 2097 *start_ret = key.offset; 2098 *end_ret = found_end; 2099 goto out; 2100 } 2101 ret = 1; 2102 next: 2103 /* check the next slot in the tree to see if it is a valid item */ 2104 nritems = btrfs_header_nritems(path->nodes[0]); 2105 path->slots[0]++; 2106 if (path->slots[0] >= nritems) { 2107 ret = btrfs_next_leaf(root, path); 2108 if (ret) 2109 goto out; 2110 } 2111 2112 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2113 2114 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2115 ret = 1; 2116 goto out; 2117 } 2118 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2119 struct btrfs_dir_log_item); 2120 found_end = btrfs_dir_log_end(path->nodes[0], item); 2121 *start_ret = key.offset; 2122 *end_ret = found_end; 2123 ret = 0; 2124 out: 2125 btrfs_release_path(path); 2126 return ret; 2127 } 2128 2129 /* 2130 * this looks for a given directory item in the log. If the directory 2131 * item is not in the log, the item is removed and the inode it points 2132 * to is unlinked 2133 */ 2134 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2135 struct btrfs_root *log, 2136 struct btrfs_path *path, 2137 struct btrfs_path *log_path, 2138 struct inode *dir, 2139 struct btrfs_key *dir_key) 2140 { 2141 struct btrfs_root *root = BTRFS_I(dir)->root; 2142 int ret; 2143 struct extent_buffer *eb; 2144 int slot; 2145 struct btrfs_dir_item *di; 2146 int name_len; 2147 char *name; 2148 struct inode *inode = NULL; 2149 struct btrfs_key location; 2150 2151 /* 2152 * Currently we only log dir index keys. Even if we replay a log created 2153 * by an older kernel that logged both dir index and dir item keys, all 2154 * we need to do is process the dir index keys, we (and our caller) can 2155 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
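Our caller, replay_dir_deletes(), iterates the subvolume directory using dir index keys only.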
2156 */ 2157 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2158 2159 eb = path->nodes[0]; 2160 slot = path->slots[0]; 2161 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2162 name_len = btrfs_dir_name_len(eb, di); 2163 name = kmalloc(name_len, GFP_NOFS); 2164 if (!name) { 2165 ret = -ENOMEM; 2166 goto out; 2167 } 2168 2169 read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); 2170 2171 if (log) { 2172 struct btrfs_dir_item *log_di; 2173 2174 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2175 dir_key->objectid, 2176 dir_key->offset, 2177 name, name_len, 0); 2178 if (IS_ERR(log_di)) { 2179 ret = PTR_ERR(log_di); 2180 goto out; 2181 } else if (log_di) { 2182 /* The dentry exists in the log, we have nothing to do. */ 2183 ret = 0; 2184 goto out; 2185 } 2186 } 2187 2188 btrfs_dir_item_key_to_cpu(eb, di, &location); 2189 btrfs_release_path(path); 2190 btrfs_release_path(log_path); 2191 inode = read_one_inode(root, location.objectid); 2192 if (!inode) { 2193 ret = -EIO; 2194 goto out; 2195 } 2196 2197 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2198 if (ret) 2199 goto out; 2200 2201 inc_nlink(inode); 2202 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2203 name, name_len); 2204 /* 2205 * Unlike dir item keys, dir index keys can only have one name (entry) in 2206 * them, as there are no key collisions since each key has a unique offset 2207 * (an index number), so we're done. 2208 */ 2209 out: 2210 btrfs_release_path(path); 2211 btrfs_release_path(log_path); 2212 kfree(name); 2213 iput(inode); 2214 return ret; 2215 } 2216 2217 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2218 struct btrfs_root *root, 2219 struct btrfs_root *log, 2220 struct btrfs_path *path, 2221 const u64 ino) 2222 { 2223 struct btrfs_key search_key; 2224 struct btrfs_path *log_path; 2225 int i; 2226 int nritems; 2227 int ret; 2228 2229 log_path = btrfs_alloc_path(); 2230 if (!log_path) 2231 return -ENOMEM; 2232 2233 search_key.objectid = ino; 2234 search_key.type = BTRFS_XATTR_ITEM_KEY; 2235 search_key.offset = 0; 2236 again: 2237 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2238 if (ret < 0) 2239 goto out; 2240 process_leaf: 2241 nritems = btrfs_header_nritems(path->nodes[0]); 2242 for (i = path->slots[0]; i < nritems; i++) { 2243 struct btrfs_key key; 2244 struct btrfs_dir_item *di; 2245 struct btrfs_dir_item *log_di; 2246 u32 total_size; 2247 u32 cur; 2248 2249 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2250 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2251 ret = 0; 2252 goto out; 2253 } 2254 2255 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2256 total_size = btrfs_item_size(path->nodes[0], i); 2257 cur = 0; 2258 while (cur < total_size) { 2259 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2260 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2261 u32 this_len = sizeof(*di) + name_len + data_len; 2262 char *name; 2263 2264 name = kmalloc(name_len, GFP_NOFS); 2265 if (!name) { 2266 ret = -ENOMEM; 2267 goto out; 2268 } 2269 read_extent_buffer(path->nodes[0], name, 2270 (unsigned long)(di + 1), name_len); 2271 2272 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2273 name, name_len, 0); 2274 btrfs_release_path(log_path); 2275 if (!log_di) { 2276 /* Doesn't exist in log tree, so delete it. 
*/ 2277 btrfs_release_path(path); 2278 di = btrfs_lookup_xattr(trans, root, path, ino, 2279 name, name_len, -1); 2280 kfree(name); 2281 if (IS_ERR(di)) { 2282 ret = PTR_ERR(di); 2283 goto out; 2284 } 2285 ASSERT(di); 2286 ret = btrfs_delete_one_dir_name(trans, root, 2287 path, di); 2288 if (ret) 2289 goto out; 2290 btrfs_release_path(path); 2291 search_key = key; 2292 goto again; 2293 } 2294 kfree(name); 2295 if (IS_ERR(log_di)) { 2296 ret = PTR_ERR(log_di); 2297 goto out; 2298 } 2299 cur += this_len; 2300 di = (struct btrfs_dir_item *)((char *)di + this_len); 2301 } 2302 } 2303 ret = btrfs_next_leaf(root, path); 2304 if (ret > 0) 2305 ret = 0; 2306 else if (ret == 0) 2307 goto process_leaf; 2308 out: 2309 btrfs_free_path(log_path); 2310 btrfs_release_path(path); 2311 return ret; 2312 } 2313 2314 2315 /* 2316 * deletion replay happens before we copy any new directory items 2317 * out of the log or out of backreferences from inodes. It 2318 * scans the log to find ranges of keys that log is authoritative for, 2319 * and then scans the directory to find items in those ranges that are 2320 * not present in the log. 2321 * 2322 * Anything we don't find in the log is unlinked and removed from the 2323 * directory. 2324 */ 2325 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2326 struct btrfs_root *root, 2327 struct btrfs_root *log, 2328 struct btrfs_path *path, 2329 u64 dirid, int del_all) 2330 { 2331 u64 range_start; 2332 u64 range_end; 2333 int ret = 0; 2334 struct btrfs_key dir_key; 2335 struct btrfs_key found_key; 2336 struct btrfs_path *log_path; 2337 struct inode *dir; 2338 2339 dir_key.objectid = dirid; 2340 dir_key.type = BTRFS_DIR_INDEX_KEY; 2341 log_path = btrfs_alloc_path(); 2342 if (!log_path) 2343 return -ENOMEM; 2344 2345 dir = read_one_inode(root, dirid); 2346 /* it isn't an error if the inode isn't there, that can happen 2347 * because we replay the deletes before we copy in the inode item 2348 * from the log 2349 */ 2350 if (!dir) { 2351 btrfs_free_path(log_path); 2352 return 0; 2353 } 2354 2355 range_start = 0; 2356 range_end = 0; 2357 while (1) { 2358 if (del_all) 2359 range_end = (u64)-1; 2360 else { 2361 ret = find_dir_range(log, path, dirid, 2362 &range_start, &range_end); 2363 if (ret < 0) 2364 goto out; 2365 else if (ret > 0) 2366 break; 2367 } 2368 2369 dir_key.offset = range_start; 2370 while (1) { 2371 int nritems; 2372 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2373 0, 0); 2374 if (ret < 0) 2375 goto out; 2376 2377 nritems = btrfs_header_nritems(path->nodes[0]); 2378 if (path->slots[0] >= nritems) { 2379 ret = btrfs_next_leaf(root, path); 2380 if (ret == 1) 2381 break; 2382 else if (ret < 0) 2383 goto out; 2384 } 2385 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2386 path->slots[0]); 2387 if (found_key.objectid != dirid || 2388 found_key.type != dir_key.type) { 2389 ret = 0; 2390 goto out; 2391 } 2392 2393 if (found_key.offset > range_end) 2394 break; 2395 2396 ret = check_item_in_log(trans, log, path, 2397 log_path, dir, 2398 &found_key); 2399 if (ret) 2400 goto out; 2401 if (found_key.offset == (u64)-1) 2402 break; 2403 dir_key.offset = found_key.offset + 1; 2404 } 2405 btrfs_release_path(path); 2406 if (range_end == (u64)-1) 2407 break; 2408 range_start = range_end + 1; 2409 } 2410 ret = 0; 2411 out: 2412 btrfs_release_path(path); 2413 btrfs_free_path(log_path); 2414 iput(dir); 2415 return ret; 2416 } 2417 2418 /* 2419 * the process_func used to replay items from the log tree. This 2420 * gets called in two different stages. 
The first stage just looks 2421 * for inodes and makes sure they are all copied into the subvolume. 2422 * 2423 * The second stage copies all the other item types from the log into 2424 * the subvolume. The two stage approach is slower, but gets rid of 2425 * lots of complexity around inodes referencing other inodes that exist 2426 * only in the log (references come from either directory items or inode 2427 * back refs). 2428 */ 2429 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2430 struct walk_control *wc, u64 gen, int level) 2431 { 2432 int nritems; 2433 struct btrfs_path *path; 2434 struct btrfs_root *root = wc->replay_dest; 2435 struct btrfs_key key; 2436 int i; 2437 int ret; 2438 2439 ret = btrfs_read_extent_buffer(eb, gen, level, NULL); 2440 if (ret) 2441 return ret; 2442 2443 level = btrfs_header_level(eb); 2444 2445 if (level != 0) 2446 return 0; 2447 2448 path = btrfs_alloc_path(); 2449 if (!path) 2450 return -ENOMEM; 2451 2452 nritems = btrfs_header_nritems(eb); 2453 for (i = 0; i < nritems; i++) { 2454 btrfs_item_key_to_cpu(eb, &key, i); 2455 2456 /* inode keys are done during the first stage */ 2457 if (key.type == BTRFS_INODE_ITEM_KEY && 2458 wc->stage == LOG_WALK_REPLAY_INODES) { 2459 struct btrfs_inode_item *inode_item; 2460 u32 mode; 2461 2462 inode_item = btrfs_item_ptr(eb, i, 2463 struct btrfs_inode_item); 2464 /* 2465 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2466 * and never got linked before the fsync, skip it, as 2467 * replaying it is pointless since it would be deleted 2468 * later. We skip logging tmpfiles, but it's always 2469 * possible we are replaying a log created with a kernel 2470 * that used to log tmpfiles. 2471 */ 2472 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2473 wc->ignore_cur_inode = true; 2474 continue; 2475 } else { 2476 wc->ignore_cur_inode = false; 2477 } 2478 ret = replay_xattr_deletes(wc->trans, root, log, 2479 path, key.objectid); 2480 if (ret) 2481 break; 2482 mode = btrfs_inode_mode(eb, inode_item); 2483 if (S_ISDIR(mode)) { 2484 ret = replay_dir_deletes(wc->trans, 2485 root, log, path, key.objectid, 0); 2486 if (ret) 2487 break; 2488 } 2489 ret = overwrite_item(wc->trans, root, path, 2490 eb, i, &key); 2491 if (ret) 2492 break; 2493 2494 /* 2495 * Before replaying extents, truncate the inode to its 2496 * size. We need to do it now and not after log replay 2497 * because before an fsync we can have prealloc extents 2498 * added beyond the inode's i_size. If we did it after, 2499 * through orphan cleanup for example, we would drop 2500 * those prealloc extents just after replaying them. 2501 */ 2502 if (S_ISREG(mode)) { 2503 struct btrfs_drop_extents_args drop_args = { 0 }; 2504 struct inode *inode; 2505 u64 from; 2506 2507 inode = read_one_inode(root, key.objectid); 2508 if (!inode) { 2509 ret = -EIO; 2510 break; 2511 } 2512 from = ALIGN(i_size_read(inode), 2513 root->fs_info->sectorsize); 2514 drop_args.start = from; 2515 drop_args.end = (u64)-1; 2516 drop_args.drop_cache = true; 2517 ret = btrfs_drop_extents(wc->trans, root, 2518 BTRFS_I(inode), 2519 &drop_args); 2520 if (!ret) { 2521 inode_sub_bytes(inode, 2522 drop_args.bytes_found); 2523 /* Update the inode's nbytes. 
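inode_sub_bytes() adjusted it above, after dropping the extents beyond the inode's i_size, and btrfs_update_inode() below persists the new value in the inode item.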
*/ 2524 ret = btrfs_update_inode(wc->trans, 2525 root, BTRFS_I(inode)); 2526 } 2527 iput(inode); 2528 if (ret) 2529 break; 2530 } 2531 2532 ret = link_to_fixup_dir(wc->trans, root, 2533 path, key.objectid); 2534 if (ret) 2535 break; 2536 } 2537 2538 if (wc->ignore_cur_inode) 2539 continue; 2540 2541 if (key.type == BTRFS_DIR_INDEX_KEY && 2542 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2543 ret = replay_one_dir_item(wc->trans, root, path, 2544 eb, i, &key); 2545 if (ret) 2546 break; 2547 } 2548 2549 if (wc->stage < LOG_WALK_REPLAY_ALL) 2550 continue; 2551 2552 /* these keys are simply copied */ 2553 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2554 ret = overwrite_item(wc->trans, root, path, 2555 eb, i, &key); 2556 if (ret) 2557 break; 2558 } else if (key.type == BTRFS_INODE_REF_KEY || 2559 key.type == BTRFS_INODE_EXTREF_KEY) { 2560 ret = add_inode_ref(wc->trans, root, log, path, 2561 eb, i, &key); 2562 if (ret && ret != -ENOENT) 2563 break; 2564 ret = 0; 2565 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2566 ret = replay_one_extent(wc->trans, root, path, 2567 eb, i, &key); 2568 if (ret) 2569 break; 2570 } 2571 /* 2572 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2573 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2574 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2575 * older kernel with such keys, ignore them. 2576 */ 2577 } 2578 btrfs_free_path(path); 2579 return ret; 2580 } 2581 2582 /* 2583 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2584 */ 2585 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2586 { 2587 struct btrfs_block_group *cache; 2588 2589 cache = btrfs_lookup_block_group(fs_info, start); 2590 if (!cache) { 2591 btrfs_err(fs_info, "unable to find block group for %llu", start); 2592 return; 2593 } 2594 2595 spin_lock(&cache->space_info->lock); 2596 spin_lock(&cache->lock); 2597 cache->reserved -= fs_info->nodesize; 2598 cache->space_info->bytes_reserved -= fs_info->nodesize; 2599 spin_unlock(&cache->lock); 2600 spin_unlock(&cache->space_info->lock); 2601 2602 btrfs_put_block_group(cache); 2603 } 2604 2605 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2606 struct btrfs_root *root, 2607 struct btrfs_path *path, int *level, 2608 struct walk_control *wc) 2609 { 2610 struct btrfs_fs_info *fs_info = root->fs_info; 2611 u64 bytenr; 2612 u64 ptr_gen; 2613 struct extent_buffer *next; 2614 struct extent_buffer *cur; 2615 u32 blocksize; 2616 int ret = 0; 2617 2618 while (*level > 0) { 2619 struct btrfs_key first_key; 2620 2621 cur = path->nodes[*level]; 2622 2623 WARN_ON(btrfs_header_level(cur) != *level); 2624 2625 if (path->slots[*level] >= 2626 btrfs_header_nritems(cur)) 2627 break; 2628 2629 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2630 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2631 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2632 blocksize = fs_info->nodesize; 2633 2634 next = btrfs_find_create_tree_block(fs_info, bytenr, 2635 btrfs_header_owner(cur), 2636 *level - 1); 2637 if (IS_ERR(next)) 2638 return PTR_ERR(next); 2639 2640 if (*level == 1) { 2641 ret = wc->process_func(root, next, wc, ptr_gen, 2642 *level - 1); 2643 if (ret) { 2644 free_extent_buffer(next); 2645 return ret; 2646 } 2647 2648 path->slots[*level]++; 2649 if (wc->free) { 2650 ret = btrfs_read_extent_buffer(next, ptr_gen, 2651 *level - 1, &first_key); 2652 if (ret) { 2653 free_extent_buffer(next); 2654 return ret; 2655 } 2656 2657 if (trans) { 2658 
btrfs_tree_lock(next); 2659 btrfs_clean_tree_block(next); 2660 btrfs_wait_tree_block_writeback(next); 2661 btrfs_tree_unlock(next); 2662 ret = btrfs_pin_reserved_extent(trans, 2663 bytenr, blocksize); 2664 if (ret) { 2665 free_extent_buffer(next); 2666 return ret; 2667 } 2668 btrfs_redirty_list_add( 2669 trans->transaction, next); 2670 } else { 2671 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2672 clear_extent_buffer_dirty(next); 2673 unaccount_log_buffer(fs_info, bytenr); 2674 } 2675 } 2676 free_extent_buffer(next); 2677 continue; 2678 } 2679 ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); 2680 if (ret) { 2681 free_extent_buffer(next); 2682 return ret; 2683 } 2684 2685 if (path->nodes[*level-1]) 2686 free_extent_buffer(path->nodes[*level-1]); 2687 path->nodes[*level-1] = next; 2688 *level = btrfs_header_level(next); 2689 path->slots[*level] = 0; 2690 cond_resched(); 2691 } 2692 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2693 2694 cond_resched(); 2695 return 0; 2696 } 2697 2698 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2699 struct btrfs_root *root, 2700 struct btrfs_path *path, int *level, 2701 struct walk_control *wc) 2702 { 2703 struct btrfs_fs_info *fs_info = root->fs_info; 2704 int i; 2705 int slot; 2706 int ret; 2707 2708 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2709 slot = path->slots[i]; 2710 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2711 path->slots[i]++; 2712 *level = i; 2713 WARN_ON(*level == 0); 2714 return 0; 2715 } else { 2716 ret = wc->process_func(root, path->nodes[*level], wc, 2717 btrfs_header_generation(path->nodes[*level]), 2718 *level); 2719 if (ret) 2720 return ret; 2721 2722 if (wc->free) { 2723 struct extent_buffer *next; 2724 2725 next = path->nodes[*level]; 2726 2727 if (trans) { 2728 btrfs_tree_lock(next); 2729 btrfs_clean_tree_block(next); 2730 btrfs_wait_tree_block_writeback(next); 2731 btrfs_tree_unlock(next); 2732 ret = btrfs_pin_reserved_extent(trans, 2733 path->nodes[*level]->start, 2734 path->nodes[*level]->len); 2735 if (ret) 2736 return ret; 2737 btrfs_redirty_list_add(trans->transaction, 2738 next); 2739 } else { 2740 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2741 clear_extent_buffer_dirty(next); 2742 2743 unaccount_log_buffer(fs_info, 2744 path->nodes[*level]->start); 2745 } 2746 } 2747 free_extent_buffer(path->nodes[*level]); 2748 path->nodes[*level] = NULL; 2749 *level = i + 1; 2750 } 2751 } 2752 return 1; 2753 } 2754 2755 /* 2756 * drop the reference count on the tree rooted at 'snap'. This traverses 2757 * the tree freeing any blocks that have a ref count of zero after being 2758 * decremented. 
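The walk calls wc->process_func on each buffer and, when wc->free is set, either pins the log tree blocks (when given a transaction handle) or unaccounts their reserved space via unaccount_log_buffer().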
2759 */ 2760 static int walk_log_tree(struct btrfs_trans_handle *trans, 2761 struct btrfs_root *log, struct walk_control *wc) 2762 { 2763 struct btrfs_fs_info *fs_info = log->fs_info; 2764 int ret = 0; 2765 int wret; 2766 int level; 2767 struct btrfs_path *path; 2768 int orig_level; 2769 2770 path = btrfs_alloc_path(); 2771 if (!path) 2772 return -ENOMEM; 2773 2774 level = btrfs_header_level(log->node); 2775 orig_level = level; 2776 path->nodes[level] = log->node; 2777 atomic_inc(&log->node->refs); 2778 path->slots[level] = 0; 2779 2780 while (1) { 2781 wret = walk_down_log_tree(trans, log, path, &level, wc); 2782 if (wret > 0) 2783 break; 2784 if (wret < 0) { 2785 ret = wret; 2786 goto out; 2787 } 2788 2789 wret = walk_up_log_tree(trans, log, path, &level, wc); 2790 if (wret > 0) 2791 break; 2792 if (wret < 0) { 2793 ret = wret; 2794 goto out; 2795 } 2796 } 2797 2798 /* was the root node processed? if not, catch it here */ 2799 if (path->nodes[orig_level]) { 2800 ret = wc->process_func(log, path->nodes[orig_level], wc, 2801 btrfs_header_generation(path->nodes[orig_level]), 2802 orig_level); 2803 if (ret) 2804 goto out; 2805 if (wc->free) { 2806 struct extent_buffer *next; 2807 2808 next = path->nodes[orig_level]; 2809 2810 if (trans) { 2811 btrfs_tree_lock(next); 2812 btrfs_clean_tree_block(next); 2813 btrfs_wait_tree_block_writeback(next); 2814 btrfs_tree_unlock(next); 2815 ret = btrfs_pin_reserved_extent(trans, 2816 next->start, next->len); 2817 if (ret) 2818 goto out; 2819 btrfs_redirty_list_add(trans->transaction, next); 2820 } else { 2821 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2822 clear_extent_buffer_dirty(next); 2823 unaccount_log_buffer(fs_info, next->start); 2824 } 2825 } 2826 } 2827 2828 out: 2829 btrfs_free_path(path); 2830 return ret; 2831 } 2832 2833 /* 2834 * helper function to update the item for a given subvolumes log root 2835 * in the tree of log roots 2836 */ 2837 static int update_log_root(struct btrfs_trans_handle *trans, 2838 struct btrfs_root *log, 2839 struct btrfs_root_item *root_item) 2840 { 2841 struct btrfs_fs_info *fs_info = log->fs_info; 2842 int ret; 2843 2844 if (log->log_transid == 1) { 2845 /* insert root item on the first sync */ 2846 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2847 &log->root_key, root_item); 2848 } else { 2849 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2850 &log->root_key, root_item); 2851 } 2852 return ret; 2853 } 2854 2855 static void wait_log_commit(struct btrfs_root *root, int transid) 2856 { 2857 DEFINE_WAIT(wait); 2858 int index = transid % 2; 2859 2860 /* 2861 * we only allow two pending log transactions at a time, 2862 * so we know that if ours is more than 2 older than the 2863 * current transaction, we're done 2864 */ 2865 for (;;) { 2866 prepare_to_wait(&root->log_commit_wait[index], 2867 &wait, TASK_UNINTERRUPTIBLE); 2868 2869 if (!(root->log_transid_committed < transid && 2870 atomic_read(&root->log_commit[index]))) 2871 break; 2872 2873 mutex_unlock(&root->log_mutex); 2874 schedule(); 2875 mutex_lock(&root->log_mutex); 2876 } 2877 finish_wait(&root->log_commit_wait[index], &wait); 2878 } 2879 2880 static void wait_for_writer(struct btrfs_root *root) 2881 { 2882 DEFINE_WAIT(wait); 2883 2884 for (;;) { 2885 prepare_to_wait(&root->log_writer_wait, &wait, 2886 TASK_UNINTERRUPTIBLE); 2887 if (!atomic_read(&root->log_writers)) 2888 break; 2889 2890 mutex_unlock(&root->log_mutex); 2891 schedule(); 2892 mutex_lock(&root->log_mutex); 2893 } 2894 finish_wait(&root->log_writer_wait, 
&wait); 2895 } 2896 2897 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 2898 struct btrfs_log_ctx *ctx) 2899 { 2900 mutex_lock(&root->log_mutex); 2901 list_del_init(&ctx->list); 2902 mutex_unlock(&root->log_mutex); 2903 } 2904 2905 /* 2906 * Invoked in log mutex context, or be sure there is no other task which 2907 * can access the list. 2908 */ 2909 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 2910 int index, int error) 2911 { 2912 struct btrfs_log_ctx *ctx; 2913 struct btrfs_log_ctx *safe; 2914 2915 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 2916 list_del_init(&ctx->list); 2917 ctx->log_ret = error; 2918 } 2919 } 2920 2921 /* 2922 * btrfs_sync_log does sends a given tree log down to the disk and 2923 * updates the super blocks to record it. When this call is done, 2924 * you know that any inodes previously logged are safely on disk only 2925 * if it returns 0. 2926 * 2927 * Any other return value means you need to call btrfs_commit_transaction. 2928 * Some of the edge cases for fsyncing directories that have had unlinks 2929 * or renames done in the past mean that sometimes the only safe 2930 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 2931 * that has happened. 2932 */ 2933 int btrfs_sync_log(struct btrfs_trans_handle *trans, 2934 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 2935 { 2936 int index1; 2937 int index2; 2938 int mark; 2939 int ret; 2940 struct btrfs_fs_info *fs_info = root->fs_info; 2941 struct btrfs_root *log = root->log_root; 2942 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 2943 struct btrfs_root_item new_root_item; 2944 int log_transid = 0; 2945 struct btrfs_log_ctx root_log_ctx; 2946 struct blk_plug plug; 2947 u64 log_root_start; 2948 u64 log_root_level; 2949 2950 mutex_lock(&root->log_mutex); 2951 log_transid = ctx->log_transid; 2952 if (root->log_transid_committed >= log_transid) { 2953 mutex_unlock(&root->log_mutex); 2954 return ctx->log_ret; 2955 } 2956 2957 index1 = log_transid % 2; 2958 if (atomic_read(&root->log_commit[index1])) { 2959 wait_log_commit(root, log_transid); 2960 mutex_unlock(&root->log_mutex); 2961 return ctx->log_ret; 2962 } 2963 ASSERT(log_transid == root->log_transid); 2964 atomic_set(&root->log_commit[index1], 1); 2965 2966 /* wait for previous tree log sync to complete */ 2967 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2968 wait_log_commit(root, log_transid - 1); 2969 2970 while (1) { 2971 int batch = atomic_read(&root->log_batch); 2972 /* when we're on an ssd, just kick the log commit out */ 2973 if (!btrfs_test_opt(fs_info, SSD) && 2974 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 2975 mutex_unlock(&root->log_mutex); 2976 schedule_timeout_uninterruptible(1); 2977 mutex_lock(&root->log_mutex); 2978 } 2979 wait_for_writer(root); 2980 if (batch == atomic_read(&root->log_batch)) 2981 break; 2982 } 2983 2984 /* bail out if we need to do a full commit */ 2985 if (btrfs_need_log_full_commit(trans)) { 2986 ret = BTRFS_LOG_FORCE_COMMIT; 2987 mutex_unlock(&root->log_mutex); 2988 goto out; 2989 } 2990 2991 if (log_transid % 2 == 0) 2992 mark = EXTENT_DIRTY; 2993 else 2994 mark = EXTENT_NEW; 2995 2996 /* we start IO on all the marked extents here, but we don't actually 2997 * wait for them until later. 
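The writes are issued under a block plug so the bios can be submitted as a single batch.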
2998 */ 2999 blk_start_plug(&plug); 3000 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3001 /* 3002 * -EAGAIN happens when someone, e.g., a concurrent transaction 3003 * commit, writes a dirty extent in this tree-log commit. This 3004 * concurrent write will create a hole writing out the extents, 3005 * and we cannot proceed on a zoned filesystem, requiring 3006 * sequential writing. While we can bail out to a full commit 3007 * here, but we can continue hoping the concurrent writing fills 3008 * the hole. 3009 */ 3010 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 3011 ret = 0; 3012 if (ret) { 3013 blk_finish_plug(&plug); 3014 btrfs_abort_transaction(trans, ret); 3015 btrfs_set_log_full_commit(trans); 3016 mutex_unlock(&root->log_mutex); 3017 goto out; 3018 } 3019 3020 /* 3021 * We _must_ update under the root->log_mutex in order to make sure we 3022 * have a consistent view of the log root we are trying to commit at 3023 * this moment. 3024 * 3025 * We _must_ copy this into a local copy, because we are not holding the 3026 * log_root_tree->log_mutex yet. This is important because when we 3027 * commit the log_root_tree we must have a consistent view of the 3028 * log_root_tree when we update the super block to point at the 3029 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3030 * with the commit and possibly point at the new block which we may not 3031 * have written out. 3032 */ 3033 btrfs_set_root_node(&log->root_item, log->node); 3034 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3035 3036 root->log_transid++; 3037 log->log_transid = root->log_transid; 3038 root->log_start_pid = 0; 3039 /* 3040 * IO has been started, blocks of the log tree have WRITTEN flag set 3041 * in their headers. new modifications of the log will be written to 3042 * new positions. so it's safe to allow log writers to go in. 3043 */ 3044 mutex_unlock(&root->log_mutex); 3045 3046 if (btrfs_is_zoned(fs_info)) { 3047 mutex_lock(&fs_info->tree_root->log_mutex); 3048 if (!log_root_tree->node) { 3049 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3050 if (ret) { 3051 mutex_unlock(&fs_info->tree_root->log_mutex); 3052 blk_finish_plug(&plug); 3053 goto out; 3054 } 3055 } 3056 mutex_unlock(&fs_info->tree_root->log_mutex); 3057 } 3058 3059 btrfs_init_log_ctx(&root_log_ctx, NULL); 3060 3061 mutex_lock(&log_root_tree->log_mutex); 3062 3063 index2 = log_root_tree->log_transid % 2; 3064 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3065 root_log_ctx.log_transid = log_root_tree->log_transid; 3066 3067 /* 3068 * Now we are safe to update the log_root_tree because we're under the 3069 * log_mutex, and we're a current writer so we're holding the commit 3070 * open until we drop the log_mutex. 
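update_log_root() inserts the log root item into the log root tree on the first sync of this log and updates it on subsequent syncs.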
3071 */ 3072 ret = update_log_root(trans, log, &new_root_item); 3073 if (ret) { 3074 if (!list_empty(&root_log_ctx.list)) 3075 list_del_init(&root_log_ctx.list); 3076 3077 blk_finish_plug(&plug); 3078 btrfs_set_log_full_commit(trans); 3079 3080 if (ret != -ENOSPC) { 3081 btrfs_abort_transaction(trans, ret); 3082 mutex_unlock(&log_root_tree->log_mutex); 3083 goto out; 3084 } 3085 btrfs_wait_tree_log_extents(log, mark); 3086 mutex_unlock(&log_root_tree->log_mutex); 3087 ret = BTRFS_LOG_FORCE_COMMIT; 3088 goto out; 3089 } 3090 3091 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3092 blk_finish_plug(&plug); 3093 list_del_init(&root_log_ctx.list); 3094 mutex_unlock(&log_root_tree->log_mutex); 3095 ret = root_log_ctx.log_ret; 3096 goto out; 3097 } 3098 3099 index2 = root_log_ctx.log_transid % 2; 3100 if (atomic_read(&log_root_tree->log_commit[index2])) { 3101 blk_finish_plug(&plug); 3102 ret = btrfs_wait_tree_log_extents(log, mark); 3103 wait_log_commit(log_root_tree, 3104 root_log_ctx.log_transid); 3105 mutex_unlock(&log_root_tree->log_mutex); 3106 if (!ret) 3107 ret = root_log_ctx.log_ret; 3108 goto out; 3109 } 3110 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3111 atomic_set(&log_root_tree->log_commit[index2], 1); 3112 3113 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3114 wait_log_commit(log_root_tree, 3115 root_log_ctx.log_transid - 1); 3116 } 3117 3118 /* 3119 * now that we've moved on to the tree of log tree roots, 3120 * check the full commit flag again 3121 */ 3122 if (btrfs_need_log_full_commit(trans)) { 3123 blk_finish_plug(&plug); 3124 btrfs_wait_tree_log_extents(log, mark); 3125 mutex_unlock(&log_root_tree->log_mutex); 3126 ret = BTRFS_LOG_FORCE_COMMIT; 3127 goto out_wake_log_root; 3128 } 3129 3130 ret = btrfs_write_marked_extents(fs_info, 3131 &log_root_tree->dirty_log_pages, 3132 EXTENT_DIRTY | EXTENT_NEW); 3133 blk_finish_plug(&plug); 3134 /* 3135 * As described above, -EAGAIN indicates a hole in the extents. We 3136 * cannot wait for these write outs since the waiting cause a 3137 * deadlock. Bail out to the full commit instead. 
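This differs from the earlier write of the subvolume log tree extents above, where an -EAGAIN on a zoned filesystem is ignored in the hope that the concurrent writer fills the hole.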
3138 */ 3139 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3140 btrfs_set_log_full_commit(trans); 3141 btrfs_wait_tree_log_extents(log, mark); 3142 mutex_unlock(&log_root_tree->log_mutex); 3143 goto out_wake_log_root; 3144 } else if (ret) { 3145 btrfs_set_log_full_commit(trans); 3146 btrfs_abort_transaction(trans, ret); 3147 mutex_unlock(&log_root_tree->log_mutex); 3148 goto out_wake_log_root; 3149 } 3150 ret = btrfs_wait_tree_log_extents(log, mark); 3151 if (!ret) 3152 ret = btrfs_wait_tree_log_extents(log_root_tree, 3153 EXTENT_NEW | EXTENT_DIRTY); 3154 if (ret) { 3155 btrfs_set_log_full_commit(trans); 3156 mutex_unlock(&log_root_tree->log_mutex); 3157 goto out_wake_log_root; 3158 } 3159 3160 log_root_start = log_root_tree->node->start; 3161 log_root_level = btrfs_header_level(log_root_tree->node); 3162 log_root_tree->log_transid++; 3163 mutex_unlock(&log_root_tree->log_mutex); 3164 3165 /* 3166 * Here we are guaranteed that nobody is going to write the superblock 3167 * for the current transaction before us, and that we do not write 3168 * our superblock before the previous transaction finishes its commit 3169 * and writes its superblock, because: 3170 * 3171 * 1) We are holding a handle on the current transaction, so nobody 3172 * can commit it until we release the handle; 3173 * 3174 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3175 * if the previous transaction is still committing, and hasn't yet 3176 * written its superblock, we wait for it to do it, because a 3177 * transaction commit acquires the tree_log_mutex when the commit 3178 * begins and releases it only after writing its superblock. 3179 */ 3180 mutex_lock(&fs_info->tree_log_mutex); 3181 3182 /* 3183 * The previous transaction writeout phase could have failed, and thus 3184 * marked the fs in an error state. We must not commit here, as we 3185 * could have updated our generation in the super_for_commit and 3186 * writing the super here would result in transid mismatches. If there 3187 * is an error here just bail. 3188 */ 3189 if (BTRFS_FS_ERROR(fs_info)) { 3190 ret = -EIO; 3191 btrfs_set_log_full_commit(trans); 3192 btrfs_abort_transaction(trans, ret); 3193 mutex_unlock(&fs_info->tree_log_mutex); 3194 goto out_wake_log_root; 3195 } 3196 3197 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3198 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3199 ret = write_all_supers(fs_info, 1); 3200 mutex_unlock(&fs_info->tree_log_mutex); 3201 if (ret) { 3202 btrfs_set_log_full_commit(trans); 3203 btrfs_abort_transaction(trans, ret); 3204 goto out_wake_log_root; 3205 } 3206 3207 /* 3208 * We know there can only be one task here, since we have not yet set 3209 * root->log_commit[index1] to 0 and any task attempting to sync the 3210 * log must wait for the previous log transaction to commit if it's 3211 * still in progress or wait for the current log transaction commit if 3212 * someone else already started it. We use <= and not < because the 3213 * first log transaction has an ID of 0.
3214 */ 3215 ASSERT(root->last_log_commit <= log_transid); 3216 root->last_log_commit = log_transid; 3217 3218 out_wake_log_root: 3219 mutex_lock(&log_root_tree->log_mutex); 3220 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3221 3222 log_root_tree->log_transid_committed++; 3223 atomic_set(&log_root_tree->log_commit[index2], 0); 3224 mutex_unlock(&log_root_tree->log_mutex); 3225 3226 /* 3227 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3228 * all the updates above are seen by the woken threads. It might not be 3229 * necessary, but proving that seems to be hard. 3230 */ 3231 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3232 out: 3233 mutex_lock(&root->log_mutex); 3234 btrfs_remove_all_log_ctxs(root, index1, ret); 3235 root->log_transid_committed++; 3236 atomic_set(&root->log_commit[index1], 0); 3237 mutex_unlock(&root->log_mutex); 3238 3239 /* 3240 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3241 * all the updates above are seen by the woken threads. It might not be 3242 * necessary, but proving that seems to be hard. 3243 */ 3244 cond_wake_up(&root->log_commit_wait[index1]); 3245 return ret; 3246 } 3247 3248 static void free_log_tree(struct btrfs_trans_handle *trans, 3249 struct btrfs_root *log) 3250 { 3251 int ret; 3252 struct walk_control wc = { 3253 .free = 1, 3254 .process_func = process_one_buffer 3255 }; 3256 3257 if (log->node) { 3258 ret = walk_log_tree(trans, log, &wc); 3259 if (ret) { 3260 /* 3261 * We weren't able to traverse the entire log tree, the 3262 * typical scenario is getting an -EIO when reading an 3263 * extent buffer of the tree, due to a previous writeback 3264 * failure of it. 3265 */ 3266 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3267 &log->fs_info->fs_state); 3268 3269 /* 3270 * Some extent buffers of the log tree may still be dirty 3271 * and not yet written back to storage, because we may 3272 * have updates to a log tree without syncing a log tree, 3273 * such as during rename and link operations. So flush 3274 * them out and wait for their writeback to complete, so 3275 * that we properly cleanup their state and pages. 3276 */ 3277 btrfs_write_marked_extents(log->fs_info, 3278 &log->dirty_log_pages, 3279 EXTENT_DIRTY | EXTENT_NEW); 3280 btrfs_wait_tree_log_extents(log, 3281 EXTENT_DIRTY | EXTENT_NEW); 3282 3283 if (trans) 3284 btrfs_abort_transaction(trans, ret); 3285 else 3286 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3287 } 3288 } 3289 3290 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3291 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3292 extent_io_tree_release(&log->log_csum_range); 3293 3294 btrfs_put_root(log); 3295 } 3296 3297 /* 3298 * free all the extents used by the tree log. This should be called 3299 * at commit time of the full transaction 3300 */ 3301 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3302 { 3303 if (root->log_root) { 3304 free_log_tree(trans, root->log_root); 3305 root->log_root = NULL; 3306 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3307 } 3308 return 0; 3309 } 3310 3311 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3312 struct btrfs_fs_info *fs_info) 3313 { 3314 if (fs_info->log_root_tree) { 3315 free_log_tree(trans, fs_info->log_root_tree); 3316 fs_info->log_root_tree = NULL; 3317 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3318 } 3319 return 0; 3320 } 3321 3322 /* 3323 * Check if an inode was logged in the current transaction. 
This correctly deals 3324 * with the case where the inode was logged but has a logged_trans of 0, which 3325 * happens if the inode is evicted and loaded again, as logged_trans is an in 3326 * memory only field (not persisted). 3327 * 3328 * Returns 1 if the inode was logged before in the transaction, 0 if it was not, 3329 * and < 0 on error. 3330 */ 3331 static int inode_logged(struct btrfs_trans_handle *trans, 3332 struct btrfs_inode *inode, 3333 struct btrfs_path *path_in) 3334 { 3335 struct btrfs_path *path = path_in; 3336 struct btrfs_key key; 3337 int ret; 3338 3339 if (inode->logged_trans == trans->transid) 3340 return 1; 3341 3342 /* 3343 * If logged_trans is not 0, then we know the inode was not logged 3344 * in this transaction, so we can return false right away. 3345 */ 3346 if (inode->logged_trans > 0) 3347 return 0; 3348 3349 /* 3350 * If no log tree was created for this root in this transaction, then 3351 * the inode cannot have been logged in this transaction. In that case 3352 * set logged_trans to anything greater than 0 and less than the current 3353 * transaction's ID, to avoid the search below in a future call in case 3354 * a log tree gets created after this. 3355 */ 3356 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3357 inode->logged_trans = trans->transid - 1; 3358 return 0; 3359 } 3360 3361 /* 3362 * We have a log tree and the inode's logged_trans is 0. We can't tell 3363 * for sure if the inode was logged before in this transaction by looking 3364 * only at logged_trans. We could be pessimistic and assume it was, but 3365 * that can lead to unnecessarily logging an inode during rename and link 3366 * operations, and then further updating the log in followup rename and 3367 * link operations, especially if it's a directory, which adds latency 3368 * visible to applications doing a series of rename or link operations. 3369 * 3370 * A logged_trans of 0 here can mean several things: 3371 * 3372 * 1) The inode was never logged since the filesystem was mounted, and may 3373 * or may not have been evicted and loaded again; 3374 * 3375 * 2) The inode was logged in a previous transaction, then evicted and 3376 * then loaded again; 3377 * 3378 * 3) The inode was logged in the current transaction, then evicted and 3379 * then loaded again. 3380 * 3381 * For cases 1) and 2) we don't want to return true, but we need to detect 3382 * case 3) and return true. So we do a search in the log root for the inode 3383 * item. 3384 */ 3385 key.objectid = btrfs_ino(inode); 3386 key.type = BTRFS_INODE_ITEM_KEY; 3387 key.offset = 0; 3388 3389 if (!path) { 3390 path = btrfs_alloc_path(); 3391 if (!path) 3392 return -ENOMEM; 3393 } 3394 3395 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3396 3397 if (path_in) 3398 btrfs_release_path(path); 3399 else 3400 btrfs_free_path(path); 3401 3402 /* 3403 * Logging an inode always results in logging its inode item. So if we 3404 * did not find the item we know the inode was not logged for sure. 3405 */ 3406 if (ret < 0) { 3407 return ret; 3408 } else if (ret > 0) { 3409 /* 3410 * Set logged_trans to a value greater than 0 and less than the 3411 * current transaction to avoid doing the search in future calls. 3412 */ 3413 inode->logged_trans = trans->transid - 1; 3414 return 0; 3415 } 3416 3417 /* 3418 * The inode was previously logged and then evicted, set logged_trans to 3419 * the current transaction's ID, to avoid future tree searches as long as 3420 * the inode is not evicted again.
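Any future call during the current transaction then returns 1 right away through the logged_trans check at the top of this function.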
3421 */ 3422 inode->logged_trans = trans->transid; 3423 3424 /* 3425 * If it's a directory, then we must set last_dir_index_offset to the 3426 * maximum possible value, so that the next attempt to log the inode does 3427 * not skip checking if dir index keys found in modified subvolume tree 3428 * leaves have been logged before, otherwise it would result in attempts 3429 * to insert duplicate dir index keys in the log tree. This must be done 3430 * because last_dir_index_offset is an in-memory only field, not persisted 3431 * in the inode item or any other on-disk structure, so its value is lost 3432 * once the inode is evicted. 3433 */ 3434 if (S_ISDIR(inode->vfs_inode.i_mode)) 3435 inode->last_dir_index_offset = (u64)-1; 3436 3437 return 1; 3438 } 3439 3440 /* 3441 * Delete a directory entry from the log if it exists. 3442 * 3443 * Returns < 0 on error 3444 * 1 if the entry does not exist 3445 * 0 if the entry existed and was successfully deleted 3446 */ 3447 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3448 struct btrfs_root *log, 3449 struct btrfs_path *path, 3450 u64 dir_ino, 3451 const char *name, int name_len, 3452 u64 index) 3453 { 3454 struct btrfs_dir_item *di; 3455 3456 /* 3457 * We only log dir index items of a directory, so we don't need to look 3458 * for dir item keys. 3459 */ 3460 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3461 index, name, name_len, -1); 3462 if (IS_ERR(di)) 3463 return PTR_ERR(di); 3464 else if (!di) 3465 return 1; 3466 3467 /* 3468 * We do not need to update the size field of the directory's 3469 * inode item because on log replay we update the field to reflect 3470 * all existing entries in the directory (see overwrite_item()). 3471 */ 3472 return btrfs_delete_one_dir_name(trans, log, path, di); 3473 } 3474 3475 /* 3476 * If both a file and directory are logged, and unlinks or renames are 3477 * mixed in, we have a few interesting corners: 3478 * 3479 * create file X in dir Y 3480 * link file X to X.link in dir Y 3481 * fsync file X 3482 * unlink file X but leave X.link 3483 * fsync dir Y 3484 * 3485 * After a crash we would expect only X.link to exist. But file X 3486 * didn't get fsync'd again so the log has back refs for X and X.link. 3487 * 3488 * We solve this by removing directory entries and inode backrefs from the 3489 * log when a file that was logged in the current transaction is 3490 * unlinked. Any later fsync will include the updated log entries, and 3491 * we'll be able to reconstruct the proper directory items from backrefs. 3492 * 3493 * This optimization allows us to avoid relogging the entire inode 3494 * or the entire directory.
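The same reasoning applies to btrfs_del_inode_ref_in_log() below, which removes the inode's back reference from the log on unlink.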
3495 */ 3496 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3497 struct btrfs_root *root, 3498 const char *name, int name_len, 3499 struct btrfs_inode *dir, u64 index) 3500 { 3501 struct btrfs_path *path; 3502 int ret; 3503 3504 ret = inode_logged(trans, dir, NULL); 3505 if (ret == 0) 3506 return; 3507 else if (ret < 0) { 3508 btrfs_set_log_full_commit(trans); 3509 return; 3510 } 3511 3512 ret = join_running_log_trans(root); 3513 if (ret) 3514 return; 3515 3516 mutex_lock(&dir->log_mutex); 3517 3518 path = btrfs_alloc_path(); 3519 if (!path) { 3520 ret = -ENOMEM; 3521 goto out_unlock; 3522 } 3523 3524 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3525 name, name_len, index); 3526 btrfs_free_path(path); 3527 out_unlock: 3528 mutex_unlock(&dir->log_mutex); 3529 if (ret < 0) 3530 btrfs_set_log_full_commit(trans); 3531 btrfs_end_log_trans(root); 3532 } 3533 3534 /* see comments for btrfs_del_dir_entries_in_log */ 3535 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3536 struct btrfs_root *root, 3537 const char *name, int name_len, 3538 struct btrfs_inode *inode, u64 dirid) 3539 { 3540 struct btrfs_root *log; 3541 u64 index; 3542 int ret; 3543 3544 ret = inode_logged(trans, inode, NULL); 3545 if (ret == 0) 3546 return; 3547 else if (ret < 0) { 3548 btrfs_set_log_full_commit(trans); 3549 return; 3550 } 3551 3552 ret = join_running_log_trans(root); 3553 if (ret) 3554 return; 3555 log = root->log_root; 3556 mutex_lock(&inode->log_mutex); 3557 3558 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3559 dirid, &index); 3560 mutex_unlock(&inode->log_mutex); 3561 if (ret < 0 && ret != -ENOENT) 3562 btrfs_set_log_full_commit(trans); 3563 btrfs_end_log_trans(root); 3564 } 3565 3566 /* 3567 * creates a range item in the log for 'dirid'. first_offset and 3568 * last_offset tell us which parts of the key space the log should 3569 * be considered authoritative for. 3570 */ 3571 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3572 struct btrfs_root *log, 3573 struct btrfs_path *path, 3574 u64 dirid, 3575 u64 first_offset, u64 last_offset) 3576 { 3577 int ret; 3578 struct btrfs_key key; 3579 struct btrfs_dir_log_item *item; 3580 3581 key.objectid = dirid; 3582 key.offset = first_offset; 3583 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3584 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3585 /* 3586 * -EEXIST is fine and can happen sporadically when we are logging a 3587 * directory and have concurrent insertions in the subvolume's tree for 3588 * items from other inodes and that result in pushing off some dir items 3589 * from one leaf to another in order to accommodate for the new items. 3590 * This results in logging the same dir index range key. 3591 */ 3592 if (ret && ret != -EEXIST) 3593 return ret; 3594 3595 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3596 struct btrfs_dir_log_item); 3597 if (ret == -EEXIST) { 3598 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); 3599 3600 /* 3601 * btrfs_del_dir_entries_in_log() might have been called during 3602 * an unlink between the initial insertion of this key and the 3603 * current update, or we might be logging a single entry deletion 3604 * during a rename, so set the new last_offset to the max value. 
3605 */ 3606 last_offset = max(last_offset, curr_end); 3607 } 3608 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3609 btrfs_mark_buffer_dirty(path->nodes[0]); 3610 btrfs_release_path(path); 3611 return 0; 3612 } 3613 3614 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3615 struct btrfs_root *log, 3616 struct extent_buffer *src, 3617 struct btrfs_path *dst_path, 3618 int start_slot, 3619 int count) 3620 { 3621 char *ins_data = NULL; 3622 struct btrfs_item_batch batch; 3623 struct extent_buffer *dst; 3624 unsigned long src_offset; 3625 unsigned long dst_offset; 3626 struct btrfs_key key; 3627 u32 item_size; 3628 int ret; 3629 int i; 3630 3631 ASSERT(count > 0); 3632 batch.nr = count; 3633 3634 if (count == 1) { 3635 btrfs_item_key_to_cpu(src, &key, start_slot); 3636 item_size = btrfs_item_size(src, start_slot); 3637 batch.keys = &key; 3638 batch.data_sizes = &item_size; 3639 batch.total_data_size = item_size; 3640 } else { 3641 struct btrfs_key *ins_keys; 3642 u32 *ins_sizes; 3643 3644 ins_data = kmalloc(count * sizeof(u32) + 3645 count * sizeof(struct btrfs_key), GFP_NOFS); 3646 if (!ins_data) 3647 return -ENOMEM; 3648 3649 ins_sizes = (u32 *)ins_data; 3650 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3651 batch.keys = ins_keys; 3652 batch.data_sizes = ins_sizes; 3653 batch.total_data_size = 0; 3654 3655 for (i = 0; i < count; i++) { 3656 const int slot = start_slot + i; 3657 3658 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3659 ins_sizes[i] = btrfs_item_size(src, slot); 3660 batch.total_data_size += ins_sizes[i]; 3661 } 3662 } 3663 3664 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3665 if (ret) 3666 goto out; 3667 3668 dst = dst_path->nodes[0]; 3669 /* 3670 * Copy all the items in bulk, in a single copy operation. Item data is 3671 * organized such that it's placed at the end of a leaf and from right 3672 * to left. For example, the data for the second item ends at an offset 3673 * that matches the offset where the data for the first item starts, the 3674 * data for the third item ends at an offset that matches the offset 3675 * where the data of the second items starts, and so on. 3676 * Therefore our source and destination start offsets for copy match the 3677 * offsets of the last items (highest slots). 3678 */ 3679 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3680 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3681 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3682 btrfs_release_path(dst_path); 3683 out: 3684 kfree(ins_data); 3685 3686 return ret; 3687 } 3688 3689 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3690 struct btrfs_inode *inode, 3691 struct btrfs_path *path, 3692 struct btrfs_path *dst_path, 3693 struct btrfs_log_ctx *ctx, 3694 u64 *last_old_dentry_offset) 3695 { 3696 struct btrfs_root *log = inode->root->log_root; 3697 struct extent_buffer *src; 3698 const int nritems = btrfs_header_nritems(path->nodes[0]); 3699 const u64 ino = btrfs_ino(inode); 3700 bool last_found = false; 3701 int batch_start = 0; 3702 int batch_size = 0; 3703 int i; 3704 3705 /* 3706 * We need to clone the leaf, release the read lock on it, and use the 3707 * clone before modifying the log tree. See the comment at copy_items() 3708 * about why we need to do this. 
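The clone then takes the place of path->nodes[0], so the caller can release the path as usual.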
3709 */ 3710 src = btrfs_clone_extent_buffer(path->nodes[0]); 3711 if (!src) 3712 return -ENOMEM; 3713 3714 i = path->slots[0]; 3715 btrfs_release_path(path); 3716 path->nodes[0] = src; 3717 path->slots[0] = i; 3718 3719 for (; i < nritems; i++) { 3720 struct btrfs_dir_item *di; 3721 struct btrfs_key key; 3722 int ret; 3723 3724 btrfs_item_key_to_cpu(src, &key, i); 3725 3726 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3727 last_found = true; 3728 break; 3729 } 3730 3731 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3732 ctx->last_dir_item_offset = key.offset; 3733 3734 /* 3735 * Skip ranges of items that consist only of dir item keys created 3736 * in past transactions. However if we find a gap, we must log a 3737 * dir index range item for that gap, so that index keys in that 3738 * gap are deleted during log replay. 3739 */ 3740 if (btrfs_dir_transid(src, di) < trans->transid) { 3741 if (key.offset > *last_old_dentry_offset + 1) { 3742 ret = insert_dir_log_key(trans, log, dst_path, 3743 ino, *last_old_dentry_offset + 1, 3744 key.offset - 1); 3745 if (ret < 0) 3746 return ret; 3747 } 3748 3749 *last_old_dentry_offset = key.offset; 3750 continue; 3751 } 3752 3753 /* If we logged this dir index item before, we can skip it. */ 3754 if (key.offset <= inode->last_dir_index_offset) 3755 continue; 3756 3757 /* 3758 * We must make sure that when we log a directory entry, the 3759 * corresponding inode, after log replay, has a matching link 3760 * count. For example: 3761 * 3762 * touch foo 3763 * mkdir mydir 3764 * sync 3765 * ln foo mydir/bar 3766 * xfs_io -c "fsync" mydir 3767 * <crash> 3768 * <mount fs and log replay> 3769 * 3770 * Would result in a fsync log that when replayed, our file inode 3771 * would have a link count of 1, but we get two directory entries 3772 * pointing to the same inode. After removing one of the names, 3773 * it would not be possible to remove the other name, which 3774 * resulted always in stale file handle errors, and would not be 3775 * possible to rmdir the parent directory, since its i_size could 3776 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3777 * resulting in -ENOTEMPTY errors. 3778 */ 3779 if (!ctx->log_new_dentries) { 3780 struct btrfs_key di_key; 3781 3782 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3783 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3784 ctx->log_new_dentries = true; 3785 } 3786 3787 if (batch_size == 0) 3788 batch_start = i; 3789 batch_size++; 3790 } 3791 3792 if (batch_size > 0) { 3793 int ret; 3794 3795 ret = flush_dir_items_batch(trans, log, src, dst_path, 3796 batch_start, batch_size); 3797 if (ret < 0) 3798 return ret; 3799 } 3800 3801 return last_found ? 1 : 0; 3802 } 3803 3804 /* 3805 * log all the items included in the current transaction for a given 3806 * directory. 
This also creates the range items in the log tree required 3807 * to replay anything deleted before the fsync 3808 */ 3809 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3810 struct btrfs_inode *inode, 3811 struct btrfs_path *path, 3812 struct btrfs_path *dst_path, 3813 struct btrfs_log_ctx *ctx, 3814 u64 min_offset, u64 *last_offset_ret) 3815 { 3816 struct btrfs_key min_key; 3817 struct btrfs_root *root = inode->root; 3818 struct btrfs_root *log = root->log_root; 3819 int err = 0; 3820 int ret; 3821 u64 last_old_dentry_offset = min_offset - 1; 3822 u64 last_offset = (u64)-1; 3823 u64 ino = btrfs_ino(inode); 3824 3825 min_key.objectid = ino; 3826 min_key.type = BTRFS_DIR_INDEX_KEY; 3827 min_key.offset = min_offset; 3828 3829 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3830 3831 /* 3832 * we didn't find anything from this transaction, see if there 3833 * is anything at all 3834 */ 3835 if (ret != 0 || min_key.objectid != ino || 3836 min_key.type != BTRFS_DIR_INDEX_KEY) { 3837 min_key.objectid = ino; 3838 min_key.type = BTRFS_DIR_INDEX_KEY; 3839 min_key.offset = (u64)-1; 3840 btrfs_release_path(path); 3841 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3842 if (ret < 0) { 3843 btrfs_release_path(path); 3844 return ret; 3845 } 3846 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3847 3848 /* if ret == 0 there are items for this type, 3849 * create a range to tell us the last key of this type. 3850 * otherwise, there are no items in this directory after 3851 * *min_offset, and we create a range to indicate that. 3852 */ 3853 if (ret == 0) { 3854 struct btrfs_key tmp; 3855 3856 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 3857 path->slots[0]); 3858 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3859 last_old_dentry_offset = tmp.offset; 3860 } 3861 goto done; 3862 } 3863 3864 /* go backward to find any previous key */ 3865 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 3866 if (ret == 0) { 3867 struct btrfs_key tmp; 3868 3869 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 3870 /* 3871 * The dir index key before the first one we found that needs to 3872 * be logged might be in a previous leaf, and there might be a 3873 * gap between these keys, meaning that we had deletions that 3874 * happened. So the key range item we log (key type 3875 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 3876 * previous key's offset plus 1, so that those deletes are replayed. 3877 */ 3878 if (tmp.type == BTRFS_DIR_INDEX_KEY) 3879 last_old_dentry_offset = tmp.offset; 3880 } 3881 btrfs_release_path(path); 3882 3883 /* 3884 * Find the first key from this transaction again. See the note for 3885 * log_new_dir_dentries, if we're logging a directory recursively we 3886 * won't be holding its i_mutex, which means we can modify the directory 3887 * while we're logging it. If we remove an entry between our first 3888 * search and this search we'll not find the key again and can just 3889 * bail. 
3890 */
3891 search:
3892 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3893 if (ret != 0)
3894 goto done;
3895
3896 /*
3897 * we have a block from this transaction, log every item in it
3898 * from our directory
3899 */
3900 while (1) {
3901 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3902 &last_old_dentry_offset);
3903 if (ret != 0) {
3904 if (ret < 0)
3905 err = ret;
3906 goto done;
3907 }
3908 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3909
3910 /*
3911 * look ahead to the next item and see if it is also
3912 * from this directory and from this transaction
3913 */
3914 ret = btrfs_next_leaf(root, path);
3915 if (ret) {
3916 if (ret == 1)
3917 last_offset = (u64)-1;
3918 else
3919 err = ret;
3920 goto done;
3921 }
3922 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3923 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3924 last_offset = (u64)-1;
3925 goto done;
3926 }
3927 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3928 /*
3929 * The next leaf was not changed in the current transaction
3930 * and has at least one dir index key.
3931 * We check for the next key because there might have been
3932 * one or more deletions between the last key we logged and
3933 * that next key. So the key range item we log (key type
3934 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
3935 * offset minus 1, so that those deletes are replayed.
3936 */
3937 last_offset = min_key.offset - 1;
3938 goto done;
3939 }
3940 if (need_resched()) {
3941 btrfs_release_path(path);
3942 cond_resched();
3943 goto search;
3944 }
3945 }
3946 done:
3947 btrfs_release_path(path);
3948 btrfs_release_path(dst_path);
3949
3950 if (err == 0) {
3951 *last_offset_ret = last_offset;
3952 /*
3953 * If the leaf was changed in the current transaction but
3954 * all its dir items are from a past transaction, the last item
3955 * in the leaf is a dir item and there's no gap between that last
3956 * dir item and the first one on the next leaf (which did not
3957 * change in the current transaction), then we don't need to log
3958 * a range: last_old_dentry_offset is equal to last_offset.
3959 */
3960 ASSERT(last_old_dentry_offset <= last_offset);
3961 if (last_old_dentry_offset < last_offset) {
3962 ret = insert_dir_log_key(trans, log, path, ino,
3963 last_old_dentry_offset + 1,
3964 last_offset);
3965 if (ret)
3966 err = ret;
3967 }
3968 }
3969 return err;
3970 }
3971
3972 /*
3973 * If the inode was logged before and it was evicted, then its
3974 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index
3975 * key offset. If that's the case, search for it and update the inode. This
3976 * is to avoid lookups in the log tree every time we try to insert a dir index
3977 * key from a leaf changed in the current transaction, and to allow us to always
3978 * do batch insertions of dir index keys.
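 *
 * For example (offsets are illustrative): if the inode was evicted after a
 * previous fsync in this transaction and the log tree has dir index keys with
 * offsets 3, 4 and 7 for this directory, the search below positions us right
 * after offset 7 and last_dir_index_offset is set to 7. If the log has no dir
 * index keys for the directory, it is set to BTRFS_DIR_START_INDEX - 1.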
3979 */ 3980 static int update_last_dir_index_offset(struct btrfs_inode *inode, 3981 struct btrfs_path *path, 3982 const struct btrfs_log_ctx *ctx) 3983 { 3984 const u64 ino = btrfs_ino(inode); 3985 struct btrfs_key key; 3986 int ret; 3987 3988 lockdep_assert_held(&inode->log_mutex); 3989 3990 if (inode->last_dir_index_offset != (u64)-1) 3991 return 0; 3992 3993 if (!ctx->logged_before) { 3994 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 3995 return 0; 3996 } 3997 3998 key.objectid = ino; 3999 key.type = BTRFS_DIR_INDEX_KEY; 4000 key.offset = (u64)-1; 4001 4002 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 4003 /* 4004 * An error happened or we actually have an index key with an offset 4005 * value of (u64)-1. Bail out, we're done. 4006 */ 4007 if (ret <= 0) 4008 goto out; 4009 4010 ret = 0; 4011 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1; 4012 4013 /* 4014 * No dir index items, bail out and leave last_dir_index_offset with 4015 * the value right before the first valid index value. 4016 */ 4017 if (path->slots[0] == 0) 4018 goto out; 4019 4020 /* 4021 * btrfs_search_slot() left us at one slot beyond the slot with the last 4022 * index key, or beyond the last key of the directory that is not an 4023 * index key. If we have an index key before, set last_dir_index_offset 4024 * to its offset value, otherwise leave it with a value right before the 4025 * first valid index value, as it means we have an empty directory. 4026 */ 4027 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 4028 if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY) 4029 inode->last_dir_index_offset = key.offset; 4030 4031 out: 4032 btrfs_release_path(path); 4033 4034 return ret; 4035 } 4036 4037 /* 4038 * logging directories is very similar to logging inodes, We find all the items 4039 * from the current transaction and write them to the log. 4040 * 4041 * The recovery code scans the directory in the subvolume, and if it finds a 4042 * key in the range logged that is not present in the log tree, then it means 4043 * that dir entry was unlinked during the transaction. 4044 * 4045 * In order for that scan to work, we must include one key smaller than 4046 * the smallest logged by this transaction and one key larger than the largest 4047 * key logged by this transaction. 4048 */ 4049 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4050 struct btrfs_inode *inode, 4051 struct btrfs_path *path, 4052 struct btrfs_path *dst_path, 4053 struct btrfs_log_ctx *ctx) 4054 { 4055 u64 min_key; 4056 u64 max_key; 4057 int ret; 4058 4059 ret = update_last_dir_index_offset(inode, path, ctx); 4060 if (ret) 4061 return ret; 4062 4063 min_key = BTRFS_DIR_START_INDEX; 4064 max_key = 0; 4065 ctx->last_dir_item_offset = inode->last_dir_index_offset; 4066 4067 while (1) { 4068 ret = log_dir_items(trans, inode, path, dst_path, 4069 ctx, min_key, &max_key); 4070 if (ret) 4071 return ret; 4072 if (max_key == (u64)-1) 4073 break; 4074 min_key = max_key + 1; 4075 } 4076 4077 inode->last_dir_index_offset = ctx->last_dir_item_offset; 4078 4079 return 0; 4080 } 4081 4082 /* 4083 * a helper function to drop items from the log before we relog an 4084 * inode. max_key_type indicates the highest item type to remove. 4085 * This cannot be run for file data extents because it does not 4086 * free the extents they point to. 
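 *
 * For example (illustrative): calling this with max_key_type set to
 * BTRFS_XATTR_ITEM_KEY deletes the inode item, inode ref/extref items and
 * xattr items previously logged for this inode, while any file extent items
 * (BTRFS_EXTENT_DATA_KEY) already in the log are left untouched.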
4087 */
4088 static int drop_inode_items(struct btrfs_trans_handle *trans,
4089 struct btrfs_root *log,
4090 struct btrfs_path *path,
4091 struct btrfs_inode *inode,
4092 int max_key_type)
4093 {
4094 int ret;
4095 struct btrfs_key key;
4096 struct btrfs_key found_key;
4097 int start_slot;
4098
4099 key.objectid = btrfs_ino(inode);
4100 key.type = max_key_type;
4101 key.offset = (u64)-1;
4102
4103 while (1) {
4104 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4105 BUG_ON(ret == 0); /* Logic error */
4106 if (ret < 0)
4107 break;
4108
4109 if (path->slots[0] == 0)
4110 break;
4111
4112 path->slots[0]--;
4113 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4114 path->slots[0]);
4115
4116 if (found_key.objectid != key.objectid)
4117 break;
4118
4119 found_key.offset = 0;
4120 found_key.type = 0;
4121 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
4122 if (ret < 0)
4123 break;
4124
4125 ret = btrfs_del_items(trans, log, path, start_slot,
4126 path->slots[0] - start_slot + 1);
4127 /*
4128 * If start slot isn't 0 then we don't need to re-search: we've
4129 * found the last item with this objectid in this tree.
4130 */
4131 if (ret || start_slot != 0)
4132 break;
4133 btrfs_release_path(path);
4134 }
4135 btrfs_release_path(path);
4136 if (ret > 0)
4137 ret = 0;
4138 return ret;
4139 }
4140
4141 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4142 struct btrfs_root *log_root,
4143 struct btrfs_inode *inode,
4144 u64 new_size, u32 min_type)
4145 {
4146 struct btrfs_truncate_control control = {
4147 .new_size = new_size,
4148 .ino = btrfs_ino(inode),
4149 .min_type = min_type,
4150 .skip_ref_updates = true,
4151 };
4152
4153 return btrfs_truncate_inode_items(trans, log_root, &control);
4154 }
4155
4156 static void fill_inode_item(struct btrfs_trans_handle *trans,
4157 struct extent_buffer *leaf,
4158 struct btrfs_inode_item *item,
4159 struct inode *inode, int log_inode_only,
4160 u64 logged_isize)
4161 {
4162 struct btrfs_map_token token;
4163 u64 flags;
4164
4165 btrfs_init_map_token(&token, leaf);
4166
4167 if (log_inode_only) {
4168 /* Set the generation to zero so the recovery code
4169 * can tell the difference between logging done
4170 * just to say 'this inode exists' and logging done
4171 * to say 'update this inode with these values'
4172 */
4173 btrfs_set_token_inode_generation(&token, item, 0);
4174 btrfs_set_token_inode_size(&token, item, logged_isize);
4175 } else {
4176 btrfs_set_token_inode_generation(&token, item,
4177 BTRFS_I(inode)->generation);
4178 btrfs_set_token_inode_size(&token, item, inode->i_size);
4179 }
4180
4181 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4182 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4183 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4184 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4185
4186 btrfs_set_token_timespec_sec(&token, &item->atime,
4187 inode->i_atime.tv_sec);
4188 btrfs_set_token_timespec_nsec(&token, &item->atime,
4189 inode->i_atime.tv_nsec);
4190
4191 btrfs_set_token_timespec_sec(&token, &item->mtime,
4192 inode->i_mtime.tv_sec);
4193 btrfs_set_token_timespec_nsec(&token, &item->mtime,
4194 inode->i_mtime.tv_nsec);
4195
4196 btrfs_set_token_timespec_sec(&token, &item->ctime,
4197 inode->i_ctime.tv_sec);
4198 btrfs_set_token_timespec_nsec(&token, &item->ctime,
4199 inode->i_ctime.tv_nsec);
4200
4201 /*
4202 * We do not need to set the nbytes field, in fact during a fast fsync
4203 * its value may not even be correct, since a fast fsync
does not wait 4204 * for ordered extent completion, which is where we update nbytes, it 4205 * only waits for writeback to complete. During log replay as we find 4206 * file extent items and replay them, we adjust the nbytes field of the 4207 * inode item in subvolume tree as needed (see overwrite_item()). 4208 */ 4209 4210 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4211 btrfs_set_token_inode_transid(&token, item, trans->transid); 4212 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4213 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4214 BTRFS_I(inode)->ro_flags); 4215 btrfs_set_token_inode_flags(&token, item, flags); 4216 btrfs_set_token_inode_block_group(&token, item, 0); 4217 } 4218 4219 static int log_inode_item(struct btrfs_trans_handle *trans, 4220 struct btrfs_root *log, struct btrfs_path *path, 4221 struct btrfs_inode *inode, bool inode_item_dropped) 4222 { 4223 struct btrfs_inode_item *inode_item; 4224 int ret; 4225 4226 /* 4227 * If we are doing a fast fsync and the inode was logged before in the 4228 * current transaction, then we know the inode was previously logged and 4229 * it exists in the log tree. For performance reasons, in this case use 4230 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4231 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4232 * contention in case there are concurrent fsyncs for other inodes of the 4233 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4234 * already exists can also result in unnecessarily splitting a leaf. 4235 */ 4236 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4237 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4238 ASSERT(ret <= 0); 4239 if (ret > 0) 4240 ret = -ENOENT; 4241 } else { 4242 /* 4243 * This means it is the first fsync in the current transaction, 4244 * so the inode item is not in the log and we need to insert it. 4245 * We can never get -EEXIST because we are only called for a fast 4246 * fsync and in case an inode eviction happens after the inode was 4247 * logged before in the current transaction, when we load again 4248 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4249 * flags and set ->logged_trans to 0. 4250 */ 4251 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4252 sizeof(*inode_item)); 4253 ASSERT(ret != -EEXIST); 4254 } 4255 if (ret) 4256 return ret; 4257 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4258 struct btrfs_inode_item); 4259 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4260 0, 0); 4261 btrfs_release_path(path); 4262 return 0; 4263 } 4264 4265 static int log_csums(struct btrfs_trans_handle *trans, 4266 struct btrfs_inode *inode, 4267 struct btrfs_root *log_root, 4268 struct btrfs_ordered_sum *sums) 4269 { 4270 const u64 lock_end = sums->bytenr + sums->len - 1; 4271 struct extent_state *cached_state = NULL; 4272 int ret; 4273 4274 /* 4275 * If this inode was not used for reflink operations in the current 4276 * transaction with new extents, then do the fast path, no need to 4277 * worry about logging checksum items with overlapping ranges. 4278 */ 4279 if (inode->last_reflink_trans < trans->transid) 4280 return btrfs_csum_file_blocks(trans, log_root, sums); 4281 4282 /* 4283 * Serialize logging for checksums. 
This is to avoid racing with the 4284 * same checksum being logged by another task that is logging another 4285 * file which happens to refer to the same extent as well. Such races 4286 * can leave checksum items in the log with overlapping ranges. 4287 */ 4288 ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, 4289 &cached_state); 4290 if (ret) 4291 return ret; 4292 /* 4293 * Due to extent cloning, we might have logged a csum item that covers a 4294 * subrange of a cloned extent, and later we can end up logging a csum 4295 * item for a larger subrange of the same extent or the entire range. 4296 * This would leave csum items in the log tree that cover the same range 4297 * and break the searches for checksums in the log tree, resulting in 4298 * some checksums missing in the fs/subvolume tree. So just delete (or 4299 * trim and adjust) any existing csum items in the log for this range. 4300 */ 4301 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4302 if (!ret) 4303 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4304 4305 unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, 4306 &cached_state); 4307 4308 return ret; 4309 } 4310 4311 static noinline int copy_items(struct btrfs_trans_handle *trans, 4312 struct btrfs_inode *inode, 4313 struct btrfs_path *dst_path, 4314 struct btrfs_path *src_path, 4315 int start_slot, int nr, int inode_only, 4316 u64 logged_isize) 4317 { 4318 struct btrfs_root *log = inode->root->log_root; 4319 struct btrfs_file_extent_item *extent; 4320 struct extent_buffer *src; 4321 int ret = 0; 4322 struct btrfs_key *ins_keys; 4323 u32 *ins_sizes; 4324 struct btrfs_item_batch batch; 4325 char *ins_data; 4326 int i; 4327 int dst_index; 4328 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4329 const u64 i_size = i_size_read(&inode->vfs_inode); 4330 4331 /* 4332 * To keep lockdep happy and avoid deadlocks, clone the source leaf and 4333 * use the clone. This is because otherwise we would be changing the log 4334 * tree, to insert items from the subvolume tree or insert csum items, 4335 * while holding a read lock on a leaf from the subvolume tree, which 4336 * creates a nasty lock dependency when COWing log tree nodes/leaves: 4337 * 4338 * 1) Modifying the log tree triggers an extent buffer allocation while 4339 * holding a write lock on a parent extent buffer from the log tree. 4340 * Allocating the pages for an extent buffer, or the extent buffer 4341 * struct, can trigger inode eviction and finally the inode eviction 4342 * will trigger a release/remove of a delayed node, which requires 4343 * taking the delayed node's mutex; 4344 * 4345 * 2) Allocating a metadata extent for a log tree can trigger the async 4346 * reclaim thread and make us wait for it to release enough space and 4347 * unblock our reservation ticket. The reclaim thread can start 4348 * flushing delayed items, and that in turn results in the need to 4349 * lock delayed node mutexes and in the need to write lock extent 4350 * buffers of a subvolume tree - all this while holding a write lock 4351 * on the parent extent buffer in the log tree. 4352 * 4353 * So one task in scenario 1) running in parallel with another task in 4354 * scenario 2) could lead to a deadlock, one wanting to lock a delayed 4355 * node mutex while having a read lock on a leaf from the subvolume, 4356 * while the other is holding the delayed node's mutex and wants to 4357 * write lock the same subvolume leaf for flushing delayed items. 
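 *
 * A condensed view of the resulting cycle (illustrative, not actual lockdep
 * output):
 *
 *   task logging an inode (1)            delayed items flush (2)
 *   read lock on a subvolume leaf
 *                                        lock a delayed node mutex
 *   write lock on a log tree parent
 *   allocate an extent buffer
 *     -> inode eviction
 *     -> wants the delayed node          wants a write lock on the
 *        mutex held on the right         subvolume leaf read locked
 *                                        on the left
 *
 * Cloning the subvolume leaf means we do not hold its read lock while
 * modifying the log tree, which breaks this cycle.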
4358 */ 4359 src = btrfs_clone_extent_buffer(src_path->nodes[0]); 4360 if (!src) 4361 return -ENOMEM; 4362 4363 i = src_path->slots[0]; 4364 btrfs_release_path(src_path); 4365 src_path->nodes[0] = src; 4366 src_path->slots[0] = i; 4367 4368 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4369 nr * sizeof(u32), GFP_NOFS); 4370 if (!ins_data) 4371 return -ENOMEM; 4372 4373 ins_sizes = (u32 *)ins_data; 4374 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4375 batch.keys = ins_keys; 4376 batch.data_sizes = ins_sizes; 4377 batch.total_data_size = 0; 4378 batch.nr = 0; 4379 4380 dst_index = 0; 4381 for (i = 0; i < nr; i++) { 4382 const int src_slot = start_slot + i; 4383 struct btrfs_root *csum_root; 4384 struct btrfs_ordered_sum *sums; 4385 struct btrfs_ordered_sum *sums_next; 4386 LIST_HEAD(ordered_sums); 4387 u64 disk_bytenr; 4388 u64 disk_num_bytes; 4389 u64 extent_offset; 4390 u64 extent_num_bytes; 4391 bool is_old_extent; 4392 4393 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4394 4395 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4396 goto add_to_batch; 4397 4398 extent = btrfs_item_ptr(src, src_slot, 4399 struct btrfs_file_extent_item); 4400 4401 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4402 trans->transid); 4403 4404 /* 4405 * Don't copy extents from past generations. That would make us 4406 * log a lot more metadata for common cases like doing only a 4407 * few random writes into a file and then fsync it for the first 4408 * time or after the full sync flag is set on the inode. We can 4409 * get leaves full of extent items, most of which are from past 4410 * generations, so we can skip them - as long as the inode has 4411 * not been the target of a reflink operation in this transaction, 4412 * as in that case it might have had file extent items with old 4413 * generations copied into it. We also must always log prealloc 4414 * extents that start at or beyond eof, otherwise we would lose 4415 * them on log replay. 4416 */ 4417 if (is_old_extent && 4418 ins_keys[dst_index].offset < i_size && 4419 inode->last_reflink_trans < trans->transid) 4420 continue; 4421 4422 if (skip_csum) 4423 goto add_to_batch; 4424 4425 /* Only regular extents have checksums. */ 4426 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4427 goto add_to_batch; 4428 4429 /* 4430 * If it's an extent created in a past transaction, then its 4431 * checksums are already accessible from the committed csum tree, 4432 * no need to log them. 4433 */ 4434 if (is_old_extent) 4435 goto add_to_batch; 4436 4437 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4438 /* If it's an explicit hole, there are no checksums. 
*/ 4439 if (disk_bytenr == 0) 4440 goto add_to_batch; 4441 4442 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4443 4444 if (btrfs_file_extent_compression(src, extent)) { 4445 extent_offset = 0; 4446 extent_num_bytes = disk_num_bytes; 4447 } else { 4448 extent_offset = btrfs_file_extent_offset(src, extent); 4449 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4450 } 4451 4452 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4453 disk_bytenr += extent_offset; 4454 ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, 4455 disk_bytenr + extent_num_bytes - 1, 4456 &ordered_sums, 0, false); 4457 if (ret) 4458 goto out; 4459 4460 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4461 if (!ret) 4462 ret = log_csums(trans, inode, log, sums); 4463 list_del(&sums->list); 4464 kfree(sums); 4465 } 4466 if (ret) 4467 goto out; 4468 4469 add_to_batch: 4470 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4471 batch.total_data_size += ins_sizes[dst_index]; 4472 batch.nr++; 4473 dst_index++; 4474 } 4475 4476 /* 4477 * We have a leaf full of old extent items that don't need to be logged, 4478 * so we don't need to do anything. 4479 */ 4480 if (batch.nr == 0) 4481 goto out; 4482 4483 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4484 if (ret) 4485 goto out; 4486 4487 dst_index = 0; 4488 for (i = 0; i < nr; i++) { 4489 const int src_slot = start_slot + i; 4490 const int dst_slot = dst_path->slots[0] + dst_index; 4491 struct btrfs_key key; 4492 unsigned long src_offset; 4493 unsigned long dst_offset; 4494 4495 /* 4496 * We're done, all the remaining items in the source leaf 4497 * correspond to old file extent items. 4498 */ 4499 if (dst_index >= batch.nr) 4500 break; 4501 4502 btrfs_item_key_to_cpu(src, &key, src_slot); 4503 4504 if (key.type != BTRFS_EXTENT_DATA_KEY) 4505 goto copy_item; 4506 4507 extent = btrfs_item_ptr(src, src_slot, 4508 struct btrfs_file_extent_item); 4509 4510 /* See the comment in the previous loop, same logic. 
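 * That is: skip file extent items from past generations whose file offset
 * (key.offset) is below i_size, unless the inode was the target of a reflink
 * operation in the current transaction.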
*/ 4511 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4512 key.offset < i_size && 4513 inode->last_reflink_trans < trans->transid) 4514 continue; 4515 4516 copy_item: 4517 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4518 src_offset = btrfs_item_ptr_offset(src, src_slot); 4519 4520 if (key.type == BTRFS_INODE_ITEM_KEY) { 4521 struct btrfs_inode_item *inode_item; 4522 4523 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4524 struct btrfs_inode_item); 4525 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4526 &inode->vfs_inode, 4527 inode_only == LOG_INODE_EXISTS, 4528 logged_isize); 4529 } else { 4530 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4531 src_offset, ins_sizes[dst_index]); 4532 } 4533 4534 dst_index++; 4535 } 4536 4537 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4538 btrfs_release_path(dst_path); 4539 out: 4540 kfree(ins_data); 4541 4542 return ret; 4543 } 4544 4545 static int extent_cmp(void *priv, const struct list_head *a, 4546 const struct list_head *b) 4547 { 4548 const struct extent_map *em1, *em2; 4549 4550 em1 = list_entry(a, struct extent_map, list); 4551 em2 = list_entry(b, struct extent_map, list); 4552 4553 if (em1->start < em2->start) 4554 return -1; 4555 else if (em1->start > em2->start) 4556 return 1; 4557 return 0; 4558 } 4559 4560 static int log_extent_csums(struct btrfs_trans_handle *trans, 4561 struct btrfs_inode *inode, 4562 struct btrfs_root *log_root, 4563 const struct extent_map *em, 4564 struct btrfs_log_ctx *ctx) 4565 { 4566 struct btrfs_ordered_extent *ordered; 4567 struct btrfs_root *csum_root; 4568 u64 csum_offset; 4569 u64 csum_len; 4570 u64 mod_start = em->mod_start; 4571 u64 mod_len = em->mod_len; 4572 LIST_HEAD(ordered_sums); 4573 int ret = 0; 4574 4575 if (inode->flags & BTRFS_INODE_NODATASUM || 4576 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4577 em->block_start == EXTENT_MAP_HOLE) 4578 return 0; 4579 4580 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4581 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4582 const u64 mod_end = mod_start + mod_len; 4583 struct btrfs_ordered_sum *sums; 4584 4585 if (mod_len == 0) 4586 break; 4587 4588 if (ordered_end <= mod_start) 4589 continue; 4590 if (mod_end <= ordered->file_offset) 4591 break; 4592 4593 /* 4594 * We are going to copy all the csums on this ordered extent, so 4595 * go ahead and adjust mod_start and mod_len in case this ordered 4596 * extent has already been logged. 4597 */ 4598 if (ordered->file_offset > mod_start) { 4599 if (ordered_end >= mod_end) 4600 mod_len = ordered->file_offset - mod_start; 4601 /* 4602 * If we have this case 4603 * 4604 * |--------- logged extent ---------| 4605 * |----- ordered extent ----| 4606 * 4607 * Just don't mess with mod_start and mod_len, we'll 4608 * just end up logging more csums than we need and it 4609 * will be ok. 4610 */ 4611 } else { 4612 if (ordered_end < mod_end) { 4613 mod_len = mod_end - ordered_end; 4614 mod_start = ordered_end; 4615 } else { 4616 mod_len = 0; 4617 } 4618 } 4619 4620 /* 4621 * To keep us from looping for the above case of an ordered 4622 * extent that falls inside of the logged extent. 4623 */ 4624 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4625 continue; 4626 4627 list_for_each_entry(sums, &ordered->list, list) { 4628 ret = log_csums(trans, inode, log_root, sums); 4629 if (ret) 4630 return ret; 4631 } 4632 } 4633 4634 /* We're done, found all csums in the ordered extents. 
*/ 4635 if (mod_len == 0) 4636 return 0; 4637 4638 /* If we're compressed we have to save the entire range of csums. */ 4639 if (em->compress_type) { 4640 csum_offset = 0; 4641 csum_len = max(em->block_len, em->orig_block_len); 4642 } else { 4643 csum_offset = mod_start - em->start; 4644 csum_len = mod_len; 4645 } 4646 4647 /* block start is already adjusted for the file extent offset. */ 4648 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4649 ret = btrfs_lookup_csums_range(csum_root, 4650 em->block_start + csum_offset, 4651 em->block_start + csum_offset + 4652 csum_len - 1, &ordered_sums, 0, false); 4653 if (ret) 4654 return ret; 4655 4656 while (!list_empty(&ordered_sums)) { 4657 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4658 struct btrfs_ordered_sum, 4659 list); 4660 if (!ret) 4661 ret = log_csums(trans, inode, log_root, sums); 4662 list_del(&sums->list); 4663 kfree(sums); 4664 } 4665 4666 return ret; 4667 } 4668 4669 static int log_one_extent(struct btrfs_trans_handle *trans, 4670 struct btrfs_inode *inode, 4671 const struct extent_map *em, 4672 struct btrfs_path *path, 4673 struct btrfs_log_ctx *ctx) 4674 { 4675 struct btrfs_drop_extents_args drop_args = { 0 }; 4676 struct btrfs_root *log = inode->root->log_root; 4677 struct btrfs_file_extent_item fi = { 0 }; 4678 struct extent_buffer *leaf; 4679 struct btrfs_key key; 4680 u64 extent_offset = em->start - em->orig_start; 4681 u64 block_len; 4682 int ret; 4683 4684 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4685 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4686 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4687 else 4688 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4689 4690 block_len = max(em->block_len, em->orig_block_len); 4691 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4692 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4693 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4694 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4695 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4696 extent_offset); 4697 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4698 } 4699 4700 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4701 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4702 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4703 btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4704 4705 ret = log_extent_csums(trans, inode, log, em, ctx); 4706 if (ret) 4707 return ret; 4708 4709 /* 4710 * If this is the first time we are logging the inode in the current 4711 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4712 * because it does a deletion search, which always acquires write locks 4713 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4714 * but also adds significant contention in a log tree, since log trees 4715 * are small, with a root at level 2 or 3 at most, due to their short 4716 * life span. 
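 *
 * For example (illustrative): on the first fsync of an inode in the current
 * transaction, ctx->logged_before is false and we insert the file extent item
 * directly below. If the same range is overwritten and fsynced again in the
 * same transaction, ctx->logged_before is true and btrfs_drop_extents() first
 * removes (or trims) the extent item logged by the previous fsync, so the log
 * never ends up with overlapping file extent items.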
4717 */ 4718 if (ctx->logged_before) { 4719 drop_args.path = path; 4720 drop_args.start = em->start; 4721 drop_args.end = em->start + em->len; 4722 drop_args.replace_extent = true; 4723 drop_args.extent_item_size = sizeof(fi); 4724 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4725 if (ret) 4726 return ret; 4727 } 4728 4729 if (!drop_args.extent_inserted) { 4730 key.objectid = btrfs_ino(inode); 4731 key.type = BTRFS_EXTENT_DATA_KEY; 4732 key.offset = em->start; 4733 4734 ret = btrfs_insert_empty_item(trans, log, path, &key, 4735 sizeof(fi)); 4736 if (ret) 4737 return ret; 4738 } 4739 leaf = path->nodes[0]; 4740 write_extent_buffer(leaf, &fi, 4741 btrfs_item_ptr_offset(leaf, path->slots[0]), 4742 sizeof(fi)); 4743 btrfs_mark_buffer_dirty(leaf); 4744 4745 btrfs_release_path(path); 4746 4747 return ret; 4748 } 4749 4750 /* 4751 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4752 * lose them after doing a full/fast fsync and replaying the log. We scan the 4753 * subvolume's root instead of iterating the inode's extent map tree because 4754 * otherwise we can log incorrect extent items based on extent map conversion. 4755 * That can happen due to the fact that extent maps are merged when they 4756 * are not in the extent map tree's list of modified extents. 4757 */ 4758 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4759 struct btrfs_inode *inode, 4760 struct btrfs_path *path) 4761 { 4762 struct btrfs_root *root = inode->root; 4763 struct btrfs_key key; 4764 const u64 i_size = i_size_read(&inode->vfs_inode); 4765 const u64 ino = btrfs_ino(inode); 4766 struct btrfs_path *dst_path = NULL; 4767 bool dropped_extents = false; 4768 u64 truncate_offset = i_size; 4769 struct extent_buffer *leaf; 4770 int slot; 4771 int ins_nr = 0; 4772 int start_slot; 4773 int ret; 4774 4775 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4776 return 0; 4777 4778 key.objectid = ino; 4779 key.type = BTRFS_EXTENT_DATA_KEY; 4780 key.offset = i_size; 4781 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4782 if (ret < 0) 4783 goto out; 4784 4785 /* 4786 * We must check if there is a prealloc extent that starts before the 4787 * i_size and crosses the i_size boundary. This is to ensure later we 4788 * truncate down to the end of that extent and not to the i_size, as 4789 * otherwise we end up losing part of the prealloc extent after a log 4790 * replay and with an implicit hole if there is another prealloc extent 4791 * that starts at an offset beyond i_size. 
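 *
 * For example (offsets are illustrative): i_size is 100K, one prealloc extent
 * covers the range [64K, 192K) and another covers [192K, 256K). Truncating
 * the log tree at i_size would trim the first extent at 100K, so after a log
 * replay the range [100K, 192K) would become an implicit hole, despite the
 * second extent existing beyond it. Truncating at the end of the first
 * extent (192K) instead preserves both extents.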
4792 */ 4793 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4794 if (ret < 0) 4795 goto out; 4796 4797 if (ret == 0) { 4798 struct btrfs_file_extent_item *ei; 4799 4800 leaf = path->nodes[0]; 4801 slot = path->slots[0]; 4802 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4803 4804 if (btrfs_file_extent_type(leaf, ei) == 4805 BTRFS_FILE_EXTENT_PREALLOC) { 4806 u64 extent_end; 4807 4808 btrfs_item_key_to_cpu(leaf, &key, slot); 4809 extent_end = key.offset + 4810 btrfs_file_extent_num_bytes(leaf, ei); 4811 4812 if (extent_end > i_size) 4813 truncate_offset = extent_end; 4814 } 4815 } else { 4816 ret = 0; 4817 } 4818 4819 while (true) { 4820 leaf = path->nodes[0]; 4821 slot = path->slots[0]; 4822 4823 if (slot >= btrfs_header_nritems(leaf)) { 4824 if (ins_nr > 0) { 4825 ret = copy_items(trans, inode, dst_path, path, 4826 start_slot, ins_nr, 1, 0); 4827 if (ret < 0) 4828 goto out; 4829 ins_nr = 0; 4830 } 4831 ret = btrfs_next_leaf(root, path); 4832 if (ret < 0) 4833 goto out; 4834 if (ret > 0) { 4835 ret = 0; 4836 break; 4837 } 4838 continue; 4839 } 4840 4841 btrfs_item_key_to_cpu(leaf, &key, slot); 4842 if (key.objectid > ino) 4843 break; 4844 if (WARN_ON_ONCE(key.objectid < ino) || 4845 key.type < BTRFS_EXTENT_DATA_KEY || 4846 key.offset < i_size) { 4847 path->slots[0]++; 4848 continue; 4849 } 4850 if (!dropped_extents) { 4851 /* 4852 * Avoid logging extent items logged in past fsync calls 4853 * and leading to duplicate keys in the log tree. 4854 */ 4855 ret = truncate_inode_items(trans, root->log_root, inode, 4856 truncate_offset, 4857 BTRFS_EXTENT_DATA_KEY); 4858 if (ret) 4859 goto out; 4860 dropped_extents = true; 4861 } 4862 if (ins_nr == 0) 4863 start_slot = slot; 4864 ins_nr++; 4865 path->slots[0]++; 4866 if (!dst_path) { 4867 dst_path = btrfs_alloc_path(); 4868 if (!dst_path) { 4869 ret = -ENOMEM; 4870 goto out; 4871 } 4872 } 4873 } 4874 if (ins_nr > 0) 4875 ret = copy_items(trans, inode, dst_path, path, 4876 start_slot, ins_nr, 1, 0); 4877 out: 4878 btrfs_release_path(path); 4879 btrfs_free_path(dst_path); 4880 return ret; 4881 } 4882 4883 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4884 struct btrfs_inode *inode, 4885 struct btrfs_path *path, 4886 struct btrfs_log_ctx *ctx) 4887 { 4888 struct btrfs_ordered_extent *ordered; 4889 struct btrfs_ordered_extent *tmp; 4890 struct extent_map *em, *n; 4891 struct list_head extents; 4892 struct extent_map_tree *tree = &inode->extent_tree; 4893 int ret = 0; 4894 int num = 0; 4895 4896 INIT_LIST_HEAD(&extents); 4897 4898 write_lock(&tree->lock); 4899 4900 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4901 list_del_init(&em->list); 4902 /* 4903 * Just an arbitrary number, this can be really CPU intensive 4904 * once we start getting a lot of extents, and really once we 4905 * have a bunch of extents we just want to commit since it will 4906 * be faster. 4907 */ 4908 if (++num > 32768) { 4909 list_del_init(&tree->modified_extents); 4910 ret = -EFBIG; 4911 goto process; 4912 } 4913 4914 if (em->generation < trans->transid) 4915 continue; 4916 4917 /* We log prealloc extents beyond eof later. 
*/ 4918 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4919 em->start >= i_size_read(&inode->vfs_inode)) 4920 continue; 4921 4922 /* Need a ref to keep it from getting evicted from cache */ 4923 refcount_inc(&em->refs); 4924 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4925 list_add_tail(&em->list, &extents); 4926 num++; 4927 } 4928 4929 list_sort(NULL, &extents, extent_cmp); 4930 process: 4931 while (!list_empty(&extents)) { 4932 em = list_entry(extents.next, struct extent_map, list); 4933 4934 list_del_init(&em->list); 4935 4936 /* 4937 * If we had an error we just need to delete everybody from our 4938 * private list. 4939 */ 4940 if (ret) { 4941 clear_em_logging(tree, em); 4942 free_extent_map(em); 4943 continue; 4944 } 4945 4946 write_unlock(&tree->lock); 4947 4948 ret = log_one_extent(trans, inode, em, path, ctx); 4949 write_lock(&tree->lock); 4950 clear_em_logging(tree, em); 4951 free_extent_map(em); 4952 } 4953 WARN_ON(!list_empty(&extents)); 4954 write_unlock(&tree->lock); 4955 4956 if (!ret) 4957 ret = btrfs_log_prealloc_extents(trans, inode, path); 4958 if (ret) 4959 return ret; 4960 4961 /* 4962 * We have logged all extents successfully, now make sure the commit of 4963 * the current transaction waits for the ordered extents to complete 4964 * before it commits and wipes out the log trees, otherwise we would 4965 * lose data if an ordered extents completes after the transaction 4966 * commits and a power failure happens after the transaction commit. 4967 */ 4968 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 4969 list_del_init(&ordered->log_list); 4970 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 4971 4972 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4973 spin_lock_irq(&inode->ordered_tree.lock); 4974 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 4975 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 4976 atomic_inc(&trans->transaction->pending_ordered); 4977 } 4978 spin_unlock_irq(&inode->ordered_tree.lock); 4979 } 4980 btrfs_put_ordered_extent(ordered); 4981 } 4982 4983 return 0; 4984 } 4985 4986 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 4987 struct btrfs_path *path, u64 *size_ret) 4988 { 4989 struct btrfs_key key; 4990 int ret; 4991 4992 key.objectid = btrfs_ino(inode); 4993 key.type = BTRFS_INODE_ITEM_KEY; 4994 key.offset = 0; 4995 4996 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 4997 if (ret < 0) { 4998 return ret; 4999 } else if (ret > 0) { 5000 *size_ret = 0; 5001 } else { 5002 struct btrfs_inode_item *item; 5003 5004 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5005 struct btrfs_inode_item); 5006 *size_ret = btrfs_inode_size(path->nodes[0], item); 5007 /* 5008 * If the in-memory inode's i_size is smaller then the inode 5009 * size stored in the btree, return the inode's i_size, so 5010 * that we get a correct inode size after replaying the log 5011 * when before a power failure we had a shrinking truncate 5012 * followed by addition of a new name (rename / new hard link). 5013 * Otherwise return the inode size from the btree, to avoid 5014 * data loss when replaying a log due to previously doing a 5015 * write that expands the inode's size and logging a new name 5016 * immediately after. 5017 */ 5018 if (*size_ret > inode->vfs_inode.i_size) 5019 *size_ret = inode->vfs_inode.i_size; 5020 } 5021 5022 btrfs_release_path(path); 5023 return 0; 5024 } 5025 5026 /* 5027 * At the moment we always log all xattrs. 
This is to figure out at log replay 5028 * time which xattrs must have their deletion replayed. If a xattr is missing 5029 * in the log tree and exists in the fs/subvol tree, we delete it. This is 5030 * because if a xattr is deleted, the inode is fsynced and a power failure 5031 * happens, causing the log to be replayed the next time the fs is mounted, 5032 * we want the xattr to not exist anymore (same behaviour as other filesystems 5033 * with a journal, ext3/4, xfs, f2fs, etc). 5034 */ 5035 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 5036 struct btrfs_inode *inode, 5037 struct btrfs_path *path, 5038 struct btrfs_path *dst_path) 5039 { 5040 struct btrfs_root *root = inode->root; 5041 int ret; 5042 struct btrfs_key key; 5043 const u64 ino = btrfs_ino(inode); 5044 int ins_nr = 0; 5045 int start_slot = 0; 5046 bool found_xattrs = false; 5047 5048 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5049 return 0; 5050 5051 key.objectid = ino; 5052 key.type = BTRFS_XATTR_ITEM_KEY; 5053 key.offset = 0; 5054 5055 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5056 if (ret < 0) 5057 return ret; 5058 5059 while (true) { 5060 int slot = path->slots[0]; 5061 struct extent_buffer *leaf = path->nodes[0]; 5062 int nritems = btrfs_header_nritems(leaf); 5063 5064 if (slot >= nritems) { 5065 if (ins_nr > 0) { 5066 ret = copy_items(trans, inode, dst_path, path, 5067 start_slot, ins_nr, 1, 0); 5068 if (ret < 0) 5069 return ret; 5070 ins_nr = 0; 5071 } 5072 ret = btrfs_next_leaf(root, path); 5073 if (ret < 0) 5074 return ret; 5075 else if (ret > 0) 5076 break; 5077 continue; 5078 } 5079 5080 btrfs_item_key_to_cpu(leaf, &key, slot); 5081 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5082 break; 5083 5084 if (ins_nr == 0) 5085 start_slot = slot; 5086 ins_nr++; 5087 path->slots[0]++; 5088 found_xattrs = true; 5089 cond_resched(); 5090 } 5091 if (ins_nr > 0) { 5092 ret = copy_items(trans, inode, dst_path, path, 5093 start_slot, ins_nr, 1, 0); 5094 if (ret < 0) 5095 return ret; 5096 } 5097 5098 if (!found_xattrs) 5099 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5100 5101 return 0; 5102 } 5103 5104 /* 5105 * When using the NO_HOLES feature if we punched a hole that causes the 5106 * deletion of entire leafs or all the extent items of the first leaf (the one 5107 * that contains the inode item and references) we may end up not processing 5108 * any extents, because there are no leafs with a generation matching the 5109 * current transaction that have extent items for our inode. So we need to find 5110 * if any holes exist and then log them. We also need to log holes after any 5111 * truncate operation that changes the inode's size. 
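 *
 * For example (an illustrative sequence, assuming the NO_HOLES feature is
 * enabled):
 *
 *   xfs_io -f -c "pwrite 0 256K" -c "fsync" foo
 *   xfs_io -c "fpunch 64K 128K" foo
 *   xfs_io -c "fsync" foo
 *   <power fail>
 *
 * The hole punch removes extent items instead of inserting hole items, so if
 * we did not log the [64K, 192K) hole explicitly, a log replay could leave
 * the file still referencing the punched range.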
5112 */ 5113 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5114 struct btrfs_inode *inode, 5115 struct btrfs_path *path) 5116 { 5117 struct btrfs_root *root = inode->root; 5118 struct btrfs_fs_info *fs_info = root->fs_info; 5119 struct btrfs_key key; 5120 const u64 ino = btrfs_ino(inode); 5121 const u64 i_size = i_size_read(&inode->vfs_inode); 5122 u64 prev_extent_end = 0; 5123 int ret; 5124 5125 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5126 return 0; 5127 5128 key.objectid = ino; 5129 key.type = BTRFS_EXTENT_DATA_KEY; 5130 key.offset = 0; 5131 5132 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5133 if (ret < 0) 5134 return ret; 5135 5136 while (true) { 5137 struct extent_buffer *leaf = path->nodes[0]; 5138 5139 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5140 ret = btrfs_next_leaf(root, path); 5141 if (ret < 0) 5142 return ret; 5143 if (ret > 0) { 5144 ret = 0; 5145 break; 5146 } 5147 leaf = path->nodes[0]; 5148 } 5149 5150 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5151 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5152 break; 5153 5154 /* We have a hole, log it. */ 5155 if (prev_extent_end < key.offset) { 5156 const u64 hole_len = key.offset - prev_extent_end; 5157 5158 /* 5159 * Release the path to avoid deadlocks with other code 5160 * paths that search the root while holding locks on 5161 * leafs from the log root. 5162 */ 5163 btrfs_release_path(path); 5164 ret = btrfs_insert_hole_extent(trans, root->log_root, 5165 ino, prev_extent_end, 5166 hole_len); 5167 if (ret < 0) 5168 return ret; 5169 5170 /* 5171 * Search for the same key again in the root. Since it's 5172 * an extent item and we are holding the inode lock, the 5173 * key must still exist. If it doesn't just emit warning 5174 * and return an error to fall back to a transaction 5175 * commit. 5176 */ 5177 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5178 if (ret < 0) 5179 return ret; 5180 if (WARN_ON(ret > 0)) 5181 return -ENOENT; 5182 leaf = path->nodes[0]; 5183 } 5184 5185 prev_extent_end = btrfs_file_extent_end(path); 5186 path->slots[0]++; 5187 cond_resched(); 5188 } 5189 5190 if (prev_extent_end < i_size) { 5191 u64 hole_len; 5192 5193 btrfs_release_path(path); 5194 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5195 ret = btrfs_insert_hole_extent(trans, root->log_root, ino, 5196 prev_extent_end, hole_len); 5197 if (ret < 0) 5198 return ret; 5199 } 5200 5201 return 0; 5202 } 5203 5204 /* 5205 * When we are logging a new inode X, check if it doesn't have a reference that 5206 * matches the reference from some other inode Y created in a past transaction 5207 * and that was renamed in the current transaction. If we don't do this, then at 5208 * log replay time we can lose inode Y (and all its files if it's a directory): 5209 * 5210 * mkdir /mnt/x 5211 * echo "hello world" > /mnt/x/foobar 5212 * sync 5213 * mv /mnt/x /mnt/y 5214 * mkdir /mnt/x # or touch /mnt/x 5215 * xfs_io -c fsync /mnt/x 5216 * <power fail> 5217 * mount fs, trigger log replay 5218 * 5219 * After the log replay procedure, we would lose the first directory and all its 5220 * files (file foobar). 
5221 * For the case where inode Y is not a directory we simply end up losing it: 5222 * 5223 * echo "123" > /mnt/foo 5224 * sync 5225 * mv /mnt/foo /mnt/bar 5226 * echo "abc" > /mnt/foo 5227 * xfs_io -c fsync /mnt/foo 5228 * <power fail> 5229 * 5230 * We also need this for cases where a snapshot entry is replaced by some other 5231 * entry (file or directory) otherwise we end up with an unreplayable log due to 5232 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5233 * if it were a regular entry: 5234 * 5235 * mkdir /mnt/x 5236 * btrfs subvolume snapshot /mnt /mnt/x/snap 5237 * btrfs subvolume delete /mnt/x/snap 5238 * rmdir /mnt/x 5239 * mkdir /mnt/x 5240 * fsync /mnt/x or fsync some new file inside it 5241 * <power fail> 5242 * 5243 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5244 * the same transaction. 5245 */ 5246 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5247 const int slot, 5248 const struct btrfs_key *key, 5249 struct btrfs_inode *inode, 5250 u64 *other_ino, u64 *other_parent) 5251 { 5252 int ret; 5253 struct btrfs_path *search_path; 5254 char *name = NULL; 5255 u32 name_len = 0; 5256 u32 item_size = btrfs_item_size(eb, slot); 5257 u32 cur_offset = 0; 5258 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5259 5260 search_path = btrfs_alloc_path(); 5261 if (!search_path) 5262 return -ENOMEM; 5263 search_path->search_commit_root = 1; 5264 search_path->skip_locking = 1; 5265 5266 while (cur_offset < item_size) { 5267 u64 parent; 5268 u32 this_name_len; 5269 u32 this_len; 5270 unsigned long name_ptr; 5271 struct btrfs_dir_item *di; 5272 5273 if (key->type == BTRFS_INODE_REF_KEY) { 5274 struct btrfs_inode_ref *iref; 5275 5276 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5277 parent = key->offset; 5278 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5279 name_ptr = (unsigned long)(iref + 1); 5280 this_len = sizeof(*iref) + this_name_len; 5281 } else { 5282 struct btrfs_inode_extref *extref; 5283 5284 extref = (struct btrfs_inode_extref *)(ptr + 5285 cur_offset); 5286 parent = btrfs_inode_extref_parent(eb, extref); 5287 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5288 name_ptr = (unsigned long)&extref->name; 5289 this_len = sizeof(*extref) + this_name_len; 5290 } 5291 5292 if (this_name_len > name_len) { 5293 char *new_name; 5294 5295 new_name = krealloc(name, this_name_len, GFP_NOFS); 5296 if (!new_name) { 5297 ret = -ENOMEM; 5298 goto out; 5299 } 5300 name_len = this_name_len; 5301 name = new_name; 5302 } 5303 5304 read_extent_buffer(eb, name, name_ptr, this_name_len); 5305 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5306 parent, name, this_name_len, 0); 5307 if (di && !IS_ERR(di)) { 5308 struct btrfs_key di_key; 5309 5310 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5311 di, &di_key); 5312 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5313 if (di_key.objectid != key->objectid) { 5314 ret = 1; 5315 *other_ino = di_key.objectid; 5316 *other_parent = parent; 5317 } else { 5318 ret = 0; 5319 } 5320 } else { 5321 ret = -EAGAIN; 5322 } 5323 goto out; 5324 } else if (IS_ERR(di)) { 5325 ret = PTR_ERR(di); 5326 goto out; 5327 } 5328 btrfs_release_path(search_path); 5329 5330 cur_offset += this_len; 5331 } 5332 ret = 0; 5333 out: 5334 btrfs_free_path(search_path); 5335 kfree(name); 5336 return ret; 5337 } 5338 5339 /* 5340 * Check if we need to log an inode. 
This is used in contexts where, while
5341 * logging an inode, we need to log another inode (either just that it exists
5342 * or in full mode). This is used instead of btrfs_inode_in_log() because the latter
5343 * requires the inode to be in the log and have the log transaction committed,
5344 * while here we do not care if the log transaction was already committed - our
5345 * caller will commit the log later - and we want to avoid logging an inode
5346 * multiple times when multiple tasks have joined the same log transaction.
5347 */
5348 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5349 const struct btrfs_inode *inode)
5350 {
5351 /*
5352 * If a directory was not modified (no dentries added or removed), we can
5353 * and should avoid logging it.
5354 */
5355 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5356 return false;
5357
5358 /*
5359 * If this inode does not have new/updated/deleted xattrs since the last
5360 * time it was logged and is flagged as logged in the current transaction,
5361 * we can skip logging it. As for new/deleted names, those are updated in
5362 * the log by link/unlink/rename operations.
5363 * In case the inode was logged and then evicted and reloaded, its
5364 * logged_trans will be 0, in which case we have to fully log it since
5365 * logged_trans is a transient field, not persisted.
5366 */
5367 if (inode->logged_trans == trans->transid &&
5368 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5369 return false;
5370
5371 return true;
5372 }
5373
5374 struct btrfs_dir_list {
5375 u64 ino;
5376 struct list_head list;
5377 };
5378
5379 /*
5380 * Log the inodes of the new dentries of a directory.
5381 * See process_dir_items_leaf() for details about why it is needed.
5382 * This is a recursive operation - if an existing dentry corresponds to a
5383 * directory, that directory's new entries are logged too (same behaviour as
5384 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5385 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5386 * complains about the following circular lock dependency / possible deadlock:
5387 *
5388 *           CPU0                                        CPU1
5389 *           ----                                        ----
5390 * lock(&type->i_mutex_dir_key#3/2);
5391 *                                     lock(sb_internal#2);
5392 *                                     lock(&type->i_mutex_dir_key#3/2);
5393 * lock(&sb->s_type->i_mutex_key#14);
5394 *
5395 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5396 * sb_start_intwrite() in btrfs_start_transaction().
5397 * Not acquiring the VFS lock of the inodes is still safe because:
5398 *
5399 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5400 * that while logging the inode new references (names) are added or removed
5401 * from the inode, leaving the logged inode item with a link count that does
5402 * not match the number of logged inode reference items. This is fine because
5403 * at log replay time we compute the real number of links and correct the
5404 * link count in the inode item (see replay_one_buffer() and
5405 * link_to_fixup_dir());
5406 *
5407 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 5408 * while logging the inode's items new index items (key type 5409 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 5410 * has a size that doesn't match the sum of the lengths of all the logged 5411 * names - this is ok, not a problem, because at log replay time we set the 5412 * directory's i_size to the correct value (see replay_one_name() and 5413 * do_overwrite_item()). 5414 */ 5415 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 5416 struct btrfs_inode *start_inode, 5417 struct btrfs_log_ctx *ctx) 5418 { 5419 struct btrfs_root *root = start_inode->root; 5420 struct btrfs_fs_info *fs_info = root->fs_info; 5421 struct btrfs_path *path; 5422 LIST_HEAD(dir_list); 5423 struct btrfs_dir_list *dir_elem; 5424 u64 ino = btrfs_ino(start_inode); 5425 int ret = 0; 5426 5427 /* 5428 * If we are logging a new name, as part of a link or rename operation, 5429 * don't bother logging new dentries, as we just want to log the names 5430 * of an inode and that any new parents exist. 5431 */ 5432 if (ctx->logging_new_name) 5433 return 0; 5434 5435 path = btrfs_alloc_path(); 5436 if (!path) 5437 return -ENOMEM; 5438 5439 while (true) { 5440 struct extent_buffer *leaf; 5441 struct btrfs_key min_key; 5442 bool continue_curr_inode = true; 5443 int nritems; 5444 int i; 5445 5446 min_key.objectid = ino; 5447 min_key.type = BTRFS_DIR_INDEX_KEY; 5448 min_key.offset = 0; 5449 again: 5450 btrfs_release_path(path); 5451 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 5452 if (ret < 0) { 5453 break; 5454 } else if (ret > 0) { 5455 ret = 0; 5456 goto next; 5457 } 5458 5459 leaf = path->nodes[0]; 5460 nritems = btrfs_header_nritems(leaf); 5461 for (i = path->slots[0]; i < nritems; i++) { 5462 struct btrfs_dir_item *di; 5463 struct btrfs_key di_key; 5464 struct inode *di_inode; 5465 int log_mode = LOG_INODE_EXISTS; 5466 int type; 5467 5468 btrfs_item_key_to_cpu(leaf, &min_key, i); 5469 if (min_key.objectid != ino || 5470 min_key.type != BTRFS_DIR_INDEX_KEY) { 5471 continue_curr_inode = false; 5472 break; 5473 } 5474 5475 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 5476 type = btrfs_dir_type(leaf, di); 5477 if (btrfs_dir_transid(leaf, di) < trans->transid) 5478 continue; 5479 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 5480 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 5481 continue; 5482 5483 btrfs_release_path(path); 5484 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 5485 if (IS_ERR(di_inode)) { 5486 ret = PTR_ERR(di_inode); 5487 goto out; 5488 } 5489 5490 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 5491 btrfs_add_delayed_iput(di_inode); 5492 break; 5493 } 5494 5495 ctx->log_new_dentries = false; 5496 if (type == BTRFS_FT_DIR) 5497 log_mode = LOG_INODE_ALL; 5498 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 5499 log_mode, ctx); 5500 btrfs_add_delayed_iput(di_inode); 5501 if (ret) 5502 goto out; 5503 if (ctx->log_new_dentries) { 5504 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 5505 if (!dir_elem) { 5506 ret = -ENOMEM; 5507 goto out; 5508 } 5509 dir_elem->ino = di_key.objectid; 5510 list_add_tail(&dir_elem->list, &dir_list); 5511 } 5512 break; 5513 } 5514 5515 if (continue_curr_inode && min_key.offset < (u64)-1) { 5516 min_key.offset++; 5517 goto again; 5518 } 5519 5520 next: 5521 if (list_empty(&dir_list)) 5522 break; 5523 5524 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list); 5525 ino = dir_elem->ino; 5526 list_del(&dir_elem->list); 5527 kfree(dir_elem); 5528 } 5529 out: 5530 
btrfs_free_path(path); 5531 if (ret) { 5532 struct btrfs_dir_list *next; 5533 5534 list_for_each_entry_safe(dir_elem, next, &dir_list, list) 5535 kfree(dir_elem); 5536 } 5537 5538 return ret; 5539 } 5540 5541 struct btrfs_ino_list { 5542 u64 ino; 5543 u64 parent; 5544 struct list_head list; 5545 }; 5546 5547 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx) 5548 { 5549 struct btrfs_ino_list *curr; 5550 struct btrfs_ino_list *next; 5551 5552 list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) { 5553 list_del(&curr->list); 5554 kfree(curr); 5555 } 5556 } 5557 5558 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, 5559 struct btrfs_path *path) 5560 { 5561 struct btrfs_key key; 5562 int ret; 5563 5564 key.objectid = ino; 5565 key.type = BTRFS_INODE_ITEM_KEY; 5566 key.offset = 0; 5567 5568 path->search_commit_root = 1; 5569 path->skip_locking = 1; 5570 5571 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5572 if (WARN_ON_ONCE(ret > 0)) { 5573 /* 5574 * We have previously found the inode through the commit root 5575 * so this should not happen. If it does, just error out and 5576 * fallback to a transaction commit. 5577 */ 5578 ret = -ENOENT; 5579 } else if (ret == 0) { 5580 struct btrfs_inode_item *item; 5581 5582 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5583 struct btrfs_inode_item); 5584 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item))) 5585 ret = 1; 5586 } 5587 5588 btrfs_release_path(path); 5589 path->search_commit_root = 0; 5590 path->skip_locking = 0; 5591 5592 return ret; 5593 } 5594 5595 static int add_conflicting_inode(struct btrfs_trans_handle *trans, 5596 struct btrfs_root *root, 5597 struct btrfs_path *path, 5598 u64 ino, u64 parent, 5599 struct btrfs_log_ctx *ctx) 5600 { 5601 struct btrfs_ino_list *ino_elem; 5602 struct inode *inode; 5603 5604 /* 5605 * It's rare to have a lot of conflicting inodes, in practice it is not 5606 * common to have more than 1 or 2. We don't want to collect too many, 5607 * as we could end up logging too many inodes (even if only in 5608 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction 5609 * commits. 5610 */ 5611 if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) 5612 return BTRFS_LOG_FORCE_COMMIT; 5613 5614 inode = btrfs_iget(root->fs_info->sb, ino, root); 5615 /* 5616 * If the other inode that had a conflicting dir entry was deleted in 5617 * the current transaction then we either: 5618 * 5619 * 1) Log the parent directory (later after adding it to the list) if 5620 * the inode is a directory. This is because it may be a deleted 5621 * subvolume/snapshot or it may be a regular directory that had 5622 * deleted subvolumes/snapshots (or subdirectories that had them), 5623 * and at the moment we can't deal with dropping subvolumes/snapshots 5624 * during log replay. So we just log the parent, which will result in 5625 * a fallback to a transaction commit if we are dealing with those 5626 * cases (last_unlink_trans will match the current transaction); 5627 * 5628 * 2) Do nothing if it's not a directory. During log replay we simply 5629 * unlink the conflicting dentry from the parent directory and then 5630 * add the dentry for our inode. Like this we can avoid logging the 5631 * parent directory (and maybe fallback to a transaction commit in 5632 * case it has a last_unlink_trans == trans->transid, due to moving 5633 * some inode from it to some other directory). 
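 *
 * For example (illustrative): in the committed tree, directory 257 has an
 * entry "foo" pointing to inode 258. In the current transaction "foo" was
 * deleted and recreated pointing to inode 260, and inode 258 was deleted.
 * If inode 258 was a regular file, log replay simply unlinks the stale
 * "foo" -> 258 dentry and adds "foo" -> 260, so directory 257 does not need
 * to be logged (case 2). If inode 258 was a directory, we log directory 257
 * instead (case 1).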
5634 */ 5635 if (IS_ERR(inode)) { 5636 int ret = PTR_ERR(inode); 5637 5638 if (ret != -ENOENT) 5639 return ret; 5640 5641 ret = conflicting_inode_is_dir(root, ino, path); 5642 /* Not a directory or we got an error. */ 5643 if (ret <= 0) 5644 return ret; 5645 5646 /* Conflicting inode is a directory, so we'll log its parent. */ 5647 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5648 if (!ino_elem) 5649 return -ENOMEM; 5650 ino_elem->ino = ino; 5651 ino_elem->parent = parent; 5652 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5653 ctx->num_conflict_inodes++; 5654 5655 return 0; 5656 } 5657 5658 /* 5659 * If the inode was already logged skip it - otherwise we can hit an 5660 * infinite loop. Example: 5661 * 5662 * From the commit root (previous transaction) we have the following 5663 * inodes: 5664 * 5665 * inode 257 a directory 5666 * inode 258 with references "zz" and "zz_link" on inode 257 5667 * inode 259 with reference "a" on inode 257 5668 * 5669 * And in the current (uncommitted) transaction we have: 5670 * 5671 * inode 257 a directory, unchanged 5672 * inode 258 with references "a" and "a2" on inode 257 5673 * inode 259 with reference "zz_link" on inode 257 5674 * inode 261 with reference "zz" on inode 257 5675 * 5676 * When logging inode 261 the following infinite loop could 5677 * happen if we don't skip already logged inodes: 5678 * 5679 * - we detect inode 258 as a conflicting inode, with inode 261 5680 * on reference "zz", and log it; 5681 * 5682 * - we detect inode 259 as a conflicting inode, with inode 258 5683 * on reference "a", and log it; 5684 * 5685 * - we detect inode 258 as a conflicting inode, with inode 259 5686 * on reference "zz_link", and log it - again! After this we 5687 * repeat the above steps forever. 5688 * 5689 * Here we can use need_log_inode() because we only need to log the 5690 * inode in LOG_INODE_EXISTS mode and rename operations update the log, 5691 * so that the log ends up with the new name and without the old name. 5692 */ 5693 if (!need_log_inode(trans, BTRFS_I(inode))) { 5694 btrfs_add_delayed_iput(inode); 5695 return 0; 5696 } 5697 5698 btrfs_add_delayed_iput(inode); 5699 5700 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5701 if (!ino_elem) 5702 return -ENOMEM; 5703 ino_elem->ino = ino; 5704 ino_elem->parent = parent; 5705 list_add_tail(&ino_elem->list, &ctx->conflict_inodes); 5706 ctx->num_conflict_inodes++; 5707 5708 return 0; 5709 } 5710 5711 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5712 struct btrfs_root *root, 5713 struct btrfs_log_ctx *ctx) 5714 { 5715 struct btrfs_fs_info *fs_info = root->fs_info; 5716 int ret = 0; 5717 5718 /* 5719 * Conflicting inodes are logged by the first call to btrfs_log_inode(), 5720 * otherwise we could have unbounded recursion of btrfs_log_inode() 5721 * calls. This check guarantees we can have only 1 level of recursion. 5722 */ 5723 if (ctx->logging_conflict_inodes) 5724 return 0; 5725 5726 ctx->logging_conflict_inodes = true; 5727 5728 /* 5729 * New conflicting inodes may be found and added to the list while we 5730 * are logging a conflicting inode, so keep iterating while the list is 5731 * not empty. 
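 *
 * For example (illustrative): logging inode A found conflicting inode B and
 * queued it. While this loop logs B, a conflict with inode C may be detected,
 * in which case C is appended to ctx->conflict_inodes and handled by a later
 * iteration of this loop instead of a recursive call.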
5732 */ 5733 while (!list_empty(&ctx->conflict_inodes)) { 5734 struct btrfs_ino_list *curr; 5735 struct inode *inode; 5736 u64 ino; 5737 u64 parent; 5738 5739 curr = list_first_entry(&ctx->conflict_inodes, 5740 struct btrfs_ino_list, list); 5741 ino = curr->ino; 5742 parent = curr->parent; 5743 list_del(&curr->list); 5744 kfree(curr); 5745 5746 inode = btrfs_iget(fs_info->sb, ino, root); 5747 /* 5748 * If the other inode that had a conflicting dir entry was 5749 * deleted in the current transaction, we need to log its parent 5750 * directory. See the comment at add_conflicting_inode(). 5751 */ 5752 if (IS_ERR(inode)) { 5753 ret = PTR_ERR(inode); 5754 if (ret != -ENOENT) 5755 break; 5756 5757 inode = btrfs_iget(fs_info->sb, parent, root); 5758 if (IS_ERR(inode)) { 5759 ret = PTR_ERR(inode); 5760 break; 5761 } 5762 5763 /* 5764 * Always log the directory, we cannot make this 5765 * conditional on need_log_inode() because the directory 5766 * might have been logged in LOG_INODE_EXISTS mode or 5767 * the dir index of the conflicting inode is not in a 5768 * dir index key range logged for the directory. So we 5769 * must make sure the deletion is recorded. 5770 */ 5771 ret = btrfs_log_inode(trans, BTRFS_I(inode), 5772 LOG_INODE_ALL, ctx); 5773 btrfs_add_delayed_iput(inode); 5774 if (ret) 5775 break; 5776 continue; 5777 } 5778 5779 /* 5780 * Here we can use need_log_inode() because we only need to log 5781 * the inode in LOG_INODE_EXISTS mode and rename operations 5782 * update the log, so that the log ends up with the new name and 5783 * without the old name. 5784 * 5785 * We did this check at add_conflicting_inode(), but here we do 5786 * it again because if some other task logged the inode after 5787 * that, we can avoid doing it again. 5788 */ 5789 if (!need_log_inode(trans, BTRFS_I(inode))) { 5790 btrfs_add_delayed_iput(inode); 5791 continue; 5792 } 5793 5794 /* 5795 * We are safe logging the other inode without acquiring its 5796 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5797 * are safe against concurrent renames of the other inode as 5798 * well because during a rename we pin the log and update the 5799 * log with the new name before we unpin it. 
5800 */ 5801 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); 5802 btrfs_add_delayed_iput(inode); 5803 if (ret) 5804 break; 5805 } 5806 5807 ctx->logging_conflict_inodes = false; 5808 if (ret) 5809 free_conflicting_inodes(ctx); 5810 5811 return ret; 5812 } 5813 5814 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5815 struct btrfs_inode *inode, 5816 struct btrfs_key *min_key, 5817 const struct btrfs_key *max_key, 5818 struct btrfs_path *path, 5819 struct btrfs_path *dst_path, 5820 const u64 logged_isize, 5821 const int inode_only, 5822 struct btrfs_log_ctx *ctx, 5823 bool *need_log_inode_item) 5824 { 5825 const u64 i_size = i_size_read(&inode->vfs_inode); 5826 struct btrfs_root *root = inode->root; 5827 int ins_start_slot = 0; 5828 int ins_nr = 0; 5829 int ret; 5830 5831 while (1) { 5832 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5833 if (ret < 0) 5834 return ret; 5835 if (ret > 0) { 5836 ret = 0; 5837 break; 5838 } 5839 again: 5840 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5841 if (min_key->objectid != max_key->objectid) 5842 break; 5843 if (min_key->type > max_key->type) 5844 break; 5845 5846 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5847 *need_log_inode_item = false; 5848 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5849 min_key->offset >= i_size) { 5850 /* 5851 * Extents at and beyond eof are logged with 5852 * btrfs_log_prealloc_extents(). 5853 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5854 * and no keys greater than that, so bail out. 5855 */ 5856 break; 5857 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5858 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5859 (inode->generation == trans->transid || 5860 ctx->logging_conflict_inodes)) { 5861 u64 other_ino = 0; 5862 u64 other_parent = 0; 5863 5864 ret = btrfs_check_ref_name_override(path->nodes[0], 5865 path->slots[0], min_key, inode, 5866 &other_ino, &other_parent); 5867 if (ret < 0) { 5868 return ret; 5869 } else if (ret > 0 && 5870 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5871 if (ins_nr > 0) { 5872 ins_nr++; 5873 } else { 5874 ins_nr = 1; 5875 ins_start_slot = path->slots[0]; 5876 } 5877 ret = copy_items(trans, inode, dst_path, path, 5878 ins_start_slot, ins_nr, 5879 inode_only, logged_isize); 5880 if (ret < 0) 5881 return ret; 5882 ins_nr = 0; 5883 5884 btrfs_release_path(path); 5885 ret = add_conflicting_inode(trans, root, path, 5886 other_ino, 5887 other_parent, ctx); 5888 if (ret) 5889 return ret; 5890 goto next_key; 5891 } 5892 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5893 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5894 if (ins_nr == 0) 5895 goto next_slot; 5896 ret = copy_items(trans, inode, dst_path, path, 5897 ins_start_slot, 5898 ins_nr, inode_only, logged_isize); 5899 if (ret < 0) 5900 return ret; 5901 ins_nr = 0; 5902 goto next_slot; 5903 } 5904 5905 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5906 ins_nr++; 5907 goto next_slot; 5908 } else if (!ins_nr) { 5909 ins_start_slot = path->slots[0]; 5910 ins_nr = 1; 5911 goto next_slot; 5912 } 5913 5914 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5915 ins_nr, inode_only, logged_isize); 5916 if (ret < 0) 5917 return ret; 5918 ins_nr = 1; 5919 ins_start_slot = path->slots[0]; 5920 next_slot: 5921 path->slots[0]++; 5922 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5923 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5924 path->slots[0]); 5925 goto again; 5926 } 5927 if (ins_nr) { 5928 ret = 
copy_items(trans, inode, dst_path, path, 5929 ins_start_slot, ins_nr, inode_only, 5930 logged_isize); 5931 if (ret < 0) 5932 return ret; 5933 ins_nr = 0; 5934 } 5935 btrfs_release_path(path); 5936 next_key: 5937 if (min_key->offset < (u64)-1) { 5938 min_key->offset++; 5939 } else if (min_key->type < max_key->type) { 5940 min_key->type++; 5941 min_key->offset = 0; 5942 } else { 5943 break; 5944 } 5945 5946 /* 5947 * We may process many leaves full of items for our inode, so 5948 * avoid monopolizing a cpu for too long by rescheduling while 5949 * not holding locks on any tree. 5950 */ 5951 cond_resched(); 5952 } 5953 if (ins_nr) { 5954 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5955 ins_nr, inode_only, logged_isize); 5956 if (ret) 5957 return ret; 5958 } 5959 5960 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5961 /* 5962 * Release the path because otherwise we might attempt to double 5963 * lock the same leaf with btrfs_log_prealloc_extents() below. 5964 */ 5965 btrfs_release_path(path); 5966 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5967 } 5968 5969 return ret; 5970 } 5971 5972 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans, 5973 struct btrfs_root *log, 5974 struct btrfs_path *path, 5975 const struct btrfs_item_batch *batch, 5976 const struct btrfs_delayed_item *first_item) 5977 { 5978 const struct btrfs_delayed_item *curr = first_item; 5979 int ret; 5980 5981 ret = btrfs_insert_empty_items(trans, log, path, batch); 5982 if (ret) 5983 return ret; 5984 5985 for (int i = 0; i < batch->nr; i++) { 5986 char *data_ptr; 5987 5988 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); 5989 write_extent_buffer(path->nodes[0], &curr->data, 5990 (unsigned long)data_ptr, curr->data_len); 5991 curr = list_next_entry(curr, log_list); 5992 path->slots[0]++; 5993 } 5994 5995 btrfs_release_path(path); 5996 5997 return 0; 5998 } 5999 6000 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, 6001 struct btrfs_inode *inode, 6002 struct btrfs_path *path, 6003 const struct list_head *delayed_ins_list, 6004 struct btrfs_log_ctx *ctx) 6005 { 6006 /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */ 6007 const int max_batch_size = 195; 6008 const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info); 6009 const u64 ino = btrfs_ino(inode); 6010 struct btrfs_root *log = inode->root->log_root; 6011 struct btrfs_item_batch batch = { 6012 .nr = 0, 6013 .total_data_size = 0, 6014 }; 6015 const struct btrfs_delayed_item *first = NULL; 6016 const struct btrfs_delayed_item *curr; 6017 char *ins_data; 6018 struct btrfs_key *ins_keys; 6019 u32 *ins_sizes; 6020 u64 curr_batch_size = 0; 6021 int batch_idx = 0; 6022 int ret; 6023 6024 /* We are adding dir index items to the log tree. */ 6025 lockdep_assert_held(&inode->log_mutex); 6026 6027 /* 6028 * We collect delayed items before copying index keys from the subvolume 6029 * to the log tree. However just after we collected them, they may have 6030 * been flushed (all of them or just some of them), and therefore we 6031 * could have copied them from the subvolume tree to the log tree. 6032 * So find the first delayed item that was not yet logged (they are 6033 * sorted by index number). 6034 */ 6035 list_for_each_entry(curr, delayed_ins_list, log_list) { 6036 if (curr->index > inode->last_dir_index_offset) { 6037 first = curr; 6038 break; 6039 } 6040 } 6041 6042 /* Empty list or all delayed items were already logged. 
*/ 6043 if (!first) 6044 return 0; 6045 6046 ins_data = kmalloc(max_batch_size * sizeof(u32) + 6047 max_batch_size * sizeof(struct btrfs_key), GFP_NOFS); 6048 if (!ins_data) 6049 return -ENOMEM; 6050 ins_sizes = (u32 *)ins_data; 6051 batch.data_sizes = ins_sizes; 6052 ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32)); 6053 batch.keys = ins_keys; 6054 6055 curr = first; 6056 while (!list_entry_is_head(curr, delayed_ins_list, log_list)) { 6057 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item); 6058 6059 if (curr_batch_size + curr_size > leaf_data_size || 6060 batch.nr == max_batch_size) { 6061 ret = insert_delayed_items_batch(trans, log, path, 6062 &batch, first); 6063 if (ret) 6064 goto out; 6065 batch_idx = 0; 6066 batch.nr = 0; 6067 batch.total_data_size = 0; 6068 curr_batch_size = 0; 6069 first = curr; 6070 } 6071 6072 ins_sizes[batch_idx] = curr->data_len; 6073 ins_keys[batch_idx].objectid = ino; 6074 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY; 6075 ins_keys[batch_idx].offset = curr->index; 6076 curr_batch_size += curr_size; 6077 batch.total_data_size += curr->data_len; 6078 batch.nr++; 6079 batch_idx++; 6080 curr = list_next_entry(curr, log_list); 6081 } 6082 6083 ASSERT(batch.nr >= 1); 6084 ret = insert_delayed_items_batch(trans, log, path, &batch, first); 6085 6086 curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, 6087 log_list); 6088 inode->last_dir_index_offset = curr->index; 6089 out: 6090 kfree(ins_data); 6091 6092 return ret; 6093 } 6094 6095 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, 6096 struct btrfs_inode *inode, 6097 struct btrfs_path *path, 6098 const struct list_head *delayed_del_list, 6099 struct btrfs_log_ctx *ctx) 6100 { 6101 const u64 ino = btrfs_ino(inode); 6102 const struct btrfs_delayed_item *curr; 6103 6104 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6105 log_list); 6106 6107 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6108 u64 first_dir_index = curr->index; 6109 u64 last_dir_index; 6110 const struct btrfs_delayed_item *next; 6111 int ret; 6112 6113 /* 6114 * Find a range of consecutive dir index items to delete. Like 6115 * this we log a single dir range item spanning several contiguous 6116 * dir items instead of logging one range item per dir index item. 
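 *
 * For example, if the delayed deletions are for the dir indexes 5, 6,
 * 7, 10 and 11, we insert only two range items, one covering [5, 7] and
 * another covering [10, 11], instead of five single index items.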
6117 */ 6118 next = list_next_entry(curr, log_list); 6119 while (!list_entry_is_head(next, delayed_del_list, log_list)) { 6120 if (next->index != curr->index + 1) 6121 break; 6122 curr = next; 6123 next = list_next_entry(next, log_list); 6124 } 6125 6126 last_dir_index = curr->index; 6127 ASSERT(last_dir_index >= first_dir_index); 6128 6129 ret = insert_dir_log_key(trans, inode->root->log_root, path, 6130 ino, first_dir_index, last_dir_index); 6131 if (ret) 6132 return ret; 6133 curr = list_next_entry(curr, log_list); 6134 } 6135 6136 return 0; 6137 } 6138 6139 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, 6140 struct btrfs_inode *inode, 6141 struct btrfs_path *path, 6142 struct btrfs_log_ctx *ctx, 6143 const struct list_head *delayed_del_list, 6144 const struct btrfs_delayed_item *first, 6145 const struct btrfs_delayed_item **last_ret) 6146 { 6147 const struct btrfs_delayed_item *next; 6148 struct extent_buffer *leaf = path->nodes[0]; 6149 const int last_slot = btrfs_header_nritems(leaf) - 1; 6150 int slot = path->slots[0] + 1; 6151 const u64 ino = btrfs_ino(inode); 6152 6153 next = list_next_entry(first, log_list); 6154 6155 while (slot < last_slot && 6156 !list_entry_is_head(next, delayed_del_list, log_list)) { 6157 struct btrfs_key key; 6158 6159 btrfs_item_key_to_cpu(leaf, &key, slot); 6160 if (key.objectid != ino || 6161 key.type != BTRFS_DIR_INDEX_KEY || 6162 key.offset != next->index) 6163 break; 6164 6165 slot++; 6166 *last_ret = next; 6167 next = list_next_entry(next, log_list); 6168 } 6169 6170 return btrfs_del_items(trans, inode->root->log_root, path, 6171 path->slots[0], slot - path->slots[0]); 6172 } 6173 6174 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, 6175 struct btrfs_inode *inode, 6176 struct btrfs_path *path, 6177 const struct list_head *delayed_del_list, 6178 struct btrfs_log_ctx *ctx) 6179 { 6180 struct btrfs_root *log = inode->root->log_root; 6181 const struct btrfs_delayed_item *curr; 6182 u64 last_range_start; 6183 u64 last_range_end = 0; 6184 struct btrfs_key key; 6185 6186 key.objectid = btrfs_ino(inode); 6187 key.type = BTRFS_DIR_INDEX_KEY; 6188 curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item, 6189 log_list); 6190 6191 while (!list_entry_is_head(curr, delayed_del_list, log_list)) { 6192 const struct btrfs_delayed_item *last = curr; 6193 u64 first_dir_index = curr->index; 6194 u64 last_dir_index; 6195 bool deleted_items = false; 6196 int ret; 6197 6198 key.offset = curr->index; 6199 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 6200 if (ret < 0) { 6201 return ret; 6202 } else if (ret == 0) { 6203 ret = batch_delete_dir_index_items(trans, inode, path, ctx, 6204 delayed_del_list, curr, 6205 &last); 6206 if (ret) 6207 return ret; 6208 deleted_items = true; 6209 } 6210 6211 btrfs_release_path(path); 6212 6213 /* 6214 * If we deleted items from the leaf, it means we have a range 6215 * item logging their range, so no need to add one or update an 6216 * existing one. Otherwise we have to log a dir range item. 6217 */ 6218 if (deleted_items) 6219 goto next_batch; 6220 6221 last_dir_index = last->index; 6222 ASSERT(last_dir_index >= first_dir_index); 6223 /* 6224 * If this range starts right after where the previous one ends, 6225 * then we want to reuse the previous range item and change its 6226 * end offset to the end of this range. This is just to minimize 6227 * leaf space usage, by avoiding adding a new range item. 
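 *
 * For example, if the previous iteration logged the range [10, 15] and
 * the current batch covers the indexes 16 to 20, we log [10, 20] by
 * reusing the range item keyed at offset 10, instead of adding a
 * second item for [16, 20].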
6228 */ 6229 if (last_range_end != 0 && first_dir_index == last_range_end + 1) 6230 first_dir_index = last_range_start; 6231 6232 ret = insert_dir_log_key(trans, log, path, key.objectid, 6233 first_dir_index, last_dir_index); 6234 if (ret) 6235 return ret; 6236 6237 last_range_start = first_dir_index; 6238 last_range_end = last_dir_index; 6239 next_batch: 6240 curr = list_next_entry(last, log_list); 6241 } 6242 6243 return 0; 6244 } 6245 6246 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans, 6247 struct btrfs_inode *inode, 6248 struct btrfs_path *path, 6249 const struct list_head *delayed_del_list, 6250 struct btrfs_log_ctx *ctx) 6251 { 6252 /* 6253 * We are deleting dir index items from the log tree or adding range 6254 * items to it. 6255 */ 6256 lockdep_assert_held(&inode->log_mutex); 6257 6258 if (list_empty(delayed_del_list)) 6259 return 0; 6260 6261 if (ctx->logged_before) 6262 return log_delayed_deletions_incremental(trans, inode, path, 6263 delayed_del_list, ctx); 6264 6265 return log_delayed_deletions_full(trans, inode, path, delayed_del_list, 6266 ctx); 6267 } 6268 6269 /* 6270 * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed 6271 * items instead of the subvolume tree. 6272 */ 6273 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, 6274 struct btrfs_inode *inode, 6275 const struct list_head *delayed_ins_list, 6276 struct btrfs_log_ctx *ctx) 6277 { 6278 const bool orig_log_new_dentries = ctx->log_new_dentries; 6279 struct btrfs_fs_info *fs_info = trans->fs_info; 6280 struct btrfs_delayed_item *item; 6281 int ret = 0; 6282 6283 /* 6284 * No need for the log mutex, plus to avoid potential deadlocks or 6285 * lockdep annotations due to nesting of delayed inode mutexes and log 6286 * mutexes. 6287 */ 6288 lockdep_assert_not_held(&inode->log_mutex); 6289 6290 ASSERT(!ctx->logging_new_delayed_dentries); 6291 ctx->logging_new_delayed_dentries = true; 6292 6293 list_for_each_entry(item, delayed_ins_list, log_list) { 6294 struct btrfs_dir_item *dir_item; 6295 struct inode *di_inode; 6296 struct btrfs_key key; 6297 int log_mode = LOG_INODE_EXISTS; 6298 6299 dir_item = (struct btrfs_dir_item *)item->data; 6300 btrfs_disk_key_to_cpu(&key, &dir_item->location); 6301 6302 if (key.type == BTRFS_ROOT_ITEM_KEY) 6303 continue; 6304 6305 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root); 6306 if (IS_ERR(di_inode)) { 6307 ret = PTR_ERR(di_inode); 6308 break; 6309 } 6310 6311 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6312 btrfs_add_delayed_iput(di_inode); 6313 continue; 6314 } 6315 6316 if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR) 6317 log_mode = LOG_INODE_ALL; 6318 6319 ctx->log_new_dentries = false; 6320 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); 6321 6322 if (!ret && ctx->log_new_dentries) 6323 ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); 6324 6325 btrfs_add_delayed_iput(di_inode); 6326 6327 if (ret) 6328 break; 6329 } 6330 6331 ctx->log_new_dentries = orig_log_new_dentries; 6332 ctx->logging_new_delayed_dentries = false; 6333 6334 return ret; 6335 } 6336 6337 /* log a single inode in the tree log. 6338 * At least one parent directory for this inode must exist in the tree 6339 * or be logged already. 6340 * 6341 * Any items from this inode changed by the current transaction are copied 6342 * to the log tree. 
An extra reference is taken on any extents in this 6343 * file, allowing us to avoid a whole pile of corner cases around logging 6344 * blocks that have been removed from the tree. 6345 * 6346 * See LOG_INODE_ALL and related defines for a description of what inode_only 6347 * does. 6348 * 6349 * This handles both files and directories. 6350 */ 6351 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 6352 struct btrfs_inode *inode, 6353 int inode_only, 6354 struct btrfs_log_ctx *ctx) 6355 { 6356 struct btrfs_path *path; 6357 struct btrfs_path *dst_path; 6358 struct btrfs_key min_key; 6359 struct btrfs_key max_key; 6360 struct btrfs_root *log = inode->root->log_root; 6361 int ret; 6362 bool fast_search = false; 6363 u64 ino = btrfs_ino(inode); 6364 struct extent_map_tree *em_tree = &inode->extent_tree; 6365 u64 logged_isize = 0; 6366 bool need_log_inode_item = true; 6367 bool xattrs_logged = false; 6368 bool inode_item_dropped = true; 6369 bool full_dir_logging = false; 6370 LIST_HEAD(delayed_ins_list); 6371 LIST_HEAD(delayed_del_list); 6372 6373 path = btrfs_alloc_path(); 6374 if (!path) 6375 return -ENOMEM; 6376 dst_path = btrfs_alloc_path(); 6377 if (!dst_path) { 6378 btrfs_free_path(path); 6379 return -ENOMEM; 6380 } 6381 6382 min_key.objectid = ino; 6383 min_key.type = BTRFS_INODE_ITEM_KEY; 6384 min_key.offset = 0; 6385 6386 max_key.objectid = ino; 6387 6388 6389 /* today the code can only do partial logging of directories */ 6390 if (S_ISDIR(inode->vfs_inode.i_mode) || 6391 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6392 &inode->runtime_flags) && 6393 inode_only >= LOG_INODE_EXISTS)) 6394 max_key.type = BTRFS_XATTR_ITEM_KEY; 6395 else 6396 max_key.type = (u8)-1; 6397 max_key.offset = (u64)-1; 6398 6399 if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL) 6400 full_dir_logging = true; 6401 6402 /* 6403 * If we are logging a directory while we are logging dentries of the 6404 * delayed items of some other inode, then we need to flush the delayed 6405 * items of this directory and not log the delayed items directly. This 6406 * is to prevent more than one level of recursion into btrfs_log_inode() 6407 * by having something like this: 6408 * 6409 * $ mkdir -p a/b/c/d/e/f/g/h/... 6410 * $ xfs_io -c "fsync" a 6411 * 6412 * Where all directories in the path did not exist before and are 6413 * created in the current transaction. 6414 * So in such a case we directly log the delayed items of the main 6415 * directory ("a") without flushing them first, while for each of its 6416 * subdirectories we flush their delayed items before logging them. 6417 * This prevents a potential unbounded recursion like this: 6418 * 6419 * btrfs_log_inode() 6420 * log_new_delayed_dentries() 6421 * btrfs_log_inode() 6422 * log_new_delayed_dentries() 6423 * btrfs_log_inode() 6424 * log_new_delayed_dentries() 6425 * (...) 6426 * 6427 * We have thresholds for the maximum number of delayed items to have in 6428 * memory, and once they are hit, the items are flushed asynchronously. 6429 * However the limit is quite high, so lets prevent deep levels of 6430 * recursion to happen by limiting the maximum depth to be 1. 
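 *
 * Concretely, when ctx->logging_new_delayed_dentries is already set we
 * know we were called from log_new_delayed_dentries(), so instead of
 * logging this directory's delayed items directly we flush them first
 * (the call to btrfs_commit_inode_delayed_items() below), which caps
 * the recursion depth at 1.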
6431 */ 6432 if (full_dir_logging && ctx->logging_new_delayed_dentries) { 6433 ret = btrfs_commit_inode_delayed_items(trans, inode); 6434 if (ret) 6435 goto out; 6436 } 6437 6438 mutex_lock(&inode->log_mutex); 6439 6440 /* 6441 * For symlinks, we must always log their content, which is stored in an 6442 * inline extent, otherwise we could end up with an empty symlink after 6443 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if 6444 * one attempts to create an empty symlink). 6445 * We don't need to worry about flushing delalloc, because when we create 6446 * the inline extent when the symlink is created (we never have delalloc 6447 * for symlinks). 6448 */ 6449 if (S_ISLNK(inode->vfs_inode.i_mode)) 6450 inode_only = LOG_INODE_ALL; 6451 6452 /* 6453 * Before logging the inode item, cache the value returned by 6454 * inode_logged(), because after that we have the need to figure out if 6455 * the inode was previously logged in this transaction. 6456 */ 6457 ret = inode_logged(trans, inode, path); 6458 if (ret < 0) 6459 goto out_unlock; 6460 ctx->logged_before = (ret == 1); 6461 ret = 0; 6462 6463 /* 6464 * This is for cases where logging a directory could result in losing a 6465 * a file after replaying the log. For example, if we move a file from a 6466 * directory A to a directory B, then fsync directory A, we have no way 6467 * to known the file was moved from A to B, so logging just A would 6468 * result in losing the file after a log replay. 6469 */ 6470 if (full_dir_logging && inode->last_unlink_trans >= trans->transid) { 6471 btrfs_set_log_full_commit(trans); 6472 ret = BTRFS_LOG_FORCE_COMMIT; 6473 goto out_unlock; 6474 } 6475 6476 /* 6477 * a brute force approach to making sure we get the most uptodate 6478 * copies of everything. 6479 */ 6480 if (S_ISDIR(inode->vfs_inode.i_mode)) { 6481 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 6482 if (ctx->logged_before) 6483 ret = drop_inode_items(trans, log, path, inode, 6484 BTRFS_XATTR_ITEM_KEY); 6485 } else { 6486 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 6487 /* 6488 * Make sure the new inode item we write to the log has 6489 * the same isize as the current one (if it exists). 6490 * This is necessary to prevent data loss after log 6491 * replay, and also to prevent doing a wrong expanding 6492 * truncate - for e.g. create file, write 4K into offset 6493 * 0, fsync, write 4K into offset 4096, add hard link, 6494 * fsync some other file (to sync log), power fail - if 6495 * we use the inode's current i_size, after log replay 6496 * we get a 8Kb file, with the last 4Kb extent as a hole 6497 * (zeroes), as if an expanding truncate happened, 6498 * instead of getting a file of 4Kb only. 
6499 */ 6500 ret = logged_inode_size(log, inode, path, &logged_isize); 6501 if (ret) 6502 goto out_unlock; 6503 } 6504 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6505 &inode->runtime_flags)) { 6506 if (inode_only == LOG_INODE_EXISTS) { 6507 max_key.type = BTRFS_XATTR_ITEM_KEY; 6508 if (ctx->logged_before) 6509 ret = drop_inode_items(trans, log, path, 6510 inode, max_key.type); 6511 } else { 6512 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 6513 &inode->runtime_flags); 6514 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6515 &inode->runtime_flags); 6516 if (ctx->logged_before) 6517 ret = truncate_inode_items(trans, log, 6518 inode, 0, 0); 6519 } 6520 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 6521 &inode->runtime_flags) || 6522 inode_only == LOG_INODE_EXISTS) { 6523 if (inode_only == LOG_INODE_ALL) 6524 fast_search = true; 6525 max_key.type = BTRFS_XATTR_ITEM_KEY; 6526 if (ctx->logged_before) 6527 ret = drop_inode_items(trans, log, path, inode, 6528 max_key.type); 6529 } else { 6530 if (inode_only == LOG_INODE_ALL) 6531 fast_search = true; 6532 inode_item_dropped = false; 6533 goto log_extents; 6534 } 6535 6536 } 6537 if (ret) 6538 goto out_unlock; 6539 6540 /* 6541 * If we are logging a directory in full mode, collect the delayed items 6542 * before iterating the subvolume tree, so that we don't miss any new 6543 * dir index items in case they get flushed while or right after we are 6544 * iterating the subvolume tree. 6545 */ 6546 if (full_dir_logging && !ctx->logging_new_delayed_dentries) 6547 btrfs_log_get_delayed_items(inode, &delayed_ins_list, 6548 &delayed_del_list); 6549 6550 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 6551 path, dst_path, logged_isize, 6552 inode_only, ctx, 6553 &need_log_inode_item); 6554 if (ret) 6555 goto out_unlock; 6556 6557 btrfs_release_path(path); 6558 btrfs_release_path(dst_path); 6559 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 6560 if (ret) 6561 goto out_unlock; 6562 xattrs_logged = true; 6563 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 6564 btrfs_release_path(path); 6565 btrfs_release_path(dst_path); 6566 ret = btrfs_log_holes(trans, inode, path); 6567 if (ret) 6568 goto out_unlock; 6569 } 6570 log_extents: 6571 btrfs_release_path(path); 6572 btrfs_release_path(dst_path); 6573 if (need_log_inode_item) { 6574 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 6575 if (ret) 6576 goto out_unlock; 6577 /* 6578 * If we are doing a fast fsync and the inode was logged before 6579 * in this transaction, we don't need to log the xattrs because 6580 * they were logged before. If xattrs were added, changed or 6581 * deleted since the last time we logged the inode, then we have 6582 * already logged them because the inode had the runtime flag 6583 * BTRFS_INODE_COPY_EVERYTHING set. 
6584 */ 6585 if (!xattrs_logged && inode->logged_trans < trans->transid) { 6586 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 6587 if (ret) 6588 goto out_unlock; 6589 btrfs_release_path(path); 6590 } 6591 } 6592 if (fast_search) { 6593 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 6594 if (ret) 6595 goto out_unlock; 6596 } else if (inode_only == LOG_INODE_ALL) { 6597 struct extent_map *em, *n; 6598 6599 write_lock(&em_tree->lock); 6600 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 6601 list_del_init(&em->list); 6602 write_unlock(&em_tree->lock); 6603 } 6604 6605 if (full_dir_logging) { 6606 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 6607 if (ret) 6608 goto out_unlock; 6609 ret = log_delayed_insertion_items(trans, inode, path, 6610 &delayed_ins_list, ctx); 6611 if (ret) 6612 goto out_unlock; 6613 ret = log_delayed_deletion_items(trans, inode, path, 6614 &delayed_del_list, ctx); 6615 if (ret) 6616 goto out_unlock; 6617 } 6618 6619 spin_lock(&inode->lock); 6620 inode->logged_trans = trans->transid; 6621 /* 6622 * Don't update last_log_commit if we logged that an inode exists. 6623 * We do this for three reasons: 6624 * 6625 * 1) We might have had buffered writes to this inode that were 6626 * flushed and had their ordered extents completed in this 6627 * transaction, but we did not previously log the inode with 6628 * LOG_INODE_ALL. Later the inode was evicted and after that 6629 * it was loaded again and this LOG_INODE_EXISTS log operation 6630 * happened. We must make sure that if an explicit fsync against 6631 * the inode is performed later, it logs the new extents, an 6632 * updated inode item, etc, and syncs the log. The same logic 6633 * applies to direct IO writes instead of buffered writes. 6634 * 6635 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 6636 * is logged with an i_size of 0 or whatever value was logged 6637 * before. If later the i_size of the inode is increased by a 6638 * truncate operation, the log is synced through an fsync of 6639 * some other inode and then finally an explicit fsync against 6640 * this inode is made, we must make sure this fsync logs the 6641 * inode with the new i_size, the hole between old i_size and 6642 * the new i_size, and syncs the log. 6643 * 6644 * 3) If we are logging that an ancestor inode exists as part of 6645 * logging a new name from a link or rename operation, don't update 6646 * its last_log_commit - otherwise if an explicit fsync is made 6647 * against an ancestor, the fsync considers the inode in the log 6648 * and doesn't sync the log, resulting in the ancestor missing after 6649 * a power failure unless the log was synced as part of an fsync 6650 * against any other unrelated inode. 6651 */ 6652 if (inode_only != LOG_INODE_EXISTS) 6653 inode->last_log_commit = inode->last_sub_trans; 6654 spin_unlock(&inode->lock); 6655 6656 /* 6657 * Reset the last_reflink_trans so that the next fsync does not need to 6658 * go through the slower path when logging extents and their checksums. 
6659 */ 6660 if (inode_only == LOG_INODE_ALL) 6661 inode->last_reflink_trans = 0; 6662 6663 out_unlock: 6664 mutex_unlock(&inode->log_mutex); 6665 out: 6666 btrfs_free_path(path); 6667 btrfs_free_path(dst_path); 6668 6669 if (ret) 6670 free_conflicting_inodes(ctx); 6671 else 6672 ret = log_conflicting_inodes(trans, inode->root, ctx); 6673 6674 if (full_dir_logging && !ctx->logging_new_delayed_dentries) { 6675 if (!ret) 6676 ret = log_new_delayed_dentries(trans, inode, 6677 &delayed_ins_list, ctx); 6678 6679 btrfs_log_put_delayed_items(inode, &delayed_ins_list, 6680 &delayed_del_list); 6681 } 6682 6683 return ret; 6684 } 6685 6686 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6687 struct btrfs_inode *inode, 6688 struct btrfs_log_ctx *ctx) 6689 { 6690 struct btrfs_fs_info *fs_info = trans->fs_info; 6691 int ret; 6692 struct btrfs_path *path; 6693 struct btrfs_key key; 6694 struct btrfs_root *root = inode->root; 6695 const u64 ino = btrfs_ino(inode); 6696 6697 path = btrfs_alloc_path(); 6698 if (!path) 6699 return -ENOMEM; 6700 path->skip_locking = 1; 6701 path->search_commit_root = 1; 6702 6703 key.objectid = ino; 6704 key.type = BTRFS_INODE_REF_KEY; 6705 key.offset = 0; 6706 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6707 if (ret < 0) 6708 goto out; 6709 6710 while (true) { 6711 struct extent_buffer *leaf = path->nodes[0]; 6712 int slot = path->slots[0]; 6713 u32 cur_offset = 0; 6714 u32 item_size; 6715 unsigned long ptr; 6716 6717 if (slot >= btrfs_header_nritems(leaf)) { 6718 ret = btrfs_next_leaf(root, path); 6719 if (ret < 0) 6720 goto out; 6721 else if (ret > 0) 6722 break; 6723 continue; 6724 } 6725 6726 btrfs_item_key_to_cpu(leaf, &key, slot); 6727 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6728 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6729 break; 6730 6731 item_size = btrfs_item_size(leaf, slot); 6732 ptr = btrfs_item_ptr_offset(leaf, slot); 6733 while (cur_offset < item_size) { 6734 struct btrfs_key inode_key; 6735 struct inode *dir_inode; 6736 6737 inode_key.type = BTRFS_INODE_ITEM_KEY; 6738 inode_key.offset = 0; 6739 6740 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6741 struct btrfs_inode_extref *extref; 6742 6743 extref = (struct btrfs_inode_extref *) 6744 (ptr + cur_offset); 6745 inode_key.objectid = btrfs_inode_extref_parent( 6746 leaf, extref); 6747 cur_offset += sizeof(*extref); 6748 cur_offset += btrfs_inode_extref_name_len(leaf, 6749 extref); 6750 } else { 6751 inode_key.objectid = key.offset; 6752 cur_offset = item_size; 6753 } 6754 6755 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6756 root); 6757 /* 6758 * If the parent inode was deleted, return an error to 6759 * fallback to a transaction commit. This is to prevent 6760 * getting an inode that was moved from one parent A to 6761 * a parent B, got its former parent A deleted and then 6762 * it got fsync'ed, from existing at both parents after 6763 * a log replay (and the old parent still existing). 6764 * Example: 6765 * 6766 * mkdir /mnt/A 6767 * mkdir /mnt/B 6768 * touch /mnt/B/bar 6769 * sync 6770 * mv /mnt/B/bar /mnt/A/bar 6771 * mv -T /mnt/A /mnt/B 6772 * fsync /mnt/B/bar 6773 * <power fail> 6774 * 6775 * If we ignore the old parent B which got deleted, 6776 * after a log replay we would have file bar linked 6777 * at both parents and the old parent B would still 6778 * exist. 
6779 */ 6780 if (IS_ERR(dir_inode)) { 6781 ret = PTR_ERR(dir_inode); 6782 goto out; 6783 } 6784 6785 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6786 btrfs_add_delayed_iput(dir_inode); 6787 continue; 6788 } 6789 6790 ctx->log_new_dentries = false; 6791 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6792 LOG_INODE_ALL, ctx); 6793 if (!ret && ctx->log_new_dentries) 6794 ret = log_new_dir_dentries(trans, 6795 BTRFS_I(dir_inode), ctx); 6796 btrfs_add_delayed_iput(dir_inode); 6797 if (ret) 6798 goto out; 6799 } 6800 path->slots[0]++; 6801 } 6802 ret = 0; 6803 out: 6804 btrfs_free_path(path); 6805 return ret; 6806 } 6807 6808 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6809 struct btrfs_root *root, 6810 struct btrfs_path *path, 6811 struct btrfs_log_ctx *ctx) 6812 { 6813 struct btrfs_key found_key; 6814 6815 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6816 6817 while (true) { 6818 struct btrfs_fs_info *fs_info = root->fs_info; 6819 struct extent_buffer *leaf = path->nodes[0]; 6820 int slot = path->slots[0]; 6821 struct btrfs_key search_key; 6822 struct inode *inode; 6823 u64 ino; 6824 int ret = 0; 6825 6826 btrfs_release_path(path); 6827 6828 ino = found_key.offset; 6829 6830 search_key.objectid = found_key.offset; 6831 search_key.type = BTRFS_INODE_ITEM_KEY; 6832 search_key.offset = 0; 6833 inode = btrfs_iget(fs_info->sb, ino, root); 6834 if (IS_ERR(inode)) 6835 return PTR_ERR(inode); 6836 6837 if (BTRFS_I(inode)->generation >= trans->transid && 6838 need_log_inode(trans, BTRFS_I(inode))) 6839 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6840 LOG_INODE_EXISTS, ctx); 6841 btrfs_add_delayed_iput(inode); 6842 if (ret) 6843 return ret; 6844 6845 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6846 break; 6847 6848 search_key.type = BTRFS_INODE_REF_KEY; 6849 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6850 if (ret < 0) 6851 return ret; 6852 6853 leaf = path->nodes[0]; 6854 slot = path->slots[0]; 6855 if (slot >= btrfs_header_nritems(leaf)) { 6856 ret = btrfs_next_leaf(root, path); 6857 if (ret < 0) 6858 return ret; 6859 else if (ret > 0) 6860 return -ENOENT; 6861 leaf = path->nodes[0]; 6862 slot = path->slots[0]; 6863 } 6864 6865 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6866 if (found_key.objectid != search_key.objectid || 6867 found_key.type != BTRFS_INODE_REF_KEY) 6868 return -ENOENT; 6869 } 6870 return 0; 6871 } 6872 6873 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6874 struct btrfs_inode *inode, 6875 struct dentry *parent, 6876 struct btrfs_log_ctx *ctx) 6877 { 6878 struct btrfs_root *root = inode->root; 6879 struct dentry *old_parent = NULL; 6880 struct super_block *sb = inode->vfs_inode.i_sb; 6881 int ret = 0; 6882 6883 while (true) { 6884 if (!parent || d_really_is_negative(parent) || 6885 sb != parent->d_sb) 6886 break; 6887 6888 inode = BTRFS_I(d_inode(parent)); 6889 if (root != inode->root) 6890 break; 6891 6892 if (inode->generation >= trans->transid && 6893 need_log_inode(trans, inode)) { 6894 ret = btrfs_log_inode(trans, inode, 6895 LOG_INODE_EXISTS, ctx); 6896 if (ret) 6897 break; 6898 } 6899 if (IS_ROOT(parent)) 6900 break; 6901 6902 parent = dget_parent(parent); 6903 dput(old_parent); 6904 old_parent = parent; 6905 } 6906 dput(old_parent); 6907 6908 return ret; 6909 } 6910 6911 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6912 struct btrfs_inode *inode, 6913 struct dentry *parent, 6914 struct btrfs_log_ctx *ctx) 6915 { 6916 struct btrfs_root *root = 
inode->root; 6917 const u64 ino = btrfs_ino(inode); 6918 struct btrfs_path *path; 6919 struct btrfs_key search_key; 6920 int ret; 6921 6922 /* 6923 * For a single hard link case, go through a fast path that does not 6924 * need to iterate the fs/subvolume tree. 6925 */ 6926 if (inode->vfs_inode.i_nlink < 2) 6927 return log_new_ancestors_fast(trans, inode, parent, ctx); 6928 6929 path = btrfs_alloc_path(); 6930 if (!path) 6931 return -ENOMEM; 6932 6933 search_key.objectid = ino; 6934 search_key.type = BTRFS_INODE_REF_KEY; 6935 search_key.offset = 0; 6936 again: 6937 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6938 if (ret < 0) 6939 goto out; 6940 if (ret == 0) 6941 path->slots[0]++; 6942 6943 while (true) { 6944 struct extent_buffer *leaf = path->nodes[0]; 6945 int slot = path->slots[0]; 6946 struct btrfs_key found_key; 6947 6948 if (slot >= btrfs_header_nritems(leaf)) { 6949 ret = btrfs_next_leaf(root, path); 6950 if (ret < 0) 6951 goto out; 6952 else if (ret > 0) 6953 break; 6954 continue; 6955 } 6956 6957 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6958 if (found_key.objectid != ino || 6959 found_key.type > BTRFS_INODE_EXTREF_KEY) 6960 break; 6961 6962 /* 6963 * Don't deal with extended references because they are rare 6964 * cases and too complex to deal with (we would need to keep 6965 * track of which subitem we are processing for each item in 6966 * this loop, etc). So just return some error to fallback to 6967 * a transaction commit. 6968 */ 6969 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6970 ret = -EMLINK; 6971 goto out; 6972 } 6973 6974 /* 6975 * Logging ancestors needs to do more searches on the fs/subvol 6976 * tree, so it releases the path as needed to avoid deadlocks. 6977 * Keep track of the last inode ref key and resume from that key 6978 * after logging all new ancestors for the current hard link. 6979 */ 6980 memcpy(&search_key, &found_key, sizeof(search_key)); 6981 6982 ret = log_new_ancestors(trans, root, path, ctx); 6983 if (ret) 6984 goto out; 6985 btrfs_release_path(path); 6986 goto again; 6987 } 6988 ret = 0; 6989 out: 6990 btrfs_free_path(path); 6991 return ret; 6992 } 6993 6994 /* 6995 * helper function around btrfs_log_inode to make sure newly created 6996 * parent directories also end up in the log. A minimal inode and backref 6997 * only logging is done of any parent directories that are older than 6998 * the last committed transaction 6999 */ 7000 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 7001 struct btrfs_inode *inode, 7002 struct dentry *parent, 7003 int inode_only, 7004 struct btrfs_log_ctx *ctx) 7005 { 7006 struct btrfs_root *root = inode->root; 7007 struct btrfs_fs_info *fs_info = root->fs_info; 7008 int ret = 0; 7009 bool log_dentries = false; 7010 7011 if (btrfs_test_opt(fs_info, NOTREELOG)) { 7012 ret = BTRFS_LOG_FORCE_COMMIT; 7013 goto end_no_trans; 7014 } 7015 7016 if (btrfs_root_refs(&root->root_item) == 0) { 7017 ret = BTRFS_LOG_FORCE_COMMIT; 7018 goto end_no_trans; 7019 } 7020 7021 /* 7022 * Skip already logged inodes or inodes corresponding to tmpfiles 7023 * (since logging them is pointless, a link count of 0 means they 7024 * will never be accessible). 
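 *
 * Note that an inode which is already in the log is only skipped when
 * the log context has no ordered extents attached to it - if it does,
 * those extents still need to be logged, so we must not bail out early.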
7025 */ 7026 if ((btrfs_inode_in_log(inode, trans->transid) && 7027 list_empty(&ctx->ordered_extents)) || 7028 inode->vfs_inode.i_nlink == 0) { 7029 ret = BTRFS_NO_LOG_SYNC; 7030 goto end_no_trans; 7031 } 7032 7033 ret = start_log_trans(trans, root, ctx); 7034 if (ret) 7035 goto end_no_trans; 7036 7037 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 7038 if (ret) 7039 goto end_trans; 7040 7041 /* 7042 * for regular files, if its inode is already on disk, we don't 7043 * have to worry about the parents at all. This is because 7044 * we can use the last_unlink_trans field to record renames 7045 * and other fun in this file. 7046 */ 7047 if (S_ISREG(inode->vfs_inode.i_mode) && 7048 inode->generation < trans->transid && 7049 inode->last_unlink_trans < trans->transid) { 7050 ret = 0; 7051 goto end_trans; 7052 } 7053 7054 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 7055 log_dentries = true; 7056 7057 /* 7058 * On unlink we must make sure all our current and old parent directory 7059 * inodes are fully logged. This is to prevent leaving dangling 7060 * directory index entries in directories that were our parents but are 7061 * not anymore. Not doing this results in old parent directory being 7062 * impossible to delete after log replay (rmdir will always fail with 7063 * error -ENOTEMPTY). 7064 * 7065 * Example 1: 7066 * 7067 * mkdir testdir 7068 * touch testdir/foo 7069 * ln testdir/foo testdir/bar 7070 * sync 7071 * unlink testdir/bar 7072 * xfs_io -c fsync testdir/foo 7073 * <power failure> 7074 * mount fs, triggers log replay 7075 * 7076 * If we don't log the parent directory (testdir), after log replay the 7077 * directory still has an entry pointing to the file inode using the bar 7078 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 7079 * the file inode has a link count of 1. 7080 * 7081 * Example 2: 7082 * 7083 * mkdir testdir 7084 * touch foo 7085 * ln foo testdir/foo2 7086 * ln foo testdir/foo3 7087 * sync 7088 * unlink testdir/foo3 7089 * xfs_io -c fsync foo 7090 * <power failure> 7091 * mount fs, triggers log replay 7092 * 7093 * Similar as the first example, after log replay the parent directory 7094 * testdir still has an entry pointing to the inode file with name foo3 7095 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 7096 * and has a link count of 2. 7097 */ 7098 if (inode->last_unlink_trans >= trans->transid) { 7099 ret = btrfs_log_all_parents(trans, inode, ctx); 7100 if (ret) 7101 goto end_trans; 7102 } 7103 7104 ret = log_all_new_ancestors(trans, inode, parent, ctx); 7105 if (ret) 7106 goto end_trans; 7107 7108 if (log_dentries) 7109 ret = log_new_dir_dentries(trans, inode, ctx); 7110 else 7111 ret = 0; 7112 end_trans: 7113 if (ret < 0) { 7114 btrfs_set_log_full_commit(trans); 7115 ret = BTRFS_LOG_FORCE_COMMIT; 7116 } 7117 7118 if (ret) 7119 btrfs_remove_log_ctx(root, ctx); 7120 btrfs_end_log_trans(root); 7121 end_no_trans: 7122 return ret; 7123 } 7124 7125 /* 7126 * it is not safe to log dentry if the chunk root has added new 7127 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 7128 * If this returns 1, you must commit the transaction to safely get your 7129 * data on disk. 
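 *
 * A rough sketch of how the fsync path uses this (heavily simplified,
 * not the actual btrfs_sync_file() code, and ignoring the
 * BTRFS_NO_LOG_SYNC case):
 *
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret)
 *		ret = btrfs_commit_transaction(trans);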
7130 */ 7131 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 7132 struct dentry *dentry, 7133 struct btrfs_log_ctx *ctx) 7134 { 7135 struct dentry *parent = dget_parent(dentry); 7136 int ret; 7137 7138 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 7139 LOG_INODE_ALL, ctx); 7140 dput(parent); 7141 7142 return ret; 7143 } 7144 7145 /* 7146 * should be called during mount to recover any replay any log trees 7147 * from the FS 7148 */ 7149 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 7150 { 7151 int ret; 7152 struct btrfs_path *path; 7153 struct btrfs_trans_handle *trans; 7154 struct btrfs_key key; 7155 struct btrfs_key found_key; 7156 struct btrfs_root *log; 7157 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 7158 struct walk_control wc = { 7159 .process_func = process_one_buffer, 7160 .stage = LOG_WALK_PIN_ONLY, 7161 }; 7162 7163 path = btrfs_alloc_path(); 7164 if (!path) 7165 return -ENOMEM; 7166 7167 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7168 7169 trans = btrfs_start_transaction(fs_info->tree_root, 0); 7170 if (IS_ERR(trans)) { 7171 ret = PTR_ERR(trans); 7172 goto error; 7173 } 7174 7175 wc.trans = trans; 7176 wc.pin = 1; 7177 7178 ret = walk_log_tree(trans, log_root_tree, &wc); 7179 if (ret) { 7180 btrfs_abort_transaction(trans, ret); 7181 goto error; 7182 } 7183 7184 again: 7185 key.objectid = BTRFS_TREE_LOG_OBJECTID; 7186 key.offset = (u64)-1; 7187 key.type = BTRFS_ROOT_ITEM_KEY; 7188 7189 while (1) { 7190 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 7191 7192 if (ret < 0) { 7193 btrfs_abort_transaction(trans, ret); 7194 goto error; 7195 } 7196 if (ret > 0) { 7197 if (path->slots[0] == 0) 7198 break; 7199 path->slots[0]--; 7200 } 7201 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 7202 path->slots[0]); 7203 btrfs_release_path(path); 7204 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 7205 break; 7206 7207 log = btrfs_read_tree_root(log_root_tree, &found_key); 7208 if (IS_ERR(log)) { 7209 ret = PTR_ERR(log); 7210 btrfs_abort_transaction(trans, ret); 7211 goto error; 7212 } 7213 7214 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 7215 true); 7216 if (IS_ERR(wc.replay_dest)) { 7217 ret = PTR_ERR(wc.replay_dest); 7218 7219 /* 7220 * We didn't find the subvol, likely because it was 7221 * deleted. This is ok, simply skip this log and go to 7222 * the next one. 7223 * 7224 * We need to exclude the root because we can't have 7225 * other log replays overwriting this log as we'll read 7226 * it back in a few more times. This will keep our 7227 * block from being modified, and we'll just bail for 7228 * each subsequent pass. 
7229 */ 7230 if (ret == -ENOENT) 7231 ret = btrfs_pin_extent_for_log_replay(trans, 7232 log->node->start, 7233 log->node->len); 7234 btrfs_put_root(log); 7235 7236 if (!ret) 7237 goto next; 7238 btrfs_abort_transaction(trans, ret); 7239 goto error; 7240 } 7241 7242 wc.replay_dest->log_root = log; 7243 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 7244 if (ret) 7245 /* The loop needs to continue due to the root refs */ 7246 btrfs_abort_transaction(trans, ret); 7247 else 7248 ret = walk_log_tree(trans, log, &wc); 7249 7250 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7251 ret = fixup_inode_link_counts(trans, wc.replay_dest, 7252 path); 7253 if (ret) 7254 btrfs_abort_transaction(trans, ret); 7255 } 7256 7257 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 7258 struct btrfs_root *root = wc.replay_dest; 7259 7260 btrfs_release_path(path); 7261 7262 /* 7263 * We have just replayed everything, and the highest 7264 * objectid of fs roots probably has changed in case 7265 * some inode_item's got replayed. 7266 * 7267 * root->objectid_mutex is not acquired as log replay 7268 * could only happen during mount. 7269 */ 7270 ret = btrfs_init_root_free_objectid(root); 7271 if (ret) 7272 btrfs_abort_transaction(trans, ret); 7273 } 7274 7275 wc.replay_dest->log_root = NULL; 7276 btrfs_put_root(wc.replay_dest); 7277 btrfs_put_root(log); 7278 7279 if (ret) 7280 goto error; 7281 next: 7282 if (found_key.offset == 0) 7283 break; 7284 key.offset = found_key.offset - 1; 7285 } 7286 btrfs_release_path(path); 7287 7288 /* step one is to pin it all, step two is to replay just inodes */ 7289 if (wc.pin) { 7290 wc.pin = 0; 7291 wc.process_func = replay_one_buffer; 7292 wc.stage = LOG_WALK_REPLAY_INODES; 7293 goto again; 7294 } 7295 /* step three is to replay everything */ 7296 if (wc.stage < LOG_WALK_REPLAY_ALL) { 7297 wc.stage++; 7298 goto again; 7299 } 7300 7301 btrfs_free_path(path); 7302 7303 /* step 4: commit the transaction, which also unpins the blocks */ 7304 ret = btrfs_commit_transaction(trans); 7305 if (ret) 7306 return ret; 7307 7308 log_root_tree->log_root = NULL; 7309 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7310 btrfs_put_root(log_root_tree); 7311 7312 return 0; 7313 error: 7314 if (wc.trans) 7315 btrfs_end_transaction(wc.trans); 7316 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 7317 btrfs_free_path(path); 7318 return ret; 7319 } 7320 7321 /* 7322 * there are some corner cases where we want to force a full 7323 * commit instead of allowing a directory to be logged. 7324 * 7325 * They revolve around files there were unlinked from the directory, and 7326 * this function updates the parent directory so that a full commit is 7327 * properly done if it is fsync'd later after the unlinks are done. 7328 * 7329 * Must be called before the unlink operations (updates to the subvolume tree, 7330 * inodes, etc) are done. 7331 */ 7332 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 7333 struct btrfs_inode *dir, struct btrfs_inode *inode, 7334 int for_rename) 7335 { 7336 /* 7337 * when we're logging a file, if it hasn't been renamed 7338 * or unlinked, and its inode is fully committed on disk, 7339 * we don't have to worry about walking up the directory chain 7340 * to log its parents. 7341 * 7342 * So, we use the last_unlink_trans field to put this transid 7343 * into the file. When the file is logged we check it and 7344 * don't log the parents if the file is fully on disk. 
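 *
 * See btrfs_log_inode_parent(), which compares last_unlink_trans with
 * the current transaction id to decide whether it must log all the
 * parent directories (btrfs_log_all_parents()).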
7345 */ 7346 mutex_lock(&inode->log_mutex); 7347 inode->last_unlink_trans = trans->transid; 7348 mutex_unlock(&inode->log_mutex); 7349 7350 /* 7351 * if this directory was already logged any new 7352 * names for this file/dir will get recorded 7353 */ 7354 if (dir->logged_trans == trans->transid) 7355 return; 7356 7357 /* 7358 * if the inode we're about to unlink was logged, 7359 * the log will be properly updated for any new names 7360 */ 7361 if (inode->logged_trans == trans->transid) 7362 return; 7363 7364 /* 7365 * when renaming files across directories, if the directory 7366 * there we're unlinking from gets fsync'd later on, there's 7367 * no way to find the destination directory later and fsync it 7368 * properly. So, we have to be conservative and force commits 7369 * so the new name gets discovered. 7370 */ 7371 if (for_rename) 7372 goto record; 7373 7374 /* we can safely do the unlink without any special recording */ 7375 return; 7376 7377 record: 7378 mutex_lock(&dir->log_mutex); 7379 dir->last_unlink_trans = trans->transid; 7380 mutex_unlock(&dir->log_mutex); 7381 } 7382 7383 /* 7384 * Make sure that if someone attempts to fsync the parent directory of a deleted 7385 * snapshot, it ends up triggering a transaction commit. This is to guarantee 7386 * that after replaying the log tree of the parent directory's root we will not 7387 * see the snapshot anymore and at log replay time we will not see any log tree 7388 * corresponding to the deleted snapshot's root, which could lead to replaying 7389 * it after replaying the log tree of the parent directory (which would replay 7390 * the snapshot delete operation). 7391 * 7392 * Must be called before the actual snapshot destroy operation (updates to the 7393 * parent root and tree of tree roots trees, etc) are done. 7394 */ 7395 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 7396 struct btrfs_inode *dir) 7397 { 7398 mutex_lock(&dir->log_mutex); 7399 dir->last_unlink_trans = trans->transid; 7400 mutex_unlock(&dir->log_mutex); 7401 } 7402 7403 /** 7404 * Update the log after adding a new name for an inode. 7405 * 7406 * @trans: Transaction handle. 7407 * @old_dentry: The dentry associated with the old name and the old 7408 * parent directory. 7409 * @old_dir: The inode of the previous parent directory for the case 7410 * of a rename. For a link operation, it must be NULL. 7411 * @old_dir_index: The index number associated with the old name, meaningful 7412 * only for rename operations (when @old_dir is not NULL). 7413 * Ignored for link operations. 7414 * @parent: The dentry associated with the directory under which the 7415 * new name is located. 7416 * 7417 * Call this after adding a new name for an inode, as a result of a link or 7418 * rename operation, and it will properly update the log to reflect the new name. 
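 *
 * For example, the rename path calls this with the old parent directory
 * and the old dir index, so that the removal of the old name is also
 * recorded in the log, while the link path passes a NULL @old_dir since
 * there is no old name to remove.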
7419 */ 7420 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 7421 struct dentry *old_dentry, struct btrfs_inode *old_dir, 7422 u64 old_dir_index, struct dentry *parent) 7423 { 7424 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 7425 struct btrfs_root *root = inode->root; 7426 struct btrfs_log_ctx ctx; 7427 bool log_pinned = false; 7428 int ret; 7429 7430 /* 7431 * this will force the logging code to walk the dentry chain 7432 * up for the file 7433 */ 7434 if (!S_ISDIR(inode->vfs_inode.i_mode)) 7435 inode->last_unlink_trans = trans->transid; 7436 7437 /* 7438 * if this inode hasn't been logged and directory we're renaming it 7439 * from hasn't been logged, we don't need to log it 7440 */ 7441 ret = inode_logged(trans, inode, NULL); 7442 if (ret < 0) { 7443 goto out; 7444 } else if (ret == 0) { 7445 if (!old_dir) 7446 return; 7447 /* 7448 * If the inode was not logged and we are doing a rename (old_dir is not 7449 * NULL), check if old_dir was logged - if it was not we can return and 7450 * do nothing. 7451 */ 7452 ret = inode_logged(trans, old_dir, NULL); 7453 if (ret < 0) 7454 goto out; 7455 else if (ret == 0) 7456 return; 7457 } 7458 ret = 0; 7459 7460 /* 7461 * If we are doing a rename (old_dir is not NULL) from a directory that 7462 * was previously logged, make sure that on log replay we get the old 7463 * dir entry deleted. This is needed because we will also log the new 7464 * name of the renamed inode, so we need to make sure that after log 7465 * replay we don't end up with both the new and old dir entries existing. 7466 */ 7467 if (old_dir && old_dir->logged_trans == trans->transid) { 7468 struct btrfs_root *log = old_dir->root->log_root; 7469 struct btrfs_path *path; 7470 7471 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7472 7473 /* 7474 * We have two inodes to update in the log, the old directory and 7475 * the inode that got renamed, so we must pin the log to prevent 7476 * anyone from syncing the log until we have updated both inodes 7477 * in the log. 7478 */ 7479 ret = join_running_log_trans(root); 7480 /* 7481 * At least one of the inodes was logged before, so this should 7482 * not fail, but if it does, it's not serious, just bail out and 7483 * mark the log for a full commit. 7484 */ 7485 if (WARN_ON_ONCE(ret < 0)) 7486 goto out; 7487 log_pinned = true; 7488 7489 path = btrfs_alloc_path(); 7490 if (!path) { 7491 ret = -ENOMEM; 7492 goto out; 7493 } 7494 7495 /* 7496 * Other concurrent task might be logging the old directory, 7497 * as it can be triggered when logging other inode that had or 7498 * still has a dentry in the old directory. We lock the old 7499 * directory's log_mutex to ensure the deletion of the old 7500 * name is persisted, because during directory logging we 7501 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of 7502 * the old name's dir index item is in the delayed items, so 7503 * it could be missed by an in progress directory logging. 7504 */ 7505 mutex_lock(&old_dir->log_mutex); 7506 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7507 old_dentry->d_name.name, 7508 old_dentry->d_name.len, old_dir_index); 7509 if (ret > 0) { 7510 /* 7511 * The dentry does not exist in the log, so record its 7512 * deletion. 
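 *
 * We do that by inserting a dir log range item covering just the old
 * index (the insert_dir_log_key() call below uses the same value for
 * the start and end of the range), so that log replay knows that index
 * is covered by the log and removes any stale dentry found there.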
7513 */ 7514 btrfs_release_path(path); 7515 ret = insert_dir_log_key(trans, log, path, 7516 btrfs_ino(old_dir), 7517 old_dir_index, old_dir_index); 7518 } 7519 mutex_unlock(&old_dir->log_mutex); 7520 7521 btrfs_free_path(path); 7522 if (ret < 0) 7523 goto out; 7524 } 7525 7526 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7527 ctx.logging_new_name = true; 7528 /* 7529 * We don't care about the return value. If we fail to log the new name 7530 * then we know the next attempt to sync the log will fallback to a full 7531 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7532 * we don't need to worry about getting a log committed that has an 7533 * inconsistent state after a rename operation. 7534 */ 7535 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7536 ASSERT(list_empty(&ctx.conflict_inodes)); 7537 out: 7538 /* 7539 * If an error happened mark the log for a full commit because it's not 7540 * consistent and up to date or we couldn't find out if one of the 7541 * inodes was logged before in this transaction. Do it before unpinning 7542 * the log, to avoid any races with someone else trying to commit it. 7543 */ 7544 if (ret < 0) 7545 btrfs_set_log_full_commit(trans); 7546 if (log_pinned) 7547 btrfs_end_log_trans(root); 7548 } 7549 7550