1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/blkdev.h> 9 #include <linux/list_sort.h> 10 #include <linux/iversion.h> 11 #include "misc.h" 12 #include "ctree.h" 13 #include "tree-log.h" 14 #include "disk-io.h" 15 #include "locking.h" 16 #include "print-tree.h" 17 #include "backref.h" 18 #include "compression.h" 19 #include "qgroup.h" 20 #include "block-group.h" 21 #include "space-info.h" 22 #include "zoned.h" 23 #include "inode-item.h" 24 25 /* magic values for the inode_only field in btrfs_log_inode: 26 * 27 * LOG_INODE_ALL means to log everything 28 * LOG_INODE_EXISTS means to log just enough to recreate the inode 29 * during log replay 30 */ 31 enum { 32 LOG_INODE_ALL, 33 LOG_INODE_EXISTS, 34 LOG_OTHER_INODE, 35 LOG_OTHER_INODE_ALL, 36 }; 37 38 /* 39 * directory trouble cases 40 * 41 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync 42 * log, we must force a full commit before doing an fsync of the directory 43 * where the unlink was done. 44 * ---> record transid of last unlink/rename per directory 45 * 46 * mkdir foo/some_dir 47 * normal commit 48 * rename foo/some_dir foo2/some_dir 49 * mkdir foo/some_dir 50 * fsync foo/some_dir/some_file 51 * 52 * The fsync above will unlink the original some_dir without recording 53 * it in its new location (foo2). After a crash, some_dir will be gone 54 * unless the fsync of some_file forces a full commit 55 * 56 * 2) we must log any new names for any file or dir that is in the fsync 57 * log. ---> check inode while renaming/linking. 58 * 59 * 2a) we must log any new names for any file or dir during rename 60 * when the directory they are being removed from was logged. 61 * ---> check inode and old parent dir during rename 62 * 63 * 2a is actually the more important variant. With the extra logging 64 * a crash might unlink the old name without recreating the new one 65 * 66 * 3) after a crash, we must go through any directories with a link count 67 * of zero and redo the rm -rf 68 * 69 * mkdir f1/foo 70 * normal commit 71 * rm -rf f1/foo 72 * fsync(f1) 73 * 74 * The directory f1 was fully removed from the FS, but fsync was never 75 * called on f1, only its parent dir. After a crash the rm -rf must 76 * be replayed. This must be able to recurse down the entire 77 * directory tree. The inode link count fixup code takes care of the 78 * ugly details. 79 */ 80 81 /* 82 * stages for the tree walking. The first 83 * stage (0) is to only pin down the blocks we find 84 * the second stage (1) is to make sure that all the inodes 85 * we find in the log are created in the subvolume. 86 * 87 * The last stage is to deal with directories and links and extents 88 * and all the other fun semantics 89 */ 90 enum { 91 LOG_WALK_PIN_ONLY, 92 LOG_WALK_REPLAY_INODES, 93 LOG_WALK_REPLAY_DIR_INDEX, 94 LOG_WALK_REPLAY_ALL, 95 }; 96 97 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 98 struct btrfs_inode *inode, 99 int inode_only, 100 struct btrfs_log_ctx *ctx); 101 static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 102 struct btrfs_root *root, 103 struct btrfs_path *path, u64 objectid); 104 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 105 struct btrfs_root *root, 106 struct btrfs_root *log, 107 struct btrfs_path *path, 108 u64 dirid, int del_all); 109 static void wait_log_commit(struct btrfs_root *root, int transid); 110 111 /* 112 * tree logging is a special write ahead log used to make sure that 113 * fsyncs and O_SYNCs can happen without doing full tree commits. 114 * 115 * Full tree commits are expensive because they require commonly 116 * modified blocks to be recowed, creating many dirty pages in the 117 * extent tree an 4x-6x higher write load than ext3. 118 * 119 * Instead of doing a tree commit on every fsync, we use the 120 * key ranges and transaction ids to find items for a given file or directory 121 * that have changed in this transaction. Those items are copied into 122 * a special tree (one per subvolume root), that tree is written to disk 123 * and then the fsync is considered complete. 124 * 125 * After a crash, items are copied out of the log-tree back into the 126 * subvolume tree. Any file data extents found are recorded in the extent 127 * allocation tree, and the log-tree freed. 128 * 129 * The log tree is read three times, once to pin down all the extents it is 130 * using in ram and once, once to create all the inodes logged in the tree 131 * and once to do all the other items. 132 */ 133 134 /* 135 * start a sub transaction and setup the log tree 136 * this increments the log tree writer count to make the people 137 * syncing the tree wait for us to finish 138 */ 139 static int start_log_trans(struct btrfs_trans_handle *trans, 140 struct btrfs_root *root, 141 struct btrfs_log_ctx *ctx) 142 { 143 struct btrfs_fs_info *fs_info = root->fs_info; 144 struct btrfs_root *tree_root = fs_info->tree_root; 145 const bool zoned = btrfs_is_zoned(fs_info); 146 int ret = 0; 147 bool created = false; 148 149 /* 150 * First check if the log root tree was already created. If not, create 151 * it before locking the root's log_mutex, just to keep lockdep happy. 152 */ 153 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { 154 mutex_lock(&tree_root->log_mutex); 155 if (!fs_info->log_root_tree) { 156 ret = btrfs_init_log_root_tree(trans, fs_info); 157 if (!ret) { 158 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); 159 created = true; 160 } 161 } 162 mutex_unlock(&tree_root->log_mutex); 163 if (ret) 164 return ret; 165 } 166 167 mutex_lock(&root->log_mutex); 168 169 again: 170 if (root->log_root) { 171 int index = (root->log_transid + 1) % 2; 172 173 if (btrfs_need_log_full_commit(trans)) { 174 ret = -EAGAIN; 175 goto out; 176 } 177 178 if (zoned && atomic_read(&root->log_commit[index])) { 179 wait_log_commit(root, root->log_transid - 1); 180 goto again; 181 } 182 183 if (!root->log_start_pid) { 184 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 185 root->log_start_pid = current->pid; 186 } else if (root->log_start_pid != current->pid) { 187 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 188 } 189 } else { 190 /* 191 * This means fs_info->log_root_tree was already created 192 * for some other FS trees. Do the full commit not to mix 193 * nodes from multiple log transactions to do sequential 194 * writing. 195 */ 196 if (zoned && !created) { 197 ret = -EAGAIN; 198 goto out; 199 } 200 201 ret = btrfs_add_log_tree(trans, root); 202 if (ret) 203 goto out; 204 205 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 206 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 207 root->log_start_pid = current->pid; 208 } 209 210 atomic_inc(&root->log_writers); 211 if (!ctx->logging_new_name) { 212 int index = root->log_transid % 2; 213 list_add_tail(&ctx->list, &root->log_ctxs[index]); 214 ctx->log_transid = root->log_transid; 215 } 216 217 out: 218 mutex_unlock(&root->log_mutex); 219 return ret; 220 } 221 222 /* 223 * returns 0 if there was a log transaction running and we were able 224 * to join, or returns -ENOENT if there were not transactions 225 * in progress 226 */ 227 static int join_running_log_trans(struct btrfs_root *root) 228 { 229 const bool zoned = btrfs_is_zoned(root->fs_info); 230 int ret = -ENOENT; 231 232 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) 233 return ret; 234 235 mutex_lock(&root->log_mutex); 236 again: 237 if (root->log_root) { 238 int index = (root->log_transid + 1) % 2; 239 240 ret = 0; 241 if (zoned && atomic_read(&root->log_commit[index])) { 242 wait_log_commit(root, root->log_transid - 1); 243 goto again; 244 } 245 atomic_inc(&root->log_writers); 246 } 247 mutex_unlock(&root->log_mutex); 248 return ret; 249 } 250 251 /* 252 * This either makes the current running log transaction wait 253 * until you call btrfs_end_log_trans() or it makes any future 254 * log transactions wait until you call btrfs_end_log_trans() 255 */ 256 void btrfs_pin_log_trans(struct btrfs_root *root) 257 { 258 atomic_inc(&root->log_writers); 259 } 260 261 /* 262 * indicate we're done making changes to the log tree 263 * and wake up anyone waiting to do a sync 264 */ 265 void btrfs_end_log_trans(struct btrfs_root *root) 266 { 267 if (atomic_dec_and_test(&root->log_writers)) { 268 /* atomic_dec_and_test implies a barrier */ 269 cond_wake_up_nomb(&root->log_writer_wait); 270 } 271 } 272 273 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 274 { 275 filemap_fdatawait_range(buf->pages[0]->mapping, 276 buf->start, buf->start + buf->len - 1); 277 } 278 279 /* 280 * the walk control struct is used to pass state down the chain when 281 * processing the log tree. The stage field tells us which part 282 * of the log tree processing we are currently doing. The others 283 * are state fields used for that specific part 284 */ 285 struct walk_control { 286 /* should we free the extent on disk when done? This is used 287 * at transaction commit time while freeing a log tree 288 */ 289 int free; 290 291 /* pin only walk, we record which extents on disk belong to the 292 * log trees 293 */ 294 int pin; 295 296 /* what stage of the replay code we're currently in */ 297 int stage; 298 299 /* 300 * Ignore any items from the inode currently being processed. Needs 301 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in 302 * the LOG_WALK_REPLAY_INODES stage. 303 */ 304 bool ignore_cur_inode; 305 306 /* the root we are currently replaying */ 307 struct btrfs_root *replay_dest; 308 309 /* the trans handle for the current replay */ 310 struct btrfs_trans_handle *trans; 311 312 /* the function that gets used to process blocks we find in the 313 * tree. Note the extent_buffer might not be up to date when it is 314 * passed in, and it must be checked or read if you need the data 315 * inside it 316 */ 317 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 318 struct walk_control *wc, u64 gen, int level); 319 }; 320 321 /* 322 * process_func used to pin down extents, write them or wait on them 323 */ 324 static int process_one_buffer(struct btrfs_root *log, 325 struct extent_buffer *eb, 326 struct walk_control *wc, u64 gen, int level) 327 { 328 struct btrfs_fs_info *fs_info = log->fs_info; 329 int ret = 0; 330 331 /* 332 * If this fs is mixed then we need to be able to process the leaves to 333 * pin down any logged extents, so we have to read the block. 334 */ 335 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 336 ret = btrfs_read_buffer(eb, gen, level, NULL); 337 if (ret) 338 return ret; 339 } 340 341 if (wc->pin) { 342 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, 343 eb->len); 344 if (ret) 345 return ret; 346 347 if (btrfs_buffer_uptodate(eb, gen, 0) && 348 btrfs_header_level(eb) == 0) 349 ret = btrfs_exclude_logged_extents(eb); 350 } 351 return ret; 352 } 353 354 static int do_overwrite_item(struct btrfs_trans_handle *trans, 355 struct btrfs_root *root, 356 struct btrfs_path *path, 357 struct extent_buffer *eb, int slot, 358 struct btrfs_key *key) 359 { 360 int ret; 361 u32 item_size; 362 u64 saved_i_size = 0; 363 int save_old_i_size = 0; 364 unsigned long src_ptr; 365 unsigned long dst_ptr; 366 int overwrite_root = 0; 367 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 368 369 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 370 overwrite_root = 1; 371 372 item_size = btrfs_item_size(eb, slot); 373 src_ptr = btrfs_item_ptr_offset(eb, slot); 374 375 /* Our caller must have done a search for the key for us. */ 376 ASSERT(path->nodes[0] != NULL); 377 378 /* 379 * And the slot must point to the exact key or the slot where the key 380 * should be at (the first item with a key greater than 'key') 381 */ 382 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 383 struct btrfs_key found_key; 384 385 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 386 ret = btrfs_comp_cpu_keys(&found_key, key); 387 ASSERT(ret >= 0); 388 } else { 389 ret = 1; 390 } 391 392 if (ret == 0) { 393 char *src_copy; 394 char *dst_copy; 395 u32 dst_size = btrfs_item_size(path->nodes[0], 396 path->slots[0]); 397 if (dst_size != item_size) 398 goto insert; 399 400 if (item_size == 0) { 401 btrfs_release_path(path); 402 return 0; 403 } 404 dst_copy = kmalloc(item_size, GFP_NOFS); 405 src_copy = kmalloc(item_size, GFP_NOFS); 406 if (!dst_copy || !src_copy) { 407 btrfs_release_path(path); 408 kfree(dst_copy); 409 kfree(src_copy); 410 return -ENOMEM; 411 } 412 413 read_extent_buffer(eb, src_copy, src_ptr, item_size); 414 415 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 416 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 417 item_size); 418 ret = memcmp(dst_copy, src_copy, item_size); 419 420 kfree(dst_copy); 421 kfree(src_copy); 422 /* 423 * they have the same contents, just return, this saves 424 * us from cowing blocks in the destination tree and doing 425 * extra writes that may not have been done by a previous 426 * sync 427 */ 428 if (ret == 0) { 429 btrfs_release_path(path); 430 return 0; 431 } 432 433 /* 434 * We need to load the old nbytes into the inode so when we 435 * replay the extents we've logged we get the right nbytes. 436 */ 437 if (inode_item) { 438 struct btrfs_inode_item *item; 439 u64 nbytes; 440 u32 mode; 441 442 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 443 struct btrfs_inode_item); 444 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 445 item = btrfs_item_ptr(eb, slot, 446 struct btrfs_inode_item); 447 btrfs_set_inode_nbytes(eb, item, nbytes); 448 449 /* 450 * If this is a directory we need to reset the i_size to 451 * 0 so that we can set it up properly when replaying 452 * the rest of the items in this log. 453 */ 454 mode = btrfs_inode_mode(eb, item); 455 if (S_ISDIR(mode)) 456 btrfs_set_inode_size(eb, item, 0); 457 } 458 } else if (inode_item) { 459 struct btrfs_inode_item *item; 460 u32 mode; 461 462 /* 463 * New inode, set nbytes to 0 so that the nbytes comes out 464 * properly when we replay the extents. 465 */ 466 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 467 btrfs_set_inode_nbytes(eb, item, 0); 468 469 /* 470 * If this is a directory we need to reset the i_size to 0 so 471 * that we can set it up properly when replaying the rest of 472 * the items in this log. 473 */ 474 mode = btrfs_inode_mode(eb, item); 475 if (S_ISDIR(mode)) 476 btrfs_set_inode_size(eb, item, 0); 477 } 478 insert: 479 btrfs_release_path(path); 480 /* try to insert the key into the destination tree */ 481 path->skip_release_on_error = 1; 482 ret = btrfs_insert_empty_item(trans, root, path, 483 key, item_size); 484 path->skip_release_on_error = 0; 485 486 /* make sure any existing item is the correct size */ 487 if (ret == -EEXIST || ret == -EOVERFLOW) { 488 u32 found_size; 489 found_size = btrfs_item_size(path->nodes[0], 490 path->slots[0]); 491 if (found_size > item_size) 492 btrfs_truncate_item(path, item_size, 1); 493 else if (found_size < item_size) 494 btrfs_extend_item(path, item_size - found_size); 495 } else if (ret) { 496 return ret; 497 } 498 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 499 path->slots[0]); 500 501 /* don't overwrite an existing inode if the generation number 502 * was logged as zero. This is done when the tree logging code 503 * is just logging an inode to make sure it exists after recovery. 504 * 505 * Also, don't overwrite i_size on directories during replay. 506 * log replay inserts and removes directory items based on the 507 * state of the tree found in the subvolume, and i_size is modified 508 * as it goes 509 */ 510 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 511 struct btrfs_inode_item *src_item; 512 struct btrfs_inode_item *dst_item; 513 514 src_item = (struct btrfs_inode_item *)src_ptr; 515 dst_item = (struct btrfs_inode_item *)dst_ptr; 516 517 if (btrfs_inode_generation(eb, src_item) == 0) { 518 struct extent_buffer *dst_eb = path->nodes[0]; 519 const u64 ino_size = btrfs_inode_size(eb, src_item); 520 521 /* 522 * For regular files an ino_size == 0 is used only when 523 * logging that an inode exists, as part of a directory 524 * fsync, and the inode wasn't fsynced before. In this 525 * case don't set the size of the inode in the fs/subvol 526 * tree, otherwise we would be throwing valid data away. 527 */ 528 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 529 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 530 ino_size != 0) 531 btrfs_set_inode_size(dst_eb, dst_item, ino_size); 532 goto no_copy; 533 } 534 535 if (overwrite_root && 536 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 537 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 538 save_old_i_size = 1; 539 saved_i_size = btrfs_inode_size(path->nodes[0], 540 dst_item); 541 } 542 } 543 544 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 545 src_ptr, item_size); 546 547 if (save_old_i_size) { 548 struct btrfs_inode_item *dst_item; 549 dst_item = (struct btrfs_inode_item *)dst_ptr; 550 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 551 } 552 553 /* make sure the generation is filled in */ 554 if (key->type == BTRFS_INODE_ITEM_KEY) { 555 struct btrfs_inode_item *dst_item; 556 dst_item = (struct btrfs_inode_item *)dst_ptr; 557 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 558 btrfs_set_inode_generation(path->nodes[0], dst_item, 559 trans->transid); 560 } 561 } 562 no_copy: 563 btrfs_mark_buffer_dirty(path->nodes[0]); 564 btrfs_release_path(path); 565 return 0; 566 } 567 568 /* 569 * Item overwrite used by replay and tree logging. eb, slot and key all refer 570 * to the src data we are copying out. 571 * 572 * root is the tree we are copying into, and path is a scratch 573 * path for use in this function (it should be released on entry and 574 * will be released on exit). 575 * 576 * If the key is already in the destination tree the existing item is 577 * overwritten. If the existing item isn't big enough, it is extended. 578 * If it is too large, it is truncated. 579 * 580 * If the key isn't in the destination yet, a new item is inserted. 581 */ 582 static int overwrite_item(struct btrfs_trans_handle *trans, 583 struct btrfs_root *root, 584 struct btrfs_path *path, 585 struct extent_buffer *eb, int slot, 586 struct btrfs_key *key) 587 { 588 int ret; 589 590 /* Look for the key in the destination tree. */ 591 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 592 if (ret < 0) 593 return ret; 594 595 return do_overwrite_item(trans, root, path, eb, slot, key); 596 } 597 598 /* 599 * simple helper to read an inode off the disk from a given root 600 * This can only be called for subvolume roots and not for the log 601 */ 602 static noinline struct inode *read_one_inode(struct btrfs_root *root, 603 u64 objectid) 604 { 605 struct inode *inode; 606 607 inode = btrfs_iget(root->fs_info->sb, objectid, root); 608 if (IS_ERR(inode)) 609 inode = NULL; 610 return inode; 611 } 612 613 /* replays a single extent in 'eb' at 'slot' with 'key' into the 614 * subvolume 'root'. path is released on entry and should be released 615 * on exit. 616 * 617 * extents in the log tree have not been allocated out of the extent 618 * tree yet. So, this completes the allocation, taking a reference 619 * as required if the extent already exists or creating a new extent 620 * if it isn't in the extent allocation tree yet. 621 * 622 * The extent is inserted into the file, dropping any existing extents 623 * from the file that overlap the new one. 624 */ 625 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 626 struct btrfs_root *root, 627 struct btrfs_path *path, 628 struct extent_buffer *eb, int slot, 629 struct btrfs_key *key) 630 { 631 struct btrfs_drop_extents_args drop_args = { 0 }; 632 struct btrfs_fs_info *fs_info = root->fs_info; 633 int found_type; 634 u64 extent_end; 635 u64 start = key->offset; 636 u64 nbytes = 0; 637 struct btrfs_file_extent_item *item; 638 struct inode *inode = NULL; 639 unsigned long size; 640 int ret = 0; 641 642 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 643 found_type = btrfs_file_extent_type(eb, item); 644 645 if (found_type == BTRFS_FILE_EXTENT_REG || 646 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 647 nbytes = btrfs_file_extent_num_bytes(eb, item); 648 extent_end = start + nbytes; 649 650 /* 651 * We don't add to the inodes nbytes if we are prealloc or a 652 * hole. 653 */ 654 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 655 nbytes = 0; 656 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 657 size = btrfs_file_extent_ram_bytes(eb, item); 658 nbytes = btrfs_file_extent_ram_bytes(eb, item); 659 extent_end = ALIGN(start + size, 660 fs_info->sectorsize); 661 } else { 662 ret = 0; 663 goto out; 664 } 665 666 inode = read_one_inode(root, key->objectid); 667 if (!inode) { 668 ret = -EIO; 669 goto out; 670 } 671 672 /* 673 * first check to see if we already have this extent in the 674 * file. This must be done before the btrfs_drop_extents run 675 * so we don't try to drop this extent. 676 */ 677 ret = btrfs_lookup_file_extent(trans, root, path, 678 btrfs_ino(BTRFS_I(inode)), start, 0); 679 680 if (ret == 0 && 681 (found_type == BTRFS_FILE_EXTENT_REG || 682 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 683 struct btrfs_file_extent_item cmp1; 684 struct btrfs_file_extent_item cmp2; 685 struct btrfs_file_extent_item *existing; 686 struct extent_buffer *leaf; 687 688 leaf = path->nodes[0]; 689 existing = btrfs_item_ptr(leaf, path->slots[0], 690 struct btrfs_file_extent_item); 691 692 read_extent_buffer(eb, &cmp1, (unsigned long)item, 693 sizeof(cmp1)); 694 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 695 sizeof(cmp2)); 696 697 /* 698 * we already have a pointer to this exact extent, 699 * we don't have to do anything 700 */ 701 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 702 btrfs_release_path(path); 703 goto out; 704 } 705 } 706 btrfs_release_path(path); 707 708 /* drop any overlapping extents */ 709 drop_args.start = start; 710 drop_args.end = extent_end; 711 drop_args.drop_cache = true; 712 ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); 713 if (ret) 714 goto out; 715 716 if (found_type == BTRFS_FILE_EXTENT_REG || 717 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 718 u64 offset; 719 unsigned long dest_offset; 720 struct btrfs_key ins; 721 722 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 723 btrfs_fs_incompat(fs_info, NO_HOLES)) 724 goto update_inode; 725 726 ret = btrfs_insert_empty_item(trans, root, path, key, 727 sizeof(*item)); 728 if (ret) 729 goto out; 730 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 731 path->slots[0]); 732 copy_extent_buffer(path->nodes[0], eb, dest_offset, 733 (unsigned long)item, sizeof(*item)); 734 735 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 736 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 737 ins.type = BTRFS_EXTENT_ITEM_KEY; 738 offset = key->offset - btrfs_file_extent_offset(eb, item); 739 740 /* 741 * Manually record dirty extent, as here we did a shallow 742 * file extent item copy and skip normal backref update, 743 * but modifying extent tree all by ourselves. 744 * So need to manually record dirty extent for qgroup, 745 * as the owner of the file extent changed from log tree 746 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 747 */ 748 ret = btrfs_qgroup_trace_extent(trans, 749 btrfs_file_extent_disk_bytenr(eb, item), 750 btrfs_file_extent_disk_num_bytes(eb, item), 751 GFP_NOFS); 752 if (ret < 0) 753 goto out; 754 755 if (ins.objectid > 0) { 756 struct btrfs_ref ref = { 0 }; 757 u64 csum_start; 758 u64 csum_end; 759 LIST_HEAD(ordered_sums); 760 761 /* 762 * is this extent already allocated in the extent 763 * allocation tree? If so, just add a reference 764 */ 765 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 766 ins.offset); 767 if (ret < 0) { 768 goto out; 769 } else if (ret == 0) { 770 btrfs_init_generic_ref(&ref, 771 BTRFS_ADD_DELAYED_REF, 772 ins.objectid, ins.offset, 0); 773 btrfs_init_data_ref(&ref, 774 root->root_key.objectid, 775 key->objectid, offset, 0, false); 776 ret = btrfs_inc_extent_ref(trans, &ref); 777 if (ret) 778 goto out; 779 } else { 780 /* 781 * insert the extent pointer in the extent 782 * allocation tree 783 */ 784 ret = btrfs_alloc_logged_file_extent(trans, 785 root->root_key.objectid, 786 key->objectid, offset, &ins); 787 if (ret) 788 goto out; 789 } 790 btrfs_release_path(path); 791 792 if (btrfs_file_extent_compression(eb, item)) { 793 csum_start = ins.objectid; 794 csum_end = csum_start + ins.offset; 795 } else { 796 csum_start = ins.objectid + 797 btrfs_file_extent_offset(eb, item); 798 csum_end = csum_start + 799 btrfs_file_extent_num_bytes(eb, item); 800 } 801 802 ret = btrfs_lookup_csums_range(root->log_root, 803 csum_start, csum_end - 1, 804 &ordered_sums, 0); 805 if (ret) 806 goto out; 807 /* 808 * Now delete all existing cums in the csum root that 809 * cover our range. We do this because we can have an 810 * extent that is completely referenced by one file 811 * extent item and partially referenced by another 812 * file extent item (like after using the clone or 813 * extent_same ioctls). In this case if we end up doing 814 * the replay of the one that partially references the 815 * extent first, and we do not do the csum deletion 816 * below, we can get 2 csum items in the csum tree that 817 * overlap each other. For example, imagine our log has 818 * the two following file extent items: 819 * 820 * key (257 EXTENT_DATA 409600) 821 * extent data disk byte 12845056 nr 102400 822 * extent data offset 20480 nr 20480 ram 102400 823 * 824 * key (257 EXTENT_DATA 819200) 825 * extent data disk byte 12845056 nr 102400 826 * extent data offset 0 nr 102400 ram 102400 827 * 828 * Where the second one fully references the 100K extent 829 * that starts at disk byte 12845056, and the log tree 830 * has a single csum item that covers the entire range 831 * of the extent: 832 * 833 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 834 * 835 * After the first file extent item is replayed, the 836 * csum tree gets the following csum item: 837 * 838 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 839 * 840 * Which covers the 20K sub-range starting at offset 20K 841 * of our extent. Now when we replay the second file 842 * extent item, if we do not delete existing csum items 843 * that cover any of its blocks, we end up getting two 844 * csum items in our csum tree that overlap each other: 845 * 846 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 847 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 848 * 849 * Which is a problem, because after this anyone trying 850 * to lookup up for the checksum of any block of our 851 * extent starting at an offset of 40K or higher, will 852 * end up looking at the second csum item only, which 853 * does not contain the checksum for any block starting 854 * at offset 40K or higher of our extent. 855 */ 856 while (!list_empty(&ordered_sums)) { 857 struct btrfs_ordered_sum *sums; 858 struct btrfs_root *csum_root; 859 860 sums = list_entry(ordered_sums.next, 861 struct btrfs_ordered_sum, 862 list); 863 csum_root = btrfs_csum_root(fs_info, 864 sums->bytenr); 865 if (!ret) 866 ret = btrfs_del_csums(trans, csum_root, 867 sums->bytenr, 868 sums->len); 869 if (!ret) 870 ret = btrfs_csum_file_blocks(trans, 871 csum_root, 872 sums); 873 list_del(&sums->list); 874 kfree(sums); 875 } 876 if (ret) 877 goto out; 878 } else { 879 btrfs_release_path(path); 880 } 881 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 882 /* inline extents are easy, we just overwrite them */ 883 ret = overwrite_item(trans, root, path, eb, slot, key); 884 if (ret) 885 goto out; 886 } 887 888 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, 889 extent_end - start); 890 if (ret) 891 goto out; 892 893 update_inode: 894 btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); 895 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 896 out: 897 if (inode) 898 iput(inode); 899 return ret; 900 } 901 902 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, 903 struct btrfs_inode *dir, 904 struct btrfs_inode *inode, 905 const char *name, 906 int name_len) 907 { 908 int ret; 909 910 ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); 911 if (ret) 912 return ret; 913 /* 914 * Whenever we need to check if a name exists or not, we check the 915 * fs/subvolume tree. So after an unlink we must run delayed items, so 916 * that future checks for a name during log replay see that the name 917 * does not exists anymore. 918 */ 919 return btrfs_run_delayed_items(trans); 920 } 921 922 /* 923 * when cleaning up conflicts between the directory names in the 924 * subvolume, directory names in the log and directory names in the 925 * inode back references, we may have to unlink inodes from directories. 926 * 927 * This is a helper function to do the unlink of a specific directory 928 * item 929 */ 930 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 931 struct btrfs_path *path, 932 struct btrfs_inode *dir, 933 struct btrfs_dir_item *di) 934 { 935 struct btrfs_root *root = dir->root; 936 struct inode *inode; 937 char *name; 938 int name_len; 939 struct extent_buffer *leaf; 940 struct btrfs_key location; 941 int ret; 942 943 leaf = path->nodes[0]; 944 945 btrfs_dir_item_key_to_cpu(leaf, di, &location); 946 name_len = btrfs_dir_name_len(leaf, di); 947 name = kmalloc(name_len, GFP_NOFS); 948 if (!name) 949 return -ENOMEM; 950 951 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 952 btrfs_release_path(path); 953 954 inode = read_one_inode(root, location.objectid); 955 if (!inode) { 956 ret = -EIO; 957 goto out; 958 } 959 960 ret = link_to_fixup_dir(trans, root, path, location.objectid); 961 if (ret) 962 goto out; 963 964 ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, 965 name_len); 966 out: 967 kfree(name); 968 iput(inode); 969 return ret; 970 } 971 972 /* 973 * See if a given name and sequence number found in an inode back reference are 974 * already in a directory and correctly point to this inode. 975 * 976 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it 977 * exists. 978 */ 979 static noinline int inode_in_dir(struct btrfs_root *root, 980 struct btrfs_path *path, 981 u64 dirid, u64 objectid, u64 index, 982 const char *name, int name_len) 983 { 984 struct btrfs_dir_item *di; 985 struct btrfs_key location; 986 int ret = 0; 987 988 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 989 index, name, name_len, 0); 990 if (IS_ERR(di)) { 991 ret = PTR_ERR(di); 992 goto out; 993 } else if (di) { 994 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 995 if (location.objectid != objectid) 996 goto out; 997 } else { 998 goto out; 999 } 1000 1001 btrfs_release_path(path); 1002 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 1003 if (IS_ERR(di)) { 1004 ret = PTR_ERR(di); 1005 goto out; 1006 } else if (di) { 1007 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 1008 if (location.objectid == objectid) 1009 ret = 1; 1010 } 1011 out: 1012 btrfs_release_path(path); 1013 return ret; 1014 } 1015 1016 /* 1017 * helper function to check a log tree for a named back reference in 1018 * an inode. This is used to decide if a back reference that is 1019 * found in the subvolume conflicts with what we find in the log. 1020 * 1021 * inode backreferences may have multiple refs in a single item, 1022 * during replay we process one reference at a time, and we don't 1023 * want to delete valid links to a file from the subvolume if that 1024 * link is also in the log. 1025 */ 1026 static noinline int backref_in_log(struct btrfs_root *log, 1027 struct btrfs_key *key, 1028 u64 ref_objectid, 1029 const char *name, int namelen) 1030 { 1031 struct btrfs_path *path; 1032 int ret; 1033 1034 path = btrfs_alloc_path(); 1035 if (!path) 1036 return -ENOMEM; 1037 1038 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 1039 if (ret < 0) { 1040 goto out; 1041 } else if (ret == 1) { 1042 ret = 0; 1043 goto out; 1044 } 1045 1046 if (key->type == BTRFS_INODE_EXTREF_KEY) 1047 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1048 path->slots[0], 1049 ref_objectid, 1050 name, namelen); 1051 else 1052 ret = !!btrfs_find_name_in_backref(path->nodes[0], 1053 path->slots[0], 1054 name, namelen); 1055 out: 1056 btrfs_free_path(path); 1057 return ret; 1058 } 1059 1060 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 1061 struct btrfs_root *root, 1062 struct btrfs_path *path, 1063 struct btrfs_root *log_root, 1064 struct btrfs_inode *dir, 1065 struct btrfs_inode *inode, 1066 u64 inode_objectid, u64 parent_objectid, 1067 u64 ref_index, char *name, int namelen, 1068 int *search_done) 1069 { 1070 int ret; 1071 char *victim_name; 1072 int victim_name_len; 1073 struct extent_buffer *leaf; 1074 struct btrfs_dir_item *di; 1075 struct btrfs_key search_key; 1076 struct btrfs_inode_extref *extref; 1077 1078 again: 1079 /* Search old style refs */ 1080 search_key.objectid = inode_objectid; 1081 search_key.type = BTRFS_INODE_REF_KEY; 1082 search_key.offset = parent_objectid; 1083 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1084 if (ret == 0) { 1085 struct btrfs_inode_ref *victim_ref; 1086 unsigned long ptr; 1087 unsigned long ptr_end; 1088 1089 leaf = path->nodes[0]; 1090 1091 /* are we trying to overwrite a back ref for the root directory 1092 * if so, just jump out, we're done 1093 */ 1094 if (search_key.objectid == search_key.offset) 1095 return 1; 1096 1097 /* check all the names in this back reference to see 1098 * if they are in the log. if so, we allow them to stay 1099 * otherwise they must be unlinked as a conflict 1100 */ 1101 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1102 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); 1103 while (ptr < ptr_end) { 1104 victim_ref = (struct btrfs_inode_ref *)ptr; 1105 victim_name_len = btrfs_inode_ref_name_len(leaf, 1106 victim_ref); 1107 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1108 if (!victim_name) 1109 return -ENOMEM; 1110 1111 read_extent_buffer(leaf, victim_name, 1112 (unsigned long)(victim_ref + 1), 1113 victim_name_len); 1114 1115 ret = backref_in_log(log_root, &search_key, 1116 parent_objectid, victim_name, 1117 victim_name_len); 1118 if (ret < 0) { 1119 kfree(victim_name); 1120 return ret; 1121 } else if (!ret) { 1122 inc_nlink(&inode->vfs_inode); 1123 btrfs_release_path(path); 1124 1125 ret = unlink_inode_for_log_replay(trans, dir, inode, 1126 victim_name, victim_name_len); 1127 kfree(victim_name); 1128 if (ret) 1129 return ret; 1130 *search_done = 1; 1131 goto again; 1132 } 1133 kfree(victim_name); 1134 1135 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 1136 } 1137 1138 /* 1139 * NOTE: we have searched root tree and checked the 1140 * corresponding ref, it does not need to check again. 1141 */ 1142 *search_done = 1; 1143 } 1144 btrfs_release_path(path); 1145 1146 /* Same search but for extended refs */ 1147 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1148 inode_objectid, parent_objectid, 0, 1149 0); 1150 if (!IS_ERR_OR_NULL(extref)) { 1151 u32 item_size; 1152 u32 cur_offset = 0; 1153 unsigned long base; 1154 struct inode *victim_parent; 1155 1156 leaf = path->nodes[0]; 1157 1158 item_size = btrfs_item_size(leaf, path->slots[0]); 1159 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1160 1161 while (cur_offset < item_size) { 1162 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1163 1164 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1165 1166 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1167 goto next; 1168 1169 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1170 if (!victim_name) 1171 return -ENOMEM; 1172 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1173 victim_name_len); 1174 1175 search_key.objectid = inode_objectid; 1176 search_key.type = BTRFS_INODE_EXTREF_KEY; 1177 search_key.offset = btrfs_extref_hash(parent_objectid, 1178 victim_name, 1179 victim_name_len); 1180 ret = backref_in_log(log_root, &search_key, 1181 parent_objectid, victim_name, 1182 victim_name_len); 1183 if (ret < 0) { 1184 kfree(victim_name); 1185 return ret; 1186 } else if (!ret) { 1187 ret = -ENOENT; 1188 victim_parent = read_one_inode(root, 1189 parent_objectid); 1190 if (victim_parent) { 1191 inc_nlink(&inode->vfs_inode); 1192 btrfs_release_path(path); 1193 1194 ret = unlink_inode_for_log_replay(trans, 1195 BTRFS_I(victim_parent), 1196 inode, 1197 victim_name, 1198 victim_name_len); 1199 } 1200 iput(victim_parent); 1201 kfree(victim_name); 1202 if (ret) 1203 return ret; 1204 *search_done = 1; 1205 goto again; 1206 } 1207 kfree(victim_name); 1208 next: 1209 cur_offset += victim_name_len + sizeof(*extref); 1210 } 1211 *search_done = 1; 1212 } 1213 btrfs_release_path(path); 1214 1215 /* look for a conflicting sequence number */ 1216 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1217 ref_index, name, namelen, 0); 1218 if (IS_ERR(di)) { 1219 return PTR_ERR(di); 1220 } else if (di) { 1221 ret = drop_one_dir_item(trans, path, dir, di); 1222 if (ret) 1223 return ret; 1224 } 1225 btrfs_release_path(path); 1226 1227 /* look for a conflicting name */ 1228 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1229 name, namelen, 0); 1230 if (IS_ERR(di)) { 1231 return PTR_ERR(di); 1232 } else if (di) { 1233 ret = drop_one_dir_item(trans, path, dir, di); 1234 if (ret) 1235 return ret; 1236 } 1237 btrfs_release_path(path); 1238 1239 return 0; 1240 } 1241 1242 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1243 u32 *namelen, char **name, u64 *index, 1244 u64 *parent_objectid) 1245 { 1246 struct btrfs_inode_extref *extref; 1247 1248 extref = (struct btrfs_inode_extref *)ref_ptr; 1249 1250 *namelen = btrfs_inode_extref_name_len(eb, extref); 1251 *name = kmalloc(*namelen, GFP_NOFS); 1252 if (*name == NULL) 1253 return -ENOMEM; 1254 1255 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1256 *namelen); 1257 1258 if (index) 1259 *index = btrfs_inode_extref_index(eb, extref); 1260 if (parent_objectid) 1261 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1262 1263 return 0; 1264 } 1265 1266 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1267 u32 *namelen, char **name, u64 *index) 1268 { 1269 struct btrfs_inode_ref *ref; 1270 1271 ref = (struct btrfs_inode_ref *)ref_ptr; 1272 1273 *namelen = btrfs_inode_ref_name_len(eb, ref); 1274 *name = kmalloc(*namelen, GFP_NOFS); 1275 if (*name == NULL) 1276 return -ENOMEM; 1277 1278 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1279 1280 if (index) 1281 *index = btrfs_inode_ref_index(eb, ref); 1282 1283 return 0; 1284 } 1285 1286 /* 1287 * Take an inode reference item from the log tree and iterate all names from the 1288 * inode reference item in the subvolume tree with the same key (if it exists). 1289 * For any name that is not in the inode reference item from the log tree, do a 1290 * proper unlink of that name (that is, remove its entry from the inode 1291 * reference item and both dir index keys). 1292 */ 1293 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1294 struct btrfs_root *root, 1295 struct btrfs_path *path, 1296 struct btrfs_inode *inode, 1297 struct extent_buffer *log_eb, 1298 int log_slot, 1299 struct btrfs_key *key) 1300 { 1301 int ret; 1302 unsigned long ref_ptr; 1303 unsigned long ref_end; 1304 struct extent_buffer *eb; 1305 1306 again: 1307 btrfs_release_path(path); 1308 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1309 if (ret > 0) { 1310 ret = 0; 1311 goto out; 1312 } 1313 if (ret < 0) 1314 goto out; 1315 1316 eb = path->nodes[0]; 1317 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1318 ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); 1319 while (ref_ptr < ref_end) { 1320 char *name = NULL; 1321 int namelen; 1322 u64 parent_id; 1323 1324 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1325 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1326 NULL, &parent_id); 1327 } else { 1328 parent_id = key->offset; 1329 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1330 NULL); 1331 } 1332 if (ret) 1333 goto out; 1334 1335 if (key->type == BTRFS_INODE_EXTREF_KEY) 1336 ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, 1337 parent_id, name, 1338 namelen); 1339 else 1340 ret = !!btrfs_find_name_in_backref(log_eb, log_slot, 1341 name, namelen); 1342 1343 if (!ret) { 1344 struct inode *dir; 1345 1346 btrfs_release_path(path); 1347 dir = read_one_inode(root, parent_id); 1348 if (!dir) { 1349 ret = -ENOENT; 1350 kfree(name); 1351 goto out; 1352 } 1353 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), 1354 inode, name, namelen); 1355 kfree(name); 1356 iput(dir); 1357 if (ret) 1358 goto out; 1359 goto again; 1360 } 1361 1362 kfree(name); 1363 ref_ptr += namelen; 1364 if (key->type == BTRFS_INODE_EXTREF_KEY) 1365 ref_ptr += sizeof(struct btrfs_inode_extref); 1366 else 1367 ref_ptr += sizeof(struct btrfs_inode_ref); 1368 } 1369 ret = 0; 1370 out: 1371 btrfs_release_path(path); 1372 return ret; 1373 } 1374 1375 static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, 1376 const u8 ref_type, const char *name, 1377 const int namelen) 1378 { 1379 struct btrfs_key key; 1380 struct btrfs_path *path; 1381 const u64 parent_id = btrfs_ino(BTRFS_I(dir)); 1382 int ret; 1383 1384 path = btrfs_alloc_path(); 1385 if (!path) 1386 return -ENOMEM; 1387 1388 key.objectid = btrfs_ino(BTRFS_I(inode)); 1389 key.type = ref_type; 1390 if (key.type == BTRFS_INODE_REF_KEY) 1391 key.offset = parent_id; 1392 else 1393 key.offset = btrfs_extref_hash(parent_id, name, namelen); 1394 1395 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); 1396 if (ret < 0) 1397 goto out; 1398 if (ret > 0) { 1399 ret = 0; 1400 goto out; 1401 } 1402 if (key.type == BTRFS_INODE_EXTREF_KEY) 1403 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1404 path->slots[0], parent_id, name, namelen); 1405 else 1406 ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], 1407 name, namelen); 1408 1409 out: 1410 btrfs_free_path(path); 1411 return ret; 1412 } 1413 1414 static int add_link(struct btrfs_trans_handle *trans, 1415 struct inode *dir, struct inode *inode, const char *name, 1416 int namelen, u64 ref_index) 1417 { 1418 struct btrfs_root *root = BTRFS_I(dir)->root; 1419 struct btrfs_dir_item *dir_item; 1420 struct btrfs_key key; 1421 struct btrfs_path *path; 1422 struct inode *other_inode = NULL; 1423 int ret; 1424 1425 path = btrfs_alloc_path(); 1426 if (!path) 1427 return -ENOMEM; 1428 1429 dir_item = btrfs_lookup_dir_item(NULL, root, path, 1430 btrfs_ino(BTRFS_I(dir)), 1431 name, namelen, 0); 1432 if (!dir_item) { 1433 btrfs_release_path(path); 1434 goto add_link; 1435 } else if (IS_ERR(dir_item)) { 1436 ret = PTR_ERR(dir_item); 1437 goto out; 1438 } 1439 1440 /* 1441 * Our inode's dentry collides with the dentry of another inode which is 1442 * in the log but not yet processed since it has a higher inode number. 1443 * So delete that other dentry. 1444 */ 1445 btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); 1446 btrfs_release_path(path); 1447 other_inode = read_one_inode(root, key.objectid); 1448 if (!other_inode) { 1449 ret = -ENOENT; 1450 goto out; 1451 } 1452 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), 1453 name, namelen); 1454 if (ret) 1455 goto out; 1456 /* 1457 * If we dropped the link count to 0, bump it so that later the iput() 1458 * on the inode will not free it. We will fixup the link count later. 1459 */ 1460 if (other_inode->i_nlink == 0) 1461 inc_nlink(other_inode); 1462 add_link: 1463 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 1464 name, namelen, 0, ref_index); 1465 out: 1466 iput(other_inode); 1467 btrfs_free_path(path); 1468 1469 return ret; 1470 } 1471 1472 /* 1473 * replay one inode back reference item found in the log tree. 1474 * eb, slot and key refer to the buffer and key found in the log tree. 1475 * root is the destination we are replaying into, and path is for temp 1476 * use by this function. (it should be released on return). 1477 */ 1478 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1479 struct btrfs_root *root, 1480 struct btrfs_root *log, 1481 struct btrfs_path *path, 1482 struct extent_buffer *eb, int slot, 1483 struct btrfs_key *key) 1484 { 1485 struct inode *dir = NULL; 1486 struct inode *inode = NULL; 1487 unsigned long ref_ptr; 1488 unsigned long ref_end; 1489 char *name = NULL; 1490 int namelen; 1491 int ret; 1492 int search_done = 0; 1493 int log_ref_ver = 0; 1494 u64 parent_objectid; 1495 u64 inode_objectid; 1496 u64 ref_index = 0; 1497 int ref_struct_size; 1498 1499 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1500 ref_end = ref_ptr + btrfs_item_size(eb, slot); 1501 1502 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1503 struct btrfs_inode_extref *r; 1504 1505 ref_struct_size = sizeof(struct btrfs_inode_extref); 1506 log_ref_ver = 1; 1507 r = (struct btrfs_inode_extref *)ref_ptr; 1508 parent_objectid = btrfs_inode_extref_parent(eb, r); 1509 } else { 1510 ref_struct_size = sizeof(struct btrfs_inode_ref); 1511 parent_objectid = key->offset; 1512 } 1513 inode_objectid = key->objectid; 1514 1515 /* 1516 * it is possible that we didn't log all the parent directories 1517 * for a given inode. If we don't find the dir, just don't 1518 * copy the back ref in. The link count fixup code will take 1519 * care of the rest 1520 */ 1521 dir = read_one_inode(root, parent_objectid); 1522 if (!dir) { 1523 ret = -ENOENT; 1524 goto out; 1525 } 1526 1527 inode = read_one_inode(root, inode_objectid); 1528 if (!inode) { 1529 ret = -EIO; 1530 goto out; 1531 } 1532 1533 while (ref_ptr < ref_end) { 1534 if (log_ref_ver) { 1535 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1536 &ref_index, &parent_objectid); 1537 /* 1538 * parent object can change from one array 1539 * item to another. 1540 */ 1541 if (!dir) 1542 dir = read_one_inode(root, parent_objectid); 1543 if (!dir) { 1544 ret = -ENOENT; 1545 goto out; 1546 } 1547 } else { 1548 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1549 &ref_index); 1550 } 1551 if (ret) 1552 goto out; 1553 1554 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1555 btrfs_ino(BTRFS_I(inode)), ref_index, 1556 name, namelen); 1557 if (ret < 0) { 1558 goto out; 1559 } else if (ret == 0) { 1560 /* 1561 * look for a conflicting back reference in the 1562 * metadata. if we find one we have to unlink that name 1563 * of the file before we add our new link. Later on, we 1564 * overwrite any existing back reference, and we don't 1565 * want to create dangling pointers in the directory. 1566 */ 1567 1568 if (!search_done) { 1569 ret = __add_inode_ref(trans, root, path, log, 1570 BTRFS_I(dir), 1571 BTRFS_I(inode), 1572 inode_objectid, 1573 parent_objectid, 1574 ref_index, name, namelen, 1575 &search_done); 1576 if (ret) { 1577 if (ret == 1) 1578 ret = 0; 1579 goto out; 1580 } 1581 } 1582 1583 /* 1584 * If a reference item already exists for this inode 1585 * with the same parent and name, but different index, 1586 * drop it and the corresponding directory index entries 1587 * from the parent before adding the new reference item 1588 * and dir index entries, otherwise we would fail with 1589 * -EEXIST returned from btrfs_add_link() below. 1590 */ 1591 ret = btrfs_inode_ref_exists(inode, dir, key->type, 1592 name, namelen); 1593 if (ret > 0) { 1594 ret = unlink_inode_for_log_replay(trans, 1595 BTRFS_I(dir), 1596 BTRFS_I(inode), 1597 name, namelen); 1598 /* 1599 * If we dropped the link count to 0, bump it so 1600 * that later the iput() on the inode will not 1601 * free it. We will fixup the link count later. 1602 */ 1603 if (!ret && inode->i_nlink == 0) 1604 inc_nlink(inode); 1605 } 1606 if (ret < 0) 1607 goto out; 1608 1609 /* insert our name */ 1610 ret = add_link(trans, dir, inode, name, namelen, 1611 ref_index); 1612 if (ret) 1613 goto out; 1614 1615 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1616 if (ret) 1617 goto out; 1618 } 1619 /* Else, ret == 1, we already have a perfect match, we're done. */ 1620 1621 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1622 kfree(name); 1623 name = NULL; 1624 if (log_ref_ver) { 1625 iput(dir); 1626 dir = NULL; 1627 } 1628 } 1629 1630 /* 1631 * Before we overwrite the inode reference item in the subvolume tree 1632 * with the item from the log tree, we must unlink all names from the 1633 * parent directory that are in the subvolume's tree inode reference 1634 * item, otherwise we end up with an inconsistent subvolume tree where 1635 * dir index entries exist for a name but there is no inode reference 1636 * item with the same name. 1637 */ 1638 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1639 key); 1640 if (ret) 1641 goto out; 1642 1643 /* finally write the back reference in the inode */ 1644 ret = overwrite_item(trans, root, path, eb, slot, key); 1645 out: 1646 btrfs_release_path(path); 1647 kfree(name); 1648 iput(dir); 1649 iput(inode); 1650 return ret; 1651 } 1652 1653 static int count_inode_extrefs(struct btrfs_root *root, 1654 struct btrfs_inode *inode, struct btrfs_path *path) 1655 { 1656 int ret = 0; 1657 int name_len; 1658 unsigned int nlink = 0; 1659 u32 item_size; 1660 u32 cur_offset = 0; 1661 u64 inode_objectid = btrfs_ino(inode); 1662 u64 offset = 0; 1663 unsigned long ptr; 1664 struct btrfs_inode_extref *extref; 1665 struct extent_buffer *leaf; 1666 1667 while (1) { 1668 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1669 &extref, &offset); 1670 if (ret) 1671 break; 1672 1673 leaf = path->nodes[0]; 1674 item_size = btrfs_item_size(leaf, path->slots[0]); 1675 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1676 cur_offset = 0; 1677 1678 while (cur_offset < item_size) { 1679 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1680 name_len = btrfs_inode_extref_name_len(leaf, extref); 1681 1682 nlink++; 1683 1684 cur_offset += name_len + sizeof(*extref); 1685 } 1686 1687 offset++; 1688 btrfs_release_path(path); 1689 } 1690 btrfs_release_path(path); 1691 1692 if (ret < 0 && ret != -ENOENT) 1693 return ret; 1694 return nlink; 1695 } 1696 1697 static int count_inode_refs(struct btrfs_root *root, 1698 struct btrfs_inode *inode, struct btrfs_path *path) 1699 { 1700 int ret; 1701 struct btrfs_key key; 1702 unsigned int nlink = 0; 1703 unsigned long ptr; 1704 unsigned long ptr_end; 1705 int name_len; 1706 u64 ino = btrfs_ino(inode); 1707 1708 key.objectid = ino; 1709 key.type = BTRFS_INODE_REF_KEY; 1710 key.offset = (u64)-1; 1711 1712 while (1) { 1713 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1714 if (ret < 0) 1715 break; 1716 if (ret > 0) { 1717 if (path->slots[0] == 0) 1718 break; 1719 path->slots[0]--; 1720 } 1721 process_slot: 1722 btrfs_item_key_to_cpu(path->nodes[0], &key, 1723 path->slots[0]); 1724 if (key.objectid != ino || 1725 key.type != BTRFS_INODE_REF_KEY) 1726 break; 1727 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1728 ptr_end = ptr + btrfs_item_size(path->nodes[0], 1729 path->slots[0]); 1730 while (ptr < ptr_end) { 1731 struct btrfs_inode_ref *ref; 1732 1733 ref = (struct btrfs_inode_ref *)ptr; 1734 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1735 ref); 1736 ptr = (unsigned long)(ref + 1) + name_len; 1737 nlink++; 1738 } 1739 1740 if (key.offset == 0) 1741 break; 1742 if (path->slots[0] > 0) { 1743 path->slots[0]--; 1744 goto process_slot; 1745 } 1746 key.offset--; 1747 btrfs_release_path(path); 1748 } 1749 btrfs_release_path(path); 1750 1751 return nlink; 1752 } 1753 1754 /* 1755 * There are a few corners where the link count of the file can't 1756 * be properly maintained during replay. So, instead of adding 1757 * lots of complexity to the log code, we just scan the backrefs 1758 * for any file that has been through replay. 1759 * 1760 * The scan will update the link count on the inode to reflect the 1761 * number of back refs found. If it goes down to zero, the iput 1762 * will free the inode. 1763 */ 1764 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1765 struct btrfs_root *root, 1766 struct inode *inode) 1767 { 1768 struct btrfs_path *path; 1769 int ret; 1770 u64 nlink = 0; 1771 u64 ino = btrfs_ino(BTRFS_I(inode)); 1772 1773 path = btrfs_alloc_path(); 1774 if (!path) 1775 return -ENOMEM; 1776 1777 ret = count_inode_refs(root, BTRFS_I(inode), path); 1778 if (ret < 0) 1779 goto out; 1780 1781 nlink = ret; 1782 1783 ret = count_inode_extrefs(root, BTRFS_I(inode), path); 1784 if (ret < 0) 1785 goto out; 1786 1787 nlink += ret; 1788 1789 ret = 0; 1790 1791 if (nlink != inode->i_nlink) { 1792 set_nlink(inode, nlink); 1793 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1794 if (ret) 1795 goto out; 1796 } 1797 BTRFS_I(inode)->index_cnt = (u64)-1; 1798 1799 if (inode->i_nlink == 0) { 1800 if (S_ISDIR(inode->i_mode)) { 1801 ret = replay_dir_deletes(trans, root, NULL, path, 1802 ino, 1); 1803 if (ret) 1804 goto out; 1805 } 1806 ret = btrfs_insert_orphan_item(trans, root, ino); 1807 if (ret == -EEXIST) 1808 ret = 0; 1809 } 1810 1811 out: 1812 btrfs_free_path(path); 1813 return ret; 1814 } 1815 1816 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1817 struct btrfs_root *root, 1818 struct btrfs_path *path) 1819 { 1820 int ret; 1821 struct btrfs_key key; 1822 struct inode *inode; 1823 1824 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1825 key.type = BTRFS_ORPHAN_ITEM_KEY; 1826 key.offset = (u64)-1; 1827 while (1) { 1828 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1829 if (ret < 0) 1830 break; 1831 1832 if (ret == 1) { 1833 ret = 0; 1834 if (path->slots[0] == 0) 1835 break; 1836 path->slots[0]--; 1837 } 1838 1839 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1840 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1841 key.type != BTRFS_ORPHAN_ITEM_KEY) 1842 break; 1843 1844 ret = btrfs_del_item(trans, root, path); 1845 if (ret) 1846 break; 1847 1848 btrfs_release_path(path); 1849 inode = read_one_inode(root, key.offset); 1850 if (!inode) { 1851 ret = -EIO; 1852 break; 1853 } 1854 1855 ret = fixup_inode_link_count(trans, root, inode); 1856 iput(inode); 1857 if (ret) 1858 break; 1859 1860 /* 1861 * fixup on a directory may create new entries, 1862 * make sure we always look for the highset possible 1863 * offset 1864 */ 1865 key.offset = (u64)-1; 1866 } 1867 btrfs_release_path(path); 1868 return ret; 1869 } 1870 1871 1872 /* 1873 * record a given inode in the fixup dir so we can check its link 1874 * count when replay is done. The link count is incremented here 1875 * so the inode won't go away until we check it 1876 */ 1877 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1878 struct btrfs_root *root, 1879 struct btrfs_path *path, 1880 u64 objectid) 1881 { 1882 struct btrfs_key key; 1883 int ret = 0; 1884 struct inode *inode; 1885 1886 inode = read_one_inode(root, objectid); 1887 if (!inode) 1888 return -EIO; 1889 1890 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1891 key.type = BTRFS_ORPHAN_ITEM_KEY; 1892 key.offset = objectid; 1893 1894 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1895 1896 btrfs_release_path(path); 1897 if (ret == 0) { 1898 if (!inode->i_nlink) 1899 set_nlink(inode, 1); 1900 else 1901 inc_nlink(inode); 1902 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1903 } else if (ret == -EEXIST) { 1904 ret = 0; 1905 } 1906 iput(inode); 1907 1908 return ret; 1909 } 1910 1911 /* 1912 * when replaying the log for a directory, we only insert names 1913 * for inodes that actually exist. This means an fsync on a directory 1914 * does not implicitly fsync all the new files in it 1915 */ 1916 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1917 struct btrfs_root *root, 1918 u64 dirid, u64 index, 1919 char *name, int name_len, 1920 struct btrfs_key *location) 1921 { 1922 struct inode *inode; 1923 struct inode *dir; 1924 int ret; 1925 1926 inode = read_one_inode(root, location->objectid); 1927 if (!inode) 1928 return -ENOENT; 1929 1930 dir = read_one_inode(root, dirid); 1931 if (!dir) { 1932 iput(inode); 1933 return -EIO; 1934 } 1935 1936 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1937 name_len, 1, index); 1938 1939 /* FIXME, put inode into FIXUP list */ 1940 1941 iput(inode); 1942 iput(dir); 1943 return ret; 1944 } 1945 1946 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, 1947 struct btrfs_inode *dir, 1948 struct btrfs_path *path, 1949 struct btrfs_dir_item *dst_di, 1950 const struct btrfs_key *log_key, 1951 u8 log_type, 1952 bool exists) 1953 { 1954 struct btrfs_key found_key; 1955 1956 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1957 /* The existing dentry points to the same inode, don't delete it. */ 1958 if (found_key.objectid == log_key->objectid && 1959 found_key.type == log_key->type && 1960 found_key.offset == log_key->offset && 1961 btrfs_dir_type(path->nodes[0], dst_di) == log_type) 1962 return 1; 1963 1964 /* 1965 * Don't drop the conflicting directory entry if the inode for the new 1966 * entry doesn't exist. 1967 */ 1968 if (!exists) 1969 return 0; 1970 1971 return drop_one_dir_item(trans, path, dir, dst_di); 1972 } 1973 1974 /* 1975 * take a single entry in a log directory item and replay it into 1976 * the subvolume. 1977 * 1978 * if a conflicting item exists in the subdirectory already, 1979 * the inode it points to is unlinked and put into the link count 1980 * fix up tree. 1981 * 1982 * If a name from the log points to a file or directory that does 1983 * not exist in the FS, it is skipped. fsyncs on directories 1984 * do not force down inodes inside that directory, just changes to the 1985 * names or unlinks in a directory. 1986 * 1987 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1988 * non-existing inode) and 1 if the name was replayed. 1989 */ 1990 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1991 struct btrfs_root *root, 1992 struct btrfs_path *path, 1993 struct extent_buffer *eb, 1994 struct btrfs_dir_item *di, 1995 struct btrfs_key *key) 1996 { 1997 char *name; 1998 int name_len; 1999 struct btrfs_dir_item *dir_dst_di; 2000 struct btrfs_dir_item *index_dst_di; 2001 bool dir_dst_matches = false; 2002 bool index_dst_matches = false; 2003 struct btrfs_key log_key; 2004 struct btrfs_key search_key; 2005 struct inode *dir; 2006 u8 log_type; 2007 bool exists; 2008 int ret; 2009 bool update_size = true; 2010 bool name_added = false; 2011 2012 dir = read_one_inode(root, key->objectid); 2013 if (!dir) 2014 return -EIO; 2015 2016 name_len = btrfs_dir_name_len(eb, di); 2017 name = kmalloc(name_len, GFP_NOFS); 2018 if (!name) { 2019 ret = -ENOMEM; 2020 goto out; 2021 } 2022 2023 log_type = btrfs_dir_type(eb, di); 2024 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2025 name_len); 2026 2027 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 2028 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 2029 btrfs_release_path(path); 2030 if (ret < 0) 2031 goto out; 2032 exists = (ret == 0); 2033 ret = 0; 2034 2035 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 2036 name, name_len, 1); 2037 if (IS_ERR(dir_dst_di)) { 2038 ret = PTR_ERR(dir_dst_di); 2039 goto out; 2040 } else if (dir_dst_di) { 2041 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2042 dir_dst_di, &log_key, log_type, 2043 exists); 2044 if (ret < 0) 2045 goto out; 2046 dir_dst_matches = (ret == 1); 2047 } 2048 2049 btrfs_release_path(path); 2050 2051 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 2052 key->objectid, key->offset, 2053 name, name_len, 1); 2054 if (IS_ERR(index_dst_di)) { 2055 ret = PTR_ERR(index_dst_di); 2056 goto out; 2057 } else if (index_dst_di) { 2058 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2059 index_dst_di, &log_key, 2060 log_type, exists); 2061 if (ret < 0) 2062 goto out; 2063 index_dst_matches = (ret == 1); 2064 } 2065 2066 btrfs_release_path(path); 2067 2068 if (dir_dst_matches && index_dst_matches) { 2069 ret = 0; 2070 update_size = false; 2071 goto out; 2072 } 2073 2074 /* 2075 * Check if the inode reference exists in the log for the given name, 2076 * inode and parent inode 2077 */ 2078 search_key.objectid = log_key.objectid; 2079 search_key.type = BTRFS_INODE_REF_KEY; 2080 search_key.offset = key->objectid; 2081 ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); 2082 if (ret < 0) { 2083 goto out; 2084 } else if (ret) { 2085 /* The dentry will be added later. */ 2086 ret = 0; 2087 update_size = false; 2088 goto out; 2089 } 2090 2091 search_key.objectid = log_key.objectid; 2092 search_key.type = BTRFS_INODE_EXTREF_KEY; 2093 search_key.offset = key->objectid; 2094 ret = backref_in_log(root->log_root, &search_key, key->objectid, name, 2095 name_len); 2096 if (ret < 0) { 2097 goto out; 2098 } else if (ret) { 2099 /* The dentry will be added later. */ 2100 ret = 0; 2101 update_size = false; 2102 goto out; 2103 } 2104 btrfs_release_path(path); 2105 ret = insert_one_name(trans, root, key->objectid, key->offset, 2106 name, name_len, &log_key); 2107 if (ret && ret != -ENOENT && ret != -EEXIST) 2108 goto out; 2109 if (!ret) 2110 name_added = true; 2111 update_size = false; 2112 ret = 0; 2113 2114 out: 2115 if (!ret && update_size) { 2116 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 2117 ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); 2118 } 2119 kfree(name); 2120 iput(dir); 2121 if (!ret && name_added) 2122 ret = 1; 2123 return ret; 2124 } 2125 2126 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ 2127 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 2128 struct btrfs_root *root, 2129 struct btrfs_path *path, 2130 struct extent_buffer *eb, int slot, 2131 struct btrfs_key *key) 2132 { 2133 int ret; 2134 struct btrfs_dir_item *di; 2135 2136 /* We only log dir index keys, which only contain a single dir item. */ 2137 ASSERT(key->type == BTRFS_DIR_INDEX_KEY); 2138 2139 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2140 ret = replay_one_name(trans, root, path, eb, di, key); 2141 if (ret < 0) 2142 return ret; 2143 2144 /* 2145 * If this entry refers to a non-directory (directories can not have a 2146 * link count > 1) and it was added in the transaction that was not 2147 * committed, make sure we fixup the link count of the inode the entry 2148 * points to. Otherwise something like the following would result in a 2149 * directory pointing to an inode with a wrong link that does not account 2150 * for this dir entry: 2151 * 2152 * mkdir testdir 2153 * touch testdir/foo 2154 * touch testdir/bar 2155 * sync 2156 * 2157 * ln testdir/bar testdir/bar_link 2158 * ln testdir/foo testdir/foo_link 2159 * xfs_io -c "fsync" testdir/bar 2160 * 2161 * <power failure> 2162 * 2163 * mount fs, log replay happens 2164 * 2165 * File foo would remain with a link count of 1 when it has two entries 2166 * pointing to it in the directory testdir. This would make it impossible 2167 * to ever delete the parent directory has it would result in stale 2168 * dentries that can never be deleted. 2169 */ 2170 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2171 struct btrfs_path *fixup_path; 2172 struct btrfs_key di_key; 2173 2174 fixup_path = btrfs_alloc_path(); 2175 if (!fixup_path) 2176 return -ENOMEM; 2177 2178 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2179 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); 2180 btrfs_free_path(fixup_path); 2181 } 2182 2183 return ret; 2184 } 2185 2186 /* 2187 * directory replay has two parts. There are the standard directory 2188 * items in the log copied from the subvolume, and range items 2189 * created in the log while the subvolume was logged. 2190 * 2191 * The range items tell us which parts of the key space the log 2192 * is authoritative for. During replay, if a key in the subvolume 2193 * directory is in a logged range item, but not actually in the log 2194 * that means it was deleted from the directory before the fsync 2195 * and should be removed. 2196 */ 2197 static noinline int find_dir_range(struct btrfs_root *root, 2198 struct btrfs_path *path, 2199 u64 dirid, 2200 u64 *start_ret, u64 *end_ret) 2201 { 2202 struct btrfs_key key; 2203 u64 found_end; 2204 struct btrfs_dir_log_item *item; 2205 int ret; 2206 int nritems; 2207 2208 if (*start_ret == (u64)-1) 2209 return 1; 2210 2211 key.objectid = dirid; 2212 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2213 key.offset = *start_ret; 2214 2215 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2216 if (ret < 0) 2217 goto out; 2218 if (ret > 0) { 2219 if (path->slots[0] == 0) 2220 goto out; 2221 path->slots[0]--; 2222 } 2223 if (ret != 0) 2224 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2225 2226 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2227 ret = 1; 2228 goto next; 2229 } 2230 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2231 struct btrfs_dir_log_item); 2232 found_end = btrfs_dir_log_end(path->nodes[0], item); 2233 2234 if (*start_ret >= key.offset && *start_ret <= found_end) { 2235 ret = 0; 2236 *start_ret = key.offset; 2237 *end_ret = found_end; 2238 goto out; 2239 } 2240 ret = 1; 2241 next: 2242 /* check the next slot in the tree to see if it is a valid item */ 2243 nritems = btrfs_header_nritems(path->nodes[0]); 2244 path->slots[0]++; 2245 if (path->slots[0] >= nritems) { 2246 ret = btrfs_next_leaf(root, path); 2247 if (ret) 2248 goto out; 2249 } 2250 2251 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2252 2253 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2254 ret = 1; 2255 goto out; 2256 } 2257 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2258 struct btrfs_dir_log_item); 2259 found_end = btrfs_dir_log_end(path->nodes[0], item); 2260 *start_ret = key.offset; 2261 *end_ret = found_end; 2262 ret = 0; 2263 out: 2264 btrfs_release_path(path); 2265 return ret; 2266 } 2267 2268 /* 2269 * this looks for a given directory item in the log. If the directory 2270 * item is not in the log, the item is removed and the inode it points 2271 * to is unlinked 2272 */ 2273 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2274 struct btrfs_root *log, 2275 struct btrfs_path *path, 2276 struct btrfs_path *log_path, 2277 struct inode *dir, 2278 struct btrfs_key *dir_key) 2279 { 2280 struct btrfs_root *root = BTRFS_I(dir)->root; 2281 int ret; 2282 struct extent_buffer *eb; 2283 int slot; 2284 struct btrfs_dir_item *di; 2285 int name_len; 2286 char *name; 2287 struct inode *inode = NULL; 2288 struct btrfs_key location; 2289 2290 /* 2291 * Currenly we only log dir index keys. Even if we replay a log created 2292 * by an older kernel that logged both dir index and dir item keys, all 2293 * we need to do is process the dir index keys, we (and our caller) can 2294 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 2295 */ 2296 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2297 2298 eb = path->nodes[0]; 2299 slot = path->slots[0]; 2300 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2301 name_len = btrfs_dir_name_len(eb, di); 2302 name = kmalloc(name_len, GFP_NOFS); 2303 if (!name) { 2304 ret = -ENOMEM; 2305 goto out; 2306 } 2307 2308 read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); 2309 2310 if (log) { 2311 struct btrfs_dir_item *log_di; 2312 2313 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2314 dir_key->objectid, 2315 dir_key->offset, 2316 name, name_len, 0); 2317 if (IS_ERR(log_di)) { 2318 ret = PTR_ERR(log_di); 2319 goto out; 2320 } else if (log_di) { 2321 /* The dentry exists in the log, we have nothing to do. */ 2322 ret = 0; 2323 goto out; 2324 } 2325 } 2326 2327 btrfs_dir_item_key_to_cpu(eb, di, &location); 2328 btrfs_release_path(path); 2329 btrfs_release_path(log_path); 2330 inode = read_one_inode(root, location.objectid); 2331 if (!inode) { 2332 ret = -EIO; 2333 goto out; 2334 } 2335 2336 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2337 if (ret) 2338 goto out; 2339 2340 inc_nlink(inode); 2341 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2342 name, name_len); 2343 /* 2344 * Unlike dir item keys, dir index keys can only have one name (entry) in 2345 * them, as there are no key collisions since each key has a unique offset 2346 * (an index number), so we're done. 2347 */ 2348 out: 2349 btrfs_release_path(path); 2350 btrfs_release_path(log_path); 2351 kfree(name); 2352 iput(inode); 2353 return ret; 2354 } 2355 2356 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2357 struct btrfs_root *root, 2358 struct btrfs_root *log, 2359 struct btrfs_path *path, 2360 const u64 ino) 2361 { 2362 struct btrfs_key search_key; 2363 struct btrfs_path *log_path; 2364 int i; 2365 int nritems; 2366 int ret; 2367 2368 log_path = btrfs_alloc_path(); 2369 if (!log_path) 2370 return -ENOMEM; 2371 2372 search_key.objectid = ino; 2373 search_key.type = BTRFS_XATTR_ITEM_KEY; 2374 search_key.offset = 0; 2375 again: 2376 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2377 if (ret < 0) 2378 goto out; 2379 process_leaf: 2380 nritems = btrfs_header_nritems(path->nodes[0]); 2381 for (i = path->slots[0]; i < nritems; i++) { 2382 struct btrfs_key key; 2383 struct btrfs_dir_item *di; 2384 struct btrfs_dir_item *log_di; 2385 u32 total_size; 2386 u32 cur; 2387 2388 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2389 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2390 ret = 0; 2391 goto out; 2392 } 2393 2394 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2395 total_size = btrfs_item_size(path->nodes[0], i); 2396 cur = 0; 2397 while (cur < total_size) { 2398 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2399 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2400 u32 this_len = sizeof(*di) + name_len + data_len; 2401 char *name; 2402 2403 name = kmalloc(name_len, GFP_NOFS); 2404 if (!name) { 2405 ret = -ENOMEM; 2406 goto out; 2407 } 2408 read_extent_buffer(path->nodes[0], name, 2409 (unsigned long)(di + 1), name_len); 2410 2411 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2412 name, name_len, 0); 2413 btrfs_release_path(log_path); 2414 if (!log_di) { 2415 /* Doesn't exist in log tree, so delete it. */ 2416 btrfs_release_path(path); 2417 di = btrfs_lookup_xattr(trans, root, path, ino, 2418 name, name_len, -1); 2419 kfree(name); 2420 if (IS_ERR(di)) { 2421 ret = PTR_ERR(di); 2422 goto out; 2423 } 2424 ASSERT(di); 2425 ret = btrfs_delete_one_dir_name(trans, root, 2426 path, di); 2427 if (ret) 2428 goto out; 2429 btrfs_release_path(path); 2430 search_key = key; 2431 goto again; 2432 } 2433 kfree(name); 2434 if (IS_ERR(log_di)) { 2435 ret = PTR_ERR(log_di); 2436 goto out; 2437 } 2438 cur += this_len; 2439 di = (struct btrfs_dir_item *)((char *)di + this_len); 2440 } 2441 } 2442 ret = btrfs_next_leaf(root, path); 2443 if (ret > 0) 2444 ret = 0; 2445 else if (ret == 0) 2446 goto process_leaf; 2447 out: 2448 btrfs_free_path(log_path); 2449 btrfs_release_path(path); 2450 return ret; 2451 } 2452 2453 2454 /* 2455 * deletion replay happens before we copy any new directory items 2456 * out of the log or out of backreferences from inodes. It 2457 * scans the log to find ranges of keys that log is authoritative for, 2458 * and then scans the directory to find items in those ranges that are 2459 * not present in the log. 2460 * 2461 * Anything we don't find in the log is unlinked and removed from the 2462 * directory. 2463 */ 2464 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2465 struct btrfs_root *root, 2466 struct btrfs_root *log, 2467 struct btrfs_path *path, 2468 u64 dirid, int del_all) 2469 { 2470 u64 range_start; 2471 u64 range_end; 2472 int ret = 0; 2473 struct btrfs_key dir_key; 2474 struct btrfs_key found_key; 2475 struct btrfs_path *log_path; 2476 struct inode *dir; 2477 2478 dir_key.objectid = dirid; 2479 dir_key.type = BTRFS_DIR_INDEX_KEY; 2480 log_path = btrfs_alloc_path(); 2481 if (!log_path) 2482 return -ENOMEM; 2483 2484 dir = read_one_inode(root, dirid); 2485 /* it isn't an error if the inode isn't there, that can happen 2486 * because we replay the deletes before we copy in the inode item 2487 * from the log 2488 */ 2489 if (!dir) { 2490 btrfs_free_path(log_path); 2491 return 0; 2492 } 2493 2494 range_start = 0; 2495 range_end = 0; 2496 while (1) { 2497 if (del_all) 2498 range_end = (u64)-1; 2499 else { 2500 ret = find_dir_range(log, path, dirid, 2501 &range_start, &range_end); 2502 if (ret < 0) 2503 goto out; 2504 else if (ret > 0) 2505 break; 2506 } 2507 2508 dir_key.offset = range_start; 2509 while (1) { 2510 int nritems; 2511 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2512 0, 0); 2513 if (ret < 0) 2514 goto out; 2515 2516 nritems = btrfs_header_nritems(path->nodes[0]); 2517 if (path->slots[0] >= nritems) { 2518 ret = btrfs_next_leaf(root, path); 2519 if (ret == 1) 2520 break; 2521 else if (ret < 0) 2522 goto out; 2523 } 2524 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2525 path->slots[0]); 2526 if (found_key.objectid != dirid || 2527 found_key.type != dir_key.type) { 2528 ret = 0; 2529 goto out; 2530 } 2531 2532 if (found_key.offset > range_end) 2533 break; 2534 2535 ret = check_item_in_log(trans, log, path, 2536 log_path, dir, 2537 &found_key); 2538 if (ret) 2539 goto out; 2540 if (found_key.offset == (u64)-1) 2541 break; 2542 dir_key.offset = found_key.offset + 1; 2543 } 2544 btrfs_release_path(path); 2545 if (range_end == (u64)-1) 2546 break; 2547 range_start = range_end + 1; 2548 } 2549 ret = 0; 2550 out: 2551 btrfs_release_path(path); 2552 btrfs_free_path(log_path); 2553 iput(dir); 2554 return ret; 2555 } 2556 2557 /* 2558 * the process_func used to replay items from the log tree. This 2559 * gets called in two different stages. The first stage just looks 2560 * for inodes and makes sure they are all copied into the subvolume. 2561 * 2562 * The second stage copies all the other item types from the log into 2563 * the subvolume. The two stage approach is slower, but gets rid of 2564 * lots of complexity around inodes referencing other inodes that exist 2565 * only in the log (references come from either directory items or inode 2566 * back refs). 2567 */ 2568 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2569 struct walk_control *wc, u64 gen, int level) 2570 { 2571 int nritems; 2572 struct btrfs_path *path; 2573 struct btrfs_root *root = wc->replay_dest; 2574 struct btrfs_key key; 2575 int i; 2576 int ret; 2577 2578 ret = btrfs_read_buffer(eb, gen, level, NULL); 2579 if (ret) 2580 return ret; 2581 2582 level = btrfs_header_level(eb); 2583 2584 if (level != 0) 2585 return 0; 2586 2587 path = btrfs_alloc_path(); 2588 if (!path) 2589 return -ENOMEM; 2590 2591 nritems = btrfs_header_nritems(eb); 2592 for (i = 0; i < nritems; i++) { 2593 btrfs_item_key_to_cpu(eb, &key, i); 2594 2595 /* inode keys are done during the first stage */ 2596 if (key.type == BTRFS_INODE_ITEM_KEY && 2597 wc->stage == LOG_WALK_REPLAY_INODES) { 2598 struct btrfs_inode_item *inode_item; 2599 u32 mode; 2600 2601 inode_item = btrfs_item_ptr(eb, i, 2602 struct btrfs_inode_item); 2603 /* 2604 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2605 * and never got linked before the fsync, skip it, as 2606 * replaying it is pointless since it would be deleted 2607 * later. We skip logging tmpfiles, but it's always 2608 * possible we are replaying a log created with a kernel 2609 * that used to log tmpfiles. 2610 */ 2611 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2612 wc->ignore_cur_inode = true; 2613 continue; 2614 } else { 2615 wc->ignore_cur_inode = false; 2616 } 2617 ret = replay_xattr_deletes(wc->trans, root, log, 2618 path, key.objectid); 2619 if (ret) 2620 break; 2621 mode = btrfs_inode_mode(eb, inode_item); 2622 if (S_ISDIR(mode)) { 2623 ret = replay_dir_deletes(wc->trans, 2624 root, log, path, key.objectid, 0); 2625 if (ret) 2626 break; 2627 } 2628 ret = overwrite_item(wc->trans, root, path, 2629 eb, i, &key); 2630 if (ret) 2631 break; 2632 2633 /* 2634 * Before replaying extents, truncate the inode to its 2635 * size. We need to do it now and not after log replay 2636 * because before an fsync we can have prealloc extents 2637 * added beyond the inode's i_size. If we did it after, 2638 * through orphan cleanup for example, we would drop 2639 * those prealloc extents just after replaying them. 2640 */ 2641 if (S_ISREG(mode)) { 2642 struct btrfs_drop_extents_args drop_args = { 0 }; 2643 struct inode *inode; 2644 u64 from; 2645 2646 inode = read_one_inode(root, key.objectid); 2647 if (!inode) { 2648 ret = -EIO; 2649 break; 2650 } 2651 from = ALIGN(i_size_read(inode), 2652 root->fs_info->sectorsize); 2653 drop_args.start = from; 2654 drop_args.end = (u64)-1; 2655 drop_args.drop_cache = true; 2656 ret = btrfs_drop_extents(wc->trans, root, 2657 BTRFS_I(inode), 2658 &drop_args); 2659 if (!ret) { 2660 inode_sub_bytes(inode, 2661 drop_args.bytes_found); 2662 /* Update the inode's nbytes. */ 2663 ret = btrfs_update_inode(wc->trans, 2664 root, BTRFS_I(inode)); 2665 } 2666 iput(inode); 2667 if (ret) 2668 break; 2669 } 2670 2671 ret = link_to_fixup_dir(wc->trans, root, 2672 path, key.objectid); 2673 if (ret) 2674 break; 2675 } 2676 2677 if (wc->ignore_cur_inode) 2678 continue; 2679 2680 if (key.type == BTRFS_DIR_INDEX_KEY && 2681 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2682 ret = replay_one_dir_item(wc->trans, root, path, 2683 eb, i, &key); 2684 if (ret) 2685 break; 2686 } 2687 2688 if (wc->stage < LOG_WALK_REPLAY_ALL) 2689 continue; 2690 2691 /* these keys are simply copied */ 2692 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2693 ret = overwrite_item(wc->trans, root, path, 2694 eb, i, &key); 2695 if (ret) 2696 break; 2697 } else if (key.type == BTRFS_INODE_REF_KEY || 2698 key.type == BTRFS_INODE_EXTREF_KEY) { 2699 ret = add_inode_ref(wc->trans, root, log, path, 2700 eb, i, &key); 2701 if (ret && ret != -ENOENT) 2702 break; 2703 ret = 0; 2704 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2705 ret = replay_one_extent(wc->trans, root, path, 2706 eb, i, &key); 2707 if (ret) 2708 break; 2709 } 2710 /* 2711 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2712 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2713 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2714 * older kernel with such keys, ignore them. 2715 */ 2716 } 2717 btrfs_free_path(path); 2718 return ret; 2719 } 2720 2721 /* 2722 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2723 */ 2724 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2725 { 2726 struct btrfs_block_group *cache; 2727 2728 cache = btrfs_lookup_block_group(fs_info, start); 2729 if (!cache) { 2730 btrfs_err(fs_info, "unable to find block group for %llu", start); 2731 return; 2732 } 2733 2734 spin_lock(&cache->space_info->lock); 2735 spin_lock(&cache->lock); 2736 cache->reserved -= fs_info->nodesize; 2737 cache->space_info->bytes_reserved -= fs_info->nodesize; 2738 spin_unlock(&cache->lock); 2739 spin_unlock(&cache->space_info->lock); 2740 2741 btrfs_put_block_group(cache); 2742 } 2743 2744 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2745 struct btrfs_root *root, 2746 struct btrfs_path *path, int *level, 2747 struct walk_control *wc) 2748 { 2749 struct btrfs_fs_info *fs_info = root->fs_info; 2750 u64 bytenr; 2751 u64 ptr_gen; 2752 struct extent_buffer *next; 2753 struct extent_buffer *cur; 2754 u32 blocksize; 2755 int ret = 0; 2756 2757 while (*level > 0) { 2758 struct btrfs_key first_key; 2759 2760 cur = path->nodes[*level]; 2761 2762 WARN_ON(btrfs_header_level(cur) != *level); 2763 2764 if (path->slots[*level] >= 2765 btrfs_header_nritems(cur)) 2766 break; 2767 2768 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2769 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2770 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2771 blocksize = fs_info->nodesize; 2772 2773 next = btrfs_find_create_tree_block(fs_info, bytenr, 2774 btrfs_header_owner(cur), 2775 *level - 1); 2776 if (IS_ERR(next)) 2777 return PTR_ERR(next); 2778 2779 if (*level == 1) { 2780 ret = wc->process_func(root, next, wc, ptr_gen, 2781 *level - 1); 2782 if (ret) { 2783 free_extent_buffer(next); 2784 return ret; 2785 } 2786 2787 path->slots[*level]++; 2788 if (wc->free) { 2789 ret = btrfs_read_buffer(next, ptr_gen, 2790 *level - 1, &first_key); 2791 if (ret) { 2792 free_extent_buffer(next); 2793 return ret; 2794 } 2795 2796 if (trans) { 2797 btrfs_tree_lock(next); 2798 btrfs_clean_tree_block(next); 2799 btrfs_wait_tree_block_writeback(next); 2800 btrfs_tree_unlock(next); 2801 ret = btrfs_pin_reserved_extent(trans, 2802 bytenr, blocksize); 2803 if (ret) { 2804 free_extent_buffer(next); 2805 return ret; 2806 } 2807 btrfs_redirty_list_add( 2808 trans->transaction, next); 2809 } else { 2810 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2811 clear_extent_buffer_dirty(next); 2812 unaccount_log_buffer(fs_info, bytenr); 2813 } 2814 } 2815 free_extent_buffer(next); 2816 continue; 2817 } 2818 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2819 if (ret) { 2820 free_extent_buffer(next); 2821 return ret; 2822 } 2823 2824 if (path->nodes[*level-1]) 2825 free_extent_buffer(path->nodes[*level-1]); 2826 path->nodes[*level-1] = next; 2827 *level = btrfs_header_level(next); 2828 path->slots[*level] = 0; 2829 cond_resched(); 2830 } 2831 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2832 2833 cond_resched(); 2834 return 0; 2835 } 2836 2837 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2838 struct btrfs_root *root, 2839 struct btrfs_path *path, int *level, 2840 struct walk_control *wc) 2841 { 2842 struct btrfs_fs_info *fs_info = root->fs_info; 2843 int i; 2844 int slot; 2845 int ret; 2846 2847 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2848 slot = path->slots[i]; 2849 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2850 path->slots[i]++; 2851 *level = i; 2852 WARN_ON(*level == 0); 2853 return 0; 2854 } else { 2855 ret = wc->process_func(root, path->nodes[*level], wc, 2856 btrfs_header_generation(path->nodes[*level]), 2857 *level); 2858 if (ret) 2859 return ret; 2860 2861 if (wc->free) { 2862 struct extent_buffer *next; 2863 2864 next = path->nodes[*level]; 2865 2866 if (trans) { 2867 btrfs_tree_lock(next); 2868 btrfs_clean_tree_block(next); 2869 btrfs_wait_tree_block_writeback(next); 2870 btrfs_tree_unlock(next); 2871 ret = btrfs_pin_reserved_extent(trans, 2872 path->nodes[*level]->start, 2873 path->nodes[*level]->len); 2874 if (ret) 2875 return ret; 2876 btrfs_redirty_list_add(trans->transaction, 2877 next); 2878 } else { 2879 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2880 clear_extent_buffer_dirty(next); 2881 2882 unaccount_log_buffer(fs_info, 2883 path->nodes[*level]->start); 2884 } 2885 } 2886 free_extent_buffer(path->nodes[*level]); 2887 path->nodes[*level] = NULL; 2888 *level = i + 1; 2889 } 2890 } 2891 return 1; 2892 } 2893 2894 /* 2895 * drop the reference count on the tree rooted at 'snap'. This traverses 2896 * the tree freeing any blocks that have a ref count of zero after being 2897 * decremented. 2898 */ 2899 static int walk_log_tree(struct btrfs_trans_handle *trans, 2900 struct btrfs_root *log, struct walk_control *wc) 2901 { 2902 struct btrfs_fs_info *fs_info = log->fs_info; 2903 int ret = 0; 2904 int wret; 2905 int level; 2906 struct btrfs_path *path; 2907 int orig_level; 2908 2909 path = btrfs_alloc_path(); 2910 if (!path) 2911 return -ENOMEM; 2912 2913 level = btrfs_header_level(log->node); 2914 orig_level = level; 2915 path->nodes[level] = log->node; 2916 atomic_inc(&log->node->refs); 2917 path->slots[level] = 0; 2918 2919 while (1) { 2920 wret = walk_down_log_tree(trans, log, path, &level, wc); 2921 if (wret > 0) 2922 break; 2923 if (wret < 0) { 2924 ret = wret; 2925 goto out; 2926 } 2927 2928 wret = walk_up_log_tree(trans, log, path, &level, wc); 2929 if (wret > 0) 2930 break; 2931 if (wret < 0) { 2932 ret = wret; 2933 goto out; 2934 } 2935 } 2936 2937 /* was the root node processed? if not, catch it here */ 2938 if (path->nodes[orig_level]) { 2939 ret = wc->process_func(log, path->nodes[orig_level], wc, 2940 btrfs_header_generation(path->nodes[orig_level]), 2941 orig_level); 2942 if (ret) 2943 goto out; 2944 if (wc->free) { 2945 struct extent_buffer *next; 2946 2947 next = path->nodes[orig_level]; 2948 2949 if (trans) { 2950 btrfs_tree_lock(next); 2951 btrfs_clean_tree_block(next); 2952 btrfs_wait_tree_block_writeback(next); 2953 btrfs_tree_unlock(next); 2954 ret = btrfs_pin_reserved_extent(trans, 2955 next->start, next->len); 2956 if (ret) 2957 goto out; 2958 btrfs_redirty_list_add(trans->transaction, next); 2959 } else { 2960 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2961 clear_extent_buffer_dirty(next); 2962 unaccount_log_buffer(fs_info, next->start); 2963 } 2964 } 2965 } 2966 2967 out: 2968 btrfs_free_path(path); 2969 return ret; 2970 } 2971 2972 /* 2973 * helper function to update the item for a given subvolumes log root 2974 * in the tree of log roots 2975 */ 2976 static int update_log_root(struct btrfs_trans_handle *trans, 2977 struct btrfs_root *log, 2978 struct btrfs_root_item *root_item) 2979 { 2980 struct btrfs_fs_info *fs_info = log->fs_info; 2981 int ret; 2982 2983 if (log->log_transid == 1) { 2984 /* insert root item on the first sync */ 2985 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2986 &log->root_key, root_item); 2987 } else { 2988 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2989 &log->root_key, root_item); 2990 } 2991 return ret; 2992 } 2993 2994 static void wait_log_commit(struct btrfs_root *root, int transid) 2995 { 2996 DEFINE_WAIT(wait); 2997 int index = transid % 2; 2998 2999 /* 3000 * we only allow two pending log transactions at a time, 3001 * so we know that if ours is more than 2 older than the 3002 * current transaction, we're done 3003 */ 3004 for (;;) { 3005 prepare_to_wait(&root->log_commit_wait[index], 3006 &wait, TASK_UNINTERRUPTIBLE); 3007 3008 if (!(root->log_transid_committed < transid && 3009 atomic_read(&root->log_commit[index]))) 3010 break; 3011 3012 mutex_unlock(&root->log_mutex); 3013 schedule(); 3014 mutex_lock(&root->log_mutex); 3015 } 3016 finish_wait(&root->log_commit_wait[index], &wait); 3017 } 3018 3019 static void wait_for_writer(struct btrfs_root *root) 3020 { 3021 DEFINE_WAIT(wait); 3022 3023 for (;;) { 3024 prepare_to_wait(&root->log_writer_wait, &wait, 3025 TASK_UNINTERRUPTIBLE); 3026 if (!atomic_read(&root->log_writers)) 3027 break; 3028 3029 mutex_unlock(&root->log_mutex); 3030 schedule(); 3031 mutex_lock(&root->log_mutex); 3032 } 3033 finish_wait(&root->log_writer_wait, &wait); 3034 } 3035 3036 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 3037 struct btrfs_log_ctx *ctx) 3038 { 3039 mutex_lock(&root->log_mutex); 3040 list_del_init(&ctx->list); 3041 mutex_unlock(&root->log_mutex); 3042 } 3043 3044 /* 3045 * Invoked in log mutex context, or be sure there is no other task which 3046 * can access the list. 3047 */ 3048 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 3049 int index, int error) 3050 { 3051 struct btrfs_log_ctx *ctx; 3052 struct btrfs_log_ctx *safe; 3053 3054 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 3055 list_del_init(&ctx->list); 3056 ctx->log_ret = error; 3057 } 3058 } 3059 3060 /* 3061 * btrfs_sync_log does sends a given tree log down to the disk and 3062 * updates the super blocks to record it. When this call is done, 3063 * you know that any inodes previously logged are safely on disk only 3064 * if it returns 0. 3065 * 3066 * Any other return value means you need to call btrfs_commit_transaction. 3067 * Some of the edge cases for fsyncing directories that have had unlinks 3068 * or renames done in the past mean that sometimes the only safe 3069 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3070 * that has happened. 3071 */ 3072 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3073 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3074 { 3075 int index1; 3076 int index2; 3077 int mark; 3078 int ret; 3079 struct btrfs_fs_info *fs_info = root->fs_info; 3080 struct btrfs_root *log = root->log_root; 3081 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3082 struct btrfs_root_item new_root_item; 3083 int log_transid = 0; 3084 struct btrfs_log_ctx root_log_ctx; 3085 struct blk_plug plug; 3086 u64 log_root_start; 3087 u64 log_root_level; 3088 3089 mutex_lock(&root->log_mutex); 3090 log_transid = ctx->log_transid; 3091 if (root->log_transid_committed >= log_transid) { 3092 mutex_unlock(&root->log_mutex); 3093 return ctx->log_ret; 3094 } 3095 3096 index1 = log_transid % 2; 3097 if (atomic_read(&root->log_commit[index1])) { 3098 wait_log_commit(root, log_transid); 3099 mutex_unlock(&root->log_mutex); 3100 return ctx->log_ret; 3101 } 3102 ASSERT(log_transid == root->log_transid); 3103 atomic_set(&root->log_commit[index1], 1); 3104 3105 /* wait for previous tree log sync to complete */ 3106 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3107 wait_log_commit(root, log_transid - 1); 3108 3109 while (1) { 3110 int batch = atomic_read(&root->log_batch); 3111 /* when we're on an ssd, just kick the log commit out */ 3112 if (!btrfs_test_opt(fs_info, SSD) && 3113 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3114 mutex_unlock(&root->log_mutex); 3115 schedule_timeout_uninterruptible(1); 3116 mutex_lock(&root->log_mutex); 3117 } 3118 wait_for_writer(root); 3119 if (batch == atomic_read(&root->log_batch)) 3120 break; 3121 } 3122 3123 /* bail out if we need to do a full commit */ 3124 if (btrfs_need_log_full_commit(trans)) { 3125 ret = -EAGAIN; 3126 mutex_unlock(&root->log_mutex); 3127 goto out; 3128 } 3129 3130 if (log_transid % 2 == 0) 3131 mark = EXTENT_DIRTY; 3132 else 3133 mark = EXTENT_NEW; 3134 3135 /* we start IO on all the marked extents here, but we don't actually 3136 * wait for them until later. 3137 */ 3138 blk_start_plug(&plug); 3139 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3140 /* 3141 * -EAGAIN happens when someone, e.g., a concurrent transaction 3142 * commit, writes a dirty extent in this tree-log commit. This 3143 * concurrent write will create a hole writing out the extents, 3144 * and we cannot proceed on a zoned filesystem, requiring 3145 * sequential writing. While we can bail out to a full commit 3146 * here, but we can continue hoping the concurrent writing fills 3147 * the hole. 3148 */ 3149 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 3150 ret = 0; 3151 if (ret) { 3152 blk_finish_plug(&plug); 3153 btrfs_abort_transaction(trans, ret); 3154 btrfs_set_log_full_commit(trans); 3155 mutex_unlock(&root->log_mutex); 3156 goto out; 3157 } 3158 3159 /* 3160 * We _must_ update under the root->log_mutex in order to make sure we 3161 * have a consistent view of the log root we are trying to commit at 3162 * this moment. 3163 * 3164 * We _must_ copy this into a local copy, because we are not holding the 3165 * log_root_tree->log_mutex yet. This is important because when we 3166 * commit the log_root_tree we must have a consistent view of the 3167 * log_root_tree when we update the super block to point at the 3168 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3169 * with the commit and possibly point at the new block which we may not 3170 * have written out. 3171 */ 3172 btrfs_set_root_node(&log->root_item, log->node); 3173 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3174 3175 root->log_transid++; 3176 log->log_transid = root->log_transid; 3177 root->log_start_pid = 0; 3178 /* 3179 * IO has been started, blocks of the log tree have WRITTEN flag set 3180 * in their headers. new modifications of the log will be written to 3181 * new positions. so it's safe to allow log writers to go in. 3182 */ 3183 mutex_unlock(&root->log_mutex); 3184 3185 if (btrfs_is_zoned(fs_info)) { 3186 mutex_lock(&fs_info->tree_root->log_mutex); 3187 if (!log_root_tree->node) { 3188 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3189 if (ret) { 3190 mutex_unlock(&fs_info->tree_root->log_mutex); 3191 goto out; 3192 } 3193 } 3194 mutex_unlock(&fs_info->tree_root->log_mutex); 3195 } 3196 3197 btrfs_init_log_ctx(&root_log_ctx, NULL); 3198 3199 mutex_lock(&log_root_tree->log_mutex); 3200 3201 index2 = log_root_tree->log_transid % 2; 3202 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3203 root_log_ctx.log_transid = log_root_tree->log_transid; 3204 3205 /* 3206 * Now we are safe to update the log_root_tree because we're under the 3207 * log_mutex, and we're a current writer so we're holding the commit 3208 * open until we drop the log_mutex. 3209 */ 3210 ret = update_log_root(trans, log, &new_root_item); 3211 if (ret) { 3212 if (!list_empty(&root_log_ctx.list)) 3213 list_del_init(&root_log_ctx.list); 3214 3215 blk_finish_plug(&plug); 3216 btrfs_set_log_full_commit(trans); 3217 3218 if (ret != -ENOSPC) { 3219 btrfs_abort_transaction(trans, ret); 3220 mutex_unlock(&log_root_tree->log_mutex); 3221 goto out; 3222 } 3223 btrfs_wait_tree_log_extents(log, mark); 3224 mutex_unlock(&log_root_tree->log_mutex); 3225 ret = -EAGAIN; 3226 goto out; 3227 } 3228 3229 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3230 blk_finish_plug(&plug); 3231 list_del_init(&root_log_ctx.list); 3232 mutex_unlock(&log_root_tree->log_mutex); 3233 ret = root_log_ctx.log_ret; 3234 goto out; 3235 } 3236 3237 index2 = root_log_ctx.log_transid % 2; 3238 if (atomic_read(&log_root_tree->log_commit[index2])) { 3239 blk_finish_plug(&plug); 3240 ret = btrfs_wait_tree_log_extents(log, mark); 3241 wait_log_commit(log_root_tree, 3242 root_log_ctx.log_transid); 3243 mutex_unlock(&log_root_tree->log_mutex); 3244 if (!ret) 3245 ret = root_log_ctx.log_ret; 3246 goto out; 3247 } 3248 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3249 atomic_set(&log_root_tree->log_commit[index2], 1); 3250 3251 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3252 wait_log_commit(log_root_tree, 3253 root_log_ctx.log_transid - 1); 3254 } 3255 3256 /* 3257 * now that we've moved on to the tree of log tree roots, 3258 * check the full commit flag again 3259 */ 3260 if (btrfs_need_log_full_commit(trans)) { 3261 blk_finish_plug(&plug); 3262 btrfs_wait_tree_log_extents(log, mark); 3263 mutex_unlock(&log_root_tree->log_mutex); 3264 ret = -EAGAIN; 3265 goto out_wake_log_root; 3266 } 3267 3268 ret = btrfs_write_marked_extents(fs_info, 3269 &log_root_tree->dirty_log_pages, 3270 EXTENT_DIRTY | EXTENT_NEW); 3271 blk_finish_plug(&plug); 3272 /* 3273 * As described above, -EAGAIN indicates a hole in the extents. We 3274 * cannot wait for these write outs since the waiting cause a 3275 * deadlock. Bail out to the full commit instead. 3276 */ 3277 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3278 btrfs_set_log_full_commit(trans); 3279 btrfs_wait_tree_log_extents(log, mark); 3280 mutex_unlock(&log_root_tree->log_mutex); 3281 goto out_wake_log_root; 3282 } else if (ret) { 3283 btrfs_set_log_full_commit(trans); 3284 btrfs_abort_transaction(trans, ret); 3285 mutex_unlock(&log_root_tree->log_mutex); 3286 goto out_wake_log_root; 3287 } 3288 ret = btrfs_wait_tree_log_extents(log, mark); 3289 if (!ret) 3290 ret = btrfs_wait_tree_log_extents(log_root_tree, 3291 EXTENT_NEW | EXTENT_DIRTY); 3292 if (ret) { 3293 btrfs_set_log_full_commit(trans); 3294 mutex_unlock(&log_root_tree->log_mutex); 3295 goto out_wake_log_root; 3296 } 3297 3298 log_root_start = log_root_tree->node->start; 3299 log_root_level = btrfs_header_level(log_root_tree->node); 3300 log_root_tree->log_transid++; 3301 mutex_unlock(&log_root_tree->log_mutex); 3302 3303 /* 3304 * Here we are guaranteed that nobody is going to write the superblock 3305 * for the current transaction before us and that neither we do write 3306 * our superblock before the previous transaction finishes its commit 3307 * and writes its superblock, because: 3308 * 3309 * 1) We are holding a handle on the current transaction, so no body 3310 * can commit it until we release the handle; 3311 * 3312 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3313 * if the previous transaction is still committing, and hasn't yet 3314 * written its superblock, we wait for it to do it, because a 3315 * transaction commit acquires the tree_log_mutex when the commit 3316 * begins and releases it only after writing its superblock. 3317 */ 3318 mutex_lock(&fs_info->tree_log_mutex); 3319 3320 /* 3321 * The previous transaction writeout phase could have failed, and thus 3322 * marked the fs in an error state. We must not commit here, as we 3323 * could have updated our generation in the super_for_commit and 3324 * writing the super here would result in transid mismatches. If there 3325 * is an error here just bail. 3326 */ 3327 if (BTRFS_FS_ERROR(fs_info)) { 3328 ret = -EIO; 3329 btrfs_set_log_full_commit(trans); 3330 btrfs_abort_transaction(trans, ret); 3331 mutex_unlock(&fs_info->tree_log_mutex); 3332 goto out_wake_log_root; 3333 } 3334 3335 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3336 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3337 ret = write_all_supers(fs_info, 1); 3338 mutex_unlock(&fs_info->tree_log_mutex); 3339 if (ret) { 3340 btrfs_set_log_full_commit(trans); 3341 btrfs_abort_transaction(trans, ret); 3342 goto out_wake_log_root; 3343 } 3344 3345 /* 3346 * We know there can only be one task here, since we have not yet set 3347 * root->log_commit[index1] to 0 and any task attempting to sync the 3348 * log must wait for the previous log transaction to commit if it's 3349 * still in progress or wait for the current log transaction commit if 3350 * someone else already started it. We use <= and not < because the 3351 * first log transaction has an ID of 0. 3352 */ 3353 ASSERT(root->last_log_commit <= log_transid); 3354 root->last_log_commit = log_transid; 3355 3356 out_wake_log_root: 3357 mutex_lock(&log_root_tree->log_mutex); 3358 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3359 3360 log_root_tree->log_transid_committed++; 3361 atomic_set(&log_root_tree->log_commit[index2], 0); 3362 mutex_unlock(&log_root_tree->log_mutex); 3363 3364 /* 3365 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3366 * all the updates above are seen by the woken threads. It might not be 3367 * necessary, but proving that seems to be hard. 3368 */ 3369 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3370 out: 3371 mutex_lock(&root->log_mutex); 3372 btrfs_remove_all_log_ctxs(root, index1, ret); 3373 root->log_transid_committed++; 3374 atomic_set(&root->log_commit[index1], 0); 3375 mutex_unlock(&root->log_mutex); 3376 3377 /* 3378 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3379 * all the updates above are seen by the woken threads. It might not be 3380 * necessary, but proving that seems to be hard. 3381 */ 3382 cond_wake_up(&root->log_commit_wait[index1]); 3383 return ret; 3384 } 3385 3386 static void free_log_tree(struct btrfs_trans_handle *trans, 3387 struct btrfs_root *log) 3388 { 3389 int ret; 3390 struct walk_control wc = { 3391 .free = 1, 3392 .process_func = process_one_buffer 3393 }; 3394 3395 if (log->node) { 3396 ret = walk_log_tree(trans, log, &wc); 3397 if (ret) { 3398 /* 3399 * We weren't able to traverse the entire log tree, the 3400 * typical scenario is getting an -EIO when reading an 3401 * extent buffer of the tree, due to a previous writeback 3402 * failure of it. 3403 */ 3404 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3405 &log->fs_info->fs_state); 3406 3407 /* 3408 * Some extent buffers of the log tree may still be dirty 3409 * and not yet written back to storage, because we may 3410 * have updates to a log tree without syncing a log tree, 3411 * such as during rename and link operations. So flush 3412 * them out and wait for their writeback to complete, so 3413 * that we properly cleanup their state and pages. 3414 */ 3415 btrfs_write_marked_extents(log->fs_info, 3416 &log->dirty_log_pages, 3417 EXTENT_DIRTY | EXTENT_NEW); 3418 btrfs_wait_tree_log_extents(log, 3419 EXTENT_DIRTY | EXTENT_NEW); 3420 3421 if (trans) 3422 btrfs_abort_transaction(trans, ret); 3423 else 3424 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3425 } 3426 } 3427 3428 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3429 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3430 extent_io_tree_release(&log->log_csum_range); 3431 3432 btrfs_put_root(log); 3433 } 3434 3435 /* 3436 * free all the extents used by the tree log. This should be called 3437 * at commit time of the full transaction 3438 */ 3439 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3440 { 3441 if (root->log_root) { 3442 free_log_tree(trans, root->log_root); 3443 root->log_root = NULL; 3444 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3445 } 3446 return 0; 3447 } 3448 3449 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3450 struct btrfs_fs_info *fs_info) 3451 { 3452 if (fs_info->log_root_tree) { 3453 free_log_tree(trans, fs_info->log_root_tree); 3454 fs_info->log_root_tree = NULL; 3455 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3456 } 3457 return 0; 3458 } 3459 3460 /* 3461 * Check if an inode was logged in the current transaction. This correctly deals 3462 * with the case where the inode was logged but has a logged_trans of 0, which 3463 * happens if the inode is evicted and loaded again, as logged_trans is an in 3464 * memory only field (not persisted). 3465 * 3466 * Returns 1 if the inode was logged before in the transaction, 0 if it was not, 3467 * and < 0 on error. 3468 */ 3469 static int inode_logged(struct btrfs_trans_handle *trans, 3470 struct btrfs_inode *inode, 3471 struct btrfs_path *path_in) 3472 { 3473 struct btrfs_path *path = path_in; 3474 struct btrfs_key key; 3475 int ret; 3476 3477 if (inode->logged_trans == trans->transid) 3478 return 1; 3479 3480 /* 3481 * If logged_trans is not 0, then we know the inode logged was not logged 3482 * in this transaction, so we can return false right away. 3483 */ 3484 if (inode->logged_trans > 0) 3485 return 0; 3486 3487 /* 3488 * If no log tree was created for this root in this transaction, then 3489 * the inode can not have been logged in this transaction. In that case 3490 * set logged_trans to anything greater than 0 and less than the current 3491 * transaction's ID, to avoid the search below in a future call in case 3492 * a log tree gets created after this. 3493 */ 3494 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3495 inode->logged_trans = trans->transid - 1; 3496 return 0; 3497 } 3498 3499 /* 3500 * We have a log tree and the inode's logged_trans is 0. We can't tell 3501 * for sure if the inode was logged before in this transaction by looking 3502 * only at logged_trans. We could be pessimistic and assume it was, but 3503 * that can lead to unnecessarily logging an inode during rename and link 3504 * operations, and then further updating the log in followup rename and 3505 * link operations, specially if it's a directory, which adds latency 3506 * visible to applications doing a series of rename or link operations. 3507 * 3508 * A logged_trans of 0 here can mean several things: 3509 * 3510 * 1) The inode was never logged since the filesystem was mounted, and may 3511 * or may have not been evicted and loaded again; 3512 * 3513 * 2) The inode was logged in a previous transaction, then evicted and 3514 * then loaded again; 3515 * 3516 * 3) The inode was logged in the current transaction, then evicted and 3517 * then loaded again. 3518 * 3519 * For cases 1) and 2) we don't want to return true, but we need to detect 3520 * case 3) and return true. So we do a search in the log root for the inode 3521 * item. 3522 */ 3523 key.objectid = btrfs_ino(inode); 3524 key.type = BTRFS_INODE_ITEM_KEY; 3525 key.offset = 0; 3526 3527 if (!path) { 3528 path = btrfs_alloc_path(); 3529 if (!path) 3530 return -ENOMEM; 3531 } 3532 3533 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3534 3535 if (path_in) 3536 btrfs_release_path(path); 3537 else 3538 btrfs_free_path(path); 3539 3540 /* 3541 * Logging an inode always results in logging its inode item. So if we 3542 * did not find the item we know the inode was not logged for sure. 3543 */ 3544 if (ret < 0) { 3545 return ret; 3546 } else if (ret > 0) { 3547 /* 3548 * Set logged_trans to a value greater than 0 and less then the 3549 * current transaction to avoid doing the search in future calls. 3550 */ 3551 inode->logged_trans = trans->transid - 1; 3552 return 0; 3553 } 3554 3555 /* 3556 * The inode was previously logged and then evicted, set logged_trans to 3557 * the current transacion's ID, to avoid future tree searches as long as 3558 * the inode is not evicted again. 3559 */ 3560 inode->logged_trans = trans->transid; 3561 3562 /* 3563 * If it's a directory, then we must set last_dir_index_offset to the 3564 * maximum possible value, so that the next attempt to log the inode does 3565 * not skip checking if dir index keys found in modified subvolume tree 3566 * leaves have been logged before, otherwise it would result in attempts 3567 * to insert duplicate dir index keys in the log tree. This must be done 3568 * because last_dir_index_offset is an in-memory only field, not persisted 3569 * in the inode item or any other on-disk structure, so its value is lost 3570 * once the inode is evicted. 3571 */ 3572 if (S_ISDIR(inode->vfs_inode.i_mode)) 3573 inode->last_dir_index_offset = (u64)-1; 3574 3575 return 1; 3576 } 3577 3578 /* 3579 * Delete a directory entry from the log if it exists. 3580 * 3581 * Returns < 0 on error 3582 * 1 if the entry does not exists 3583 * 0 if the entry existed and was successfully deleted 3584 */ 3585 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3586 struct btrfs_root *log, 3587 struct btrfs_path *path, 3588 u64 dir_ino, 3589 const char *name, int name_len, 3590 u64 index) 3591 { 3592 struct btrfs_dir_item *di; 3593 3594 /* 3595 * We only log dir index items of a directory, so we don't need to look 3596 * for dir item keys. 3597 */ 3598 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3599 index, name, name_len, -1); 3600 if (IS_ERR(di)) 3601 return PTR_ERR(di); 3602 else if (!di) 3603 return 1; 3604 3605 /* 3606 * We do not need to update the size field of the directory's 3607 * inode item because on log replay we update the field to reflect 3608 * all existing entries in the directory (see overwrite_item()). 3609 */ 3610 return btrfs_delete_one_dir_name(trans, log, path, di); 3611 } 3612 3613 /* 3614 * If both a file and directory are logged, and unlinks or renames are 3615 * mixed in, we have a few interesting corners: 3616 * 3617 * create file X in dir Y 3618 * link file X to X.link in dir Y 3619 * fsync file X 3620 * unlink file X but leave X.link 3621 * fsync dir Y 3622 * 3623 * After a crash we would expect only X.link to exist. But file X 3624 * didn't get fsync'd again so the log has back refs for X and X.link. 3625 * 3626 * We solve this by removing directory entries and inode backrefs from the 3627 * log when a file that was logged in the current transaction is 3628 * unlinked. Any later fsync will include the updated log entries, and 3629 * we'll be able to reconstruct the proper directory items from backrefs. 3630 * 3631 * This optimizations allows us to avoid relogging the entire inode 3632 * or the entire directory. 3633 */ 3634 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3635 struct btrfs_root *root, 3636 const char *name, int name_len, 3637 struct btrfs_inode *dir, u64 index) 3638 { 3639 struct btrfs_path *path; 3640 int ret; 3641 3642 ret = inode_logged(trans, dir, NULL); 3643 if (ret == 0) 3644 return; 3645 else if (ret < 0) { 3646 btrfs_set_log_full_commit(trans); 3647 return; 3648 } 3649 3650 ret = join_running_log_trans(root); 3651 if (ret) 3652 return; 3653 3654 mutex_lock(&dir->log_mutex); 3655 3656 path = btrfs_alloc_path(); 3657 if (!path) { 3658 ret = -ENOMEM; 3659 goto out_unlock; 3660 } 3661 3662 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3663 name, name_len, index); 3664 btrfs_free_path(path); 3665 out_unlock: 3666 mutex_unlock(&dir->log_mutex); 3667 if (ret < 0) 3668 btrfs_set_log_full_commit(trans); 3669 btrfs_end_log_trans(root); 3670 } 3671 3672 /* see comments for btrfs_del_dir_entries_in_log */ 3673 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3674 struct btrfs_root *root, 3675 const char *name, int name_len, 3676 struct btrfs_inode *inode, u64 dirid) 3677 { 3678 struct btrfs_root *log; 3679 u64 index; 3680 int ret; 3681 3682 ret = inode_logged(trans, inode, NULL); 3683 if (ret == 0) 3684 return; 3685 else if (ret < 0) { 3686 btrfs_set_log_full_commit(trans); 3687 return; 3688 } 3689 3690 ret = join_running_log_trans(root); 3691 if (ret) 3692 return; 3693 log = root->log_root; 3694 mutex_lock(&inode->log_mutex); 3695 3696 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3697 dirid, &index); 3698 mutex_unlock(&inode->log_mutex); 3699 if (ret < 0 && ret != -ENOENT) 3700 btrfs_set_log_full_commit(trans); 3701 btrfs_end_log_trans(root); 3702 } 3703 3704 /* 3705 * creates a range item in the log for 'dirid'. first_offset and 3706 * last_offset tell us which parts of the key space the log should 3707 * be considered authoritative for. 3708 */ 3709 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3710 struct btrfs_root *log, 3711 struct btrfs_path *path, 3712 u64 dirid, 3713 u64 first_offset, u64 last_offset) 3714 { 3715 int ret; 3716 struct btrfs_key key; 3717 struct btrfs_dir_log_item *item; 3718 3719 key.objectid = dirid; 3720 key.offset = first_offset; 3721 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3722 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3723 if (ret) 3724 return ret; 3725 3726 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3727 struct btrfs_dir_log_item); 3728 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3729 btrfs_mark_buffer_dirty(path->nodes[0]); 3730 btrfs_release_path(path); 3731 return 0; 3732 } 3733 3734 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3735 struct btrfs_root *log, 3736 struct extent_buffer *src, 3737 struct btrfs_path *dst_path, 3738 int start_slot, 3739 int count) 3740 { 3741 char *ins_data = NULL; 3742 struct btrfs_item_batch batch; 3743 struct extent_buffer *dst; 3744 unsigned long src_offset; 3745 unsigned long dst_offset; 3746 struct btrfs_key key; 3747 u32 item_size; 3748 int ret; 3749 int i; 3750 3751 ASSERT(count > 0); 3752 batch.nr = count; 3753 3754 if (count == 1) { 3755 btrfs_item_key_to_cpu(src, &key, start_slot); 3756 item_size = btrfs_item_size(src, start_slot); 3757 batch.keys = &key; 3758 batch.data_sizes = &item_size; 3759 batch.total_data_size = item_size; 3760 } else { 3761 struct btrfs_key *ins_keys; 3762 u32 *ins_sizes; 3763 3764 ins_data = kmalloc(count * sizeof(u32) + 3765 count * sizeof(struct btrfs_key), GFP_NOFS); 3766 if (!ins_data) 3767 return -ENOMEM; 3768 3769 ins_sizes = (u32 *)ins_data; 3770 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3771 batch.keys = ins_keys; 3772 batch.data_sizes = ins_sizes; 3773 batch.total_data_size = 0; 3774 3775 for (i = 0; i < count; i++) { 3776 const int slot = start_slot + i; 3777 3778 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3779 ins_sizes[i] = btrfs_item_size(src, slot); 3780 batch.total_data_size += ins_sizes[i]; 3781 } 3782 } 3783 3784 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3785 if (ret) 3786 goto out; 3787 3788 dst = dst_path->nodes[0]; 3789 /* 3790 * Copy all the items in bulk, in a single copy operation. Item data is 3791 * organized such that it's placed at the end of a leaf and from right 3792 * to left. For example, the data for the second item ends at an offset 3793 * that matches the offset where the data for the first item starts, the 3794 * data for the third item ends at an offset that matches the offset 3795 * where the data of the second items starts, and so on. 3796 * Therefore our source and destination start offsets for copy match the 3797 * offsets of the last items (highest slots). 3798 */ 3799 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3800 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3801 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3802 btrfs_release_path(dst_path); 3803 out: 3804 kfree(ins_data); 3805 3806 return ret; 3807 } 3808 3809 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3810 struct btrfs_inode *inode, 3811 struct btrfs_path *path, 3812 struct btrfs_path *dst_path, 3813 struct btrfs_log_ctx *ctx, 3814 u64 *last_old_dentry_offset) 3815 { 3816 struct btrfs_root *log = inode->root->log_root; 3817 struct extent_buffer *src = path->nodes[0]; 3818 const int nritems = btrfs_header_nritems(src); 3819 const u64 ino = btrfs_ino(inode); 3820 bool last_found = false; 3821 int batch_start = 0; 3822 int batch_size = 0; 3823 int i; 3824 3825 for (i = path->slots[0]; i < nritems; i++) { 3826 struct btrfs_dir_item *di; 3827 struct btrfs_key key; 3828 int ret; 3829 3830 btrfs_item_key_to_cpu(src, &key, i); 3831 3832 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3833 last_found = true; 3834 break; 3835 } 3836 3837 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3838 ctx->last_dir_item_offset = key.offset; 3839 3840 /* 3841 * Skip ranges of items that consist only of dir item keys created 3842 * in past transactions. However if we find a gap, we must log a 3843 * dir index range item for that gap, so that index keys in that 3844 * gap are deleted during log replay. 3845 */ 3846 if (btrfs_dir_transid(src, di) < trans->transid) { 3847 if (key.offset > *last_old_dentry_offset + 1) { 3848 ret = insert_dir_log_key(trans, log, dst_path, 3849 ino, *last_old_dentry_offset + 1, 3850 key.offset - 1); 3851 /* 3852 * -EEXIST should never happen because when we 3853 * log a directory in full mode (LOG_INODE_ALL) 3854 * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from 3855 * the log tree. 3856 */ 3857 ASSERT(ret != -EEXIST); 3858 if (ret < 0) 3859 return ret; 3860 } 3861 3862 *last_old_dentry_offset = key.offset; 3863 continue; 3864 } 3865 /* 3866 * We must make sure that when we log a directory entry, the 3867 * corresponding inode, after log replay, has a matching link 3868 * count. For example: 3869 * 3870 * touch foo 3871 * mkdir mydir 3872 * sync 3873 * ln foo mydir/bar 3874 * xfs_io -c "fsync" mydir 3875 * <crash> 3876 * <mount fs and log replay> 3877 * 3878 * Would result in a fsync log that when replayed, our file inode 3879 * would have a link count of 1, but we get two directory entries 3880 * pointing to the same inode. After removing one of the names, 3881 * it would not be possible to remove the other name, which 3882 * resulted always in stale file handle errors, and would not be 3883 * possible to rmdir the parent directory, since its i_size could 3884 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3885 * resulting in -ENOTEMPTY errors. 3886 */ 3887 if (!ctx->log_new_dentries) { 3888 struct btrfs_key di_key; 3889 3890 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3891 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3892 ctx->log_new_dentries = true; 3893 } 3894 3895 if (!ctx->logged_before) 3896 goto add_to_batch; 3897 3898 /* 3899 * If we were logged before and have logged dir items, we can skip 3900 * checking if any item with a key offset larger than the last one 3901 * we logged is in the log tree, saving time and avoiding adding 3902 * contention on the log tree. We can only rely on the value of 3903 * last_dir_index_offset when we know for sure that the inode was 3904 * previously logged in the current transaction. 3905 */ 3906 if (key.offset > inode->last_dir_index_offset) 3907 goto add_to_batch; 3908 /* 3909 * Check if the key was already logged before. If not we can add 3910 * it to a batch for bulk insertion. 3911 */ 3912 ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); 3913 if (ret < 0) { 3914 return ret; 3915 } else if (ret > 0) { 3916 btrfs_release_path(dst_path); 3917 goto add_to_batch; 3918 } 3919 3920 /* 3921 * Item exists in the log. Overwrite the item in the log if it 3922 * has different content or do nothing if it has exactly the same 3923 * content. And then flush the current batch if any - do it after 3924 * overwriting the current item, or we would deadlock otherwise, 3925 * since we are holding a path for the existing item. 3926 */ 3927 ret = do_overwrite_item(trans, log, dst_path, src, i, &key); 3928 if (ret < 0) 3929 return ret; 3930 3931 if (batch_size > 0) { 3932 ret = flush_dir_items_batch(trans, log, src, dst_path, 3933 batch_start, batch_size); 3934 if (ret < 0) 3935 return ret; 3936 batch_size = 0; 3937 } 3938 continue; 3939 add_to_batch: 3940 if (batch_size == 0) 3941 batch_start = i; 3942 batch_size++; 3943 } 3944 3945 if (batch_size > 0) { 3946 int ret; 3947 3948 ret = flush_dir_items_batch(trans, log, src, dst_path, 3949 batch_start, batch_size); 3950 if (ret < 0) 3951 return ret; 3952 } 3953 3954 return last_found ? 1 : 0; 3955 } 3956 3957 /* 3958 * log all the items included in the current transaction for a given 3959 * directory. This also creates the range items in the log tree required 3960 * to replay anything deleted before the fsync 3961 */ 3962 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3963 struct btrfs_inode *inode, 3964 struct btrfs_path *path, 3965 struct btrfs_path *dst_path, 3966 struct btrfs_log_ctx *ctx, 3967 u64 min_offset, u64 *last_offset_ret) 3968 { 3969 struct btrfs_key min_key; 3970 struct btrfs_root *root = inode->root; 3971 struct btrfs_root *log = root->log_root; 3972 int err = 0; 3973 int ret; 3974 u64 last_old_dentry_offset = min_offset - 1; 3975 u64 last_offset = (u64)-1; 3976 u64 ino = btrfs_ino(inode); 3977 3978 min_key.objectid = ino; 3979 min_key.type = BTRFS_DIR_INDEX_KEY; 3980 min_key.offset = min_offset; 3981 3982 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3983 3984 /* 3985 * we didn't find anything from this transaction, see if there 3986 * is anything at all 3987 */ 3988 if (ret != 0 || min_key.objectid != ino || 3989 min_key.type != BTRFS_DIR_INDEX_KEY) { 3990 min_key.objectid = ino; 3991 min_key.type = BTRFS_DIR_INDEX_KEY; 3992 min_key.offset = (u64)-1; 3993 btrfs_release_path(path); 3994 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3995 if (ret < 0) { 3996 btrfs_release_path(path); 3997 return ret; 3998 } 3999 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4000 4001 /* if ret == 0 there are items for this type, 4002 * create a range to tell us the last key of this type. 4003 * otherwise, there are no items in this directory after 4004 * *min_offset, and we create a range to indicate that. 4005 */ 4006 if (ret == 0) { 4007 struct btrfs_key tmp; 4008 4009 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 4010 path->slots[0]); 4011 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4012 last_old_dentry_offset = tmp.offset; 4013 } 4014 goto done; 4015 } 4016 4017 /* go backward to find any previous key */ 4018 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4019 if (ret == 0) { 4020 struct btrfs_key tmp; 4021 4022 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 4023 /* 4024 * The dir index key before the first one we found that needs to 4025 * be logged might be in a previous leaf, and there might be a 4026 * gap between these keys, meaning that we had deletions that 4027 * happened. So the key range item we log (key type 4028 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 4029 * previous key's offset plus 1, so that those deletes are replayed. 4030 */ 4031 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4032 last_old_dentry_offset = tmp.offset; 4033 } 4034 btrfs_release_path(path); 4035 4036 /* 4037 * Find the first key from this transaction again. See the note for 4038 * log_new_dir_dentries, if we're logging a directory recursively we 4039 * won't be holding its i_mutex, which means we can modify the directory 4040 * while we're logging it. If we remove an entry between our first 4041 * search and this search we'll not find the key again and can just 4042 * bail. 4043 */ 4044 search: 4045 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 4046 if (ret != 0) 4047 goto done; 4048 4049 /* 4050 * we have a block from this transaction, log every item in it 4051 * from our directory 4052 */ 4053 while (1) { 4054 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 4055 &last_old_dentry_offset); 4056 if (ret != 0) { 4057 if (ret < 0) 4058 err = ret; 4059 goto done; 4060 } 4061 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 4062 4063 /* 4064 * look ahead to the next item and see if it is also 4065 * from this directory and from this transaction 4066 */ 4067 ret = btrfs_next_leaf(root, path); 4068 if (ret) { 4069 if (ret == 1) 4070 last_offset = (u64)-1; 4071 else 4072 err = ret; 4073 goto done; 4074 } 4075 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 4076 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 4077 last_offset = (u64)-1; 4078 goto done; 4079 } 4080 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 4081 /* 4082 * The next leaf was not changed in the current transaction 4083 * and has at least one dir index key. 4084 * We check for the next key because there might have been 4085 * one or more deletions between the last key we logged and 4086 * that next key. So the key range item we log (key type 4087 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 4088 * offset minus 1, so that those deletes are replayed. 4089 */ 4090 last_offset = min_key.offset - 1; 4091 goto done; 4092 } 4093 if (need_resched()) { 4094 btrfs_release_path(path); 4095 cond_resched(); 4096 goto search; 4097 } 4098 } 4099 done: 4100 btrfs_release_path(path); 4101 btrfs_release_path(dst_path); 4102 4103 if (err == 0) { 4104 *last_offset_ret = last_offset; 4105 /* 4106 * In case the leaf was changed in the current transaction but 4107 * all its dir items are from a past transaction, the last item 4108 * in the leaf is a dir item and there's no gap between that last 4109 * dir item and the first one on the next leaf (which did not 4110 * change in the current transaction), then we don't need to log 4111 * a range, last_old_dentry_offset is == to last_offset. 4112 */ 4113 ASSERT(last_old_dentry_offset <= last_offset); 4114 if (last_old_dentry_offset < last_offset) { 4115 ret = insert_dir_log_key(trans, log, path, ino, 4116 last_old_dentry_offset + 1, 4117 last_offset); 4118 if (ret) 4119 err = ret; 4120 } 4121 } 4122 return err; 4123 } 4124 4125 /* 4126 * logging directories is very similar to logging inodes, We find all the items 4127 * from the current transaction and write them to the log. 4128 * 4129 * The recovery code scans the directory in the subvolume, and if it finds a 4130 * key in the range logged that is not present in the log tree, then it means 4131 * that dir entry was unlinked during the transaction. 4132 * 4133 * In order for that scan to work, we must include one key smaller than 4134 * the smallest logged by this transaction and one key larger than the largest 4135 * key logged by this transaction. 4136 */ 4137 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4138 struct btrfs_inode *inode, 4139 struct btrfs_path *path, 4140 struct btrfs_path *dst_path, 4141 struct btrfs_log_ctx *ctx) 4142 { 4143 u64 min_key; 4144 u64 max_key; 4145 int ret; 4146 4147 min_key = BTRFS_DIR_START_INDEX; 4148 max_key = 0; 4149 ctx->last_dir_item_offset = inode->last_dir_index_offset; 4150 4151 while (1) { 4152 ret = log_dir_items(trans, inode, path, dst_path, 4153 ctx, min_key, &max_key); 4154 if (ret) 4155 return ret; 4156 if (max_key == (u64)-1) 4157 break; 4158 min_key = max_key + 1; 4159 } 4160 4161 inode->last_dir_index_offset = ctx->last_dir_item_offset; 4162 4163 return 0; 4164 } 4165 4166 /* 4167 * a helper function to drop items from the log before we relog an 4168 * inode. max_key_type indicates the highest item type to remove. 4169 * This cannot be run for file data extents because it does not 4170 * free the extents they point to. 4171 */ 4172 static int drop_inode_items(struct btrfs_trans_handle *trans, 4173 struct btrfs_root *log, 4174 struct btrfs_path *path, 4175 struct btrfs_inode *inode, 4176 int max_key_type) 4177 { 4178 int ret; 4179 struct btrfs_key key; 4180 struct btrfs_key found_key; 4181 int start_slot; 4182 4183 key.objectid = btrfs_ino(inode); 4184 key.type = max_key_type; 4185 key.offset = (u64)-1; 4186 4187 while (1) { 4188 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4189 BUG_ON(ret == 0); /* Logic error */ 4190 if (ret < 0) 4191 break; 4192 4193 if (path->slots[0] == 0) 4194 break; 4195 4196 path->slots[0]--; 4197 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4198 path->slots[0]); 4199 4200 if (found_key.objectid != key.objectid) 4201 break; 4202 4203 found_key.offset = 0; 4204 found_key.type = 0; 4205 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 4206 if (ret < 0) 4207 break; 4208 4209 ret = btrfs_del_items(trans, log, path, start_slot, 4210 path->slots[0] - start_slot + 1); 4211 /* 4212 * If start slot isn't 0 then we don't need to re-search, we've 4213 * found the last guy with the objectid in this tree. 4214 */ 4215 if (ret || start_slot != 0) 4216 break; 4217 btrfs_release_path(path); 4218 } 4219 btrfs_release_path(path); 4220 if (ret > 0) 4221 ret = 0; 4222 return ret; 4223 } 4224 4225 static int truncate_inode_items(struct btrfs_trans_handle *trans, 4226 struct btrfs_root *log_root, 4227 struct btrfs_inode *inode, 4228 u64 new_size, u32 min_type) 4229 { 4230 struct btrfs_truncate_control control = { 4231 .new_size = new_size, 4232 .ino = btrfs_ino(inode), 4233 .min_type = min_type, 4234 .skip_ref_updates = true, 4235 }; 4236 4237 return btrfs_truncate_inode_items(trans, log_root, &control); 4238 } 4239 4240 static void fill_inode_item(struct btrfs_trans_handle *trans, 4241 struct extent_buffer *leaf, 4242 struct btrfs_inode_item *item, 4243 struct inode *inode, int log_inode_only, 4244 u64 logged_isize) 4245 { 4246 struct btrfs_map_token token; 4247 u64 flags; 4248 4249 btrfs_init_map_token(&token, leaf); 4250 4251 if (log_inode_only) { 4252 /* set the generation to zero so the recover code 4253 * can tell the difference between an logging 4254 * just to say 'this inode exists' and a logging 4255 * to say 'update this inode with these values' 4256 */ 4257 btrfs_set_token_inode_generation(&token, item, 0); 4258 btrfs_set_token_inode_size(&token, item, logged_isize); 4259 } else { 4260 btrfs_set_token_inode_generation(&token, item, 4261 BTRFS_I(inode)->generation); 4262 btrfs_set_token_inode_size(&token, item, inode->i_size); 4263 } 4264 4265 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 4266 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 4267 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 4268 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 4269 4270 btrfs_set_token_timespec_sec(&token, &item->atime, 4271 inode->i_atime.tv_sec); 4272 btrfs_set_token_timespec_nsec(&token, &item->atime, 4273 inode->i_atime.tv_nsec); 4274 4275 btrfs_set_token_timespec_sec(&token, &item->mtime, 4276 inode->i_mtime.tv_sec); 4277 btrfs_set_token_timespec_nsec(&token, &item->mtime, 4278 inode->i_mtime.tv_nsec); 4279 4280 btrfs_set_token_timespec_sec(&token, &item->ctime, 4281 inode->i_ctime.tv_sec); 4282 btrfs_set_token_timespec_nsec(&token, &item->ctime, 4283 inode->i_ctime.tv_nsec); 4284 4285 /* 4286 * We do not need to set the nbytes field, in fact during a fast fsync 4287 * its value may not even be correct, since a fast fsync does not wait 4288 * for ordered extent completion, which is where we update nbytes, it 4289 * only waits for writeback to complete. During log replay as we find 4290 * file extent items and replay them, we adjust the nbytes field of the 4291 * inode item in subvolume tree as needed (see overwrite_item()). 4292 */ 4293 4294 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4295 btrfs_set_token_inode_transid(&token, item, trans->transid); 4296 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4297 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4298 BTRFS_I(inode)->ro_flags); 4299 btrfs_set_token_inode_flags(&token, item, flags); 4300 btrfs_set_token_inode_block_group(&token, item, 0); 4301 } 4302 4303 static int log_inode_item(struct btrfs_trans_handle *trans, 4304 struct btrfs_root *log, struct btrfs_path *path, 4305 struct btrfs_inode *inode, bool inode_item_dropped) 4306 { 4307 struct btrfs_inode_item *inode_item; 4308 int ret; 4309 4310 /* 4311 * If we are doing a fast fsync and the inode was logged before in the 4312 * current transaction, then we know the inode was previously logged and 4313 * it exists in the log tree. For performance reasons, in this case use 4314 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4315 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4316 * contention in case there are concurrent fsyncs for other inodes of the 4317 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4318 * already exists can also result in unnecessarily splitting a leaf. 4319 */ 4320 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4321 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4322 ASSERT(ret <= 0); 4323 if (ret > 0) 4324 ret = -ENOENT; 4325 } else { 4326 /* 4327 * This means it is the first fsync in the current transaction, 4328 * so the inode item is not in the log and we need to insert it. 4329 * We can never get -EEXIST because we are only called for a fast 4330 * fsync and in case an inode eviction happens after the inode was 4331 * logged before in the current transaction, when we load again 4332 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4333 * flags and set ->logged_trans to 0. 4334 */ 4335 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4336 sizeof(*inode_item)); 4337 ASSERT(ret != -EEXIST); 4338 } 4339 if (ret) 4340 return ret; 4341 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4342 struct btrfs_inode_item); 4343 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4344 0, 0); 4345 btrfs_release_path(path); 4346 return 0; 4347 } 4348 4349 static int log_csums(struct btrfs_trans_handle *trans, 4350 struct btrfs_inode *inode, 4351 struct btrfs_root *log_root, 4352 struct btrfs_ordered_sum *sums) 4353 { 4354 const u64 lock_end = sums->bytenr + sums->len - 1; 4355 struct extent_state *cached_state = NULL; 4356 int ret; 4357 4358 /* 4359 * If this inode was not used for reflink operations in the current 4360 * transaction with new extents, then do the fast path, no need to 4361 * worry about logging checksum items with overlapping ranges. 4362 */ 4363 if (inode->last_reflink_trans < trans->transid) 4364 return btrfs_csum_file_blocks(trans, log_root, sums); 4365 4366 /* 4367 * Serialize logging for checksums. This is to avoid racing with the 4368 * same checksum being logged by another task that is logging another 4369 * file which happens to refer to the same extent as well. Such races 4370 * can leave checksum items in the log with overlapping ranges. 4371 */ 4372 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 4373 lock_end, &cached_state); 4374 if (ret) 4375 return ret; 4376 /* 4377 * Due to extent cloning, we might have logged a csum item that covers a 4378 * subrange of a cloned extent, and later we can end up logging a csum 4379 * item for a larger subrange of the same extent or the entire range. 4380 * This would leave csum items in the log tree that cover the same range 4381 * and break the searches for checksums in the log tree, resulting in 4382 * some checksums missing in the fs/subvolume tree. So just delete (or 4383 * trim and adjust) any existing csum items in the log for this range. 4384 */ 4385 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4386 if (!ret) 4387 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4388 4389 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 4390 &cached_state); 4391 4392 return ret; 4393 } 4394 4395 static noinline int copy_items(struct btrfs_trans_handle *trans, 4396 struct btrfs_inode *inode, 4397 struct btrfs_path *dst_path, 4398 struct btrfs_path *src_path, 4399 int start_slot, int nr, int inode_only, 4400 u64 logged_isize) 4401 { 4402 struct btrfs_root *log = inode->root->log_root; 4403 struct btrfs_file_extent_item *extent; 4404 struct extent_buffer *src = src_path->nodes[0]; 4405 int ret = 0; 4406 struct btrfs_key *ins_keys; 4407 u32 *ins_sizes; 4408 struct btrfs_item_batch batch; 4409 char *ins_data; 4410 int i; 4411 int dst_index; 4412 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4413 const u64 i_size = i_size_read(&inode->vfs_inode); 4414 4415 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4416 nr * sizeof(u32), GFP_NOFS); 4417 if (!ins_data) 4418 return -ENOMEM; 4419 4420 ins_sizes = (u32 *)ins_data; 4421 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4422 batch.keys = ins_keys; 4423 batch.data_sizes = ins_sizes; 4424 batch.total_data_size = 0; 4425 batch.nr = 0; 4426 4427 dst_index = 0; 4428 for (i = 0; i < nr; i++) { 4429 const int src_slot = start_slot + i; 4430 struct btrfs_root *csum_root; 4431 struct btrfs_ordered_sum *sums; 4432 struct btrfs_ordered_sum *sums_next; 4433 LIST_HEAD(ordered_sums); 4434 u64 disk_bytenr; 4435 u64 disk_num_bytes; 4436 u64 extent_offset; 4437 u64 extent_num_bytes; 4438 bool is_old_extent; 4439 4440 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4441 4442 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4443 goto add_to_batch; 4444 4445 extent = btrfs_item_ptr(src, src_slot, 4446 struct btrfs_file_extent_item); 4447 4448 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4449 trans->transid); 4450 4451 /* 4452 * Don't copy extents from past generations. That would make us 4453 * log a lot more metadata for common cases like doing only a 4454 * few random writes into a file and then fsync it for the first 4455 * time or after the full sync flag is set on the inode. We can 4456 * get leaves full of extent items, most of which are from past 4457 * generations, so we can skip them - as long as the inode has 4458 * not been the target of a reflink operation in this transaction, 4459 * as in that case it might have had file extent items with old 4460 * generations copied into it. We also must always log prealloc 4461 * extents that start at or beyond eof, otherwise we would lose 4462 * them on log replay. 4463 */ 4464 if (is_old_extent && 4465 ins_keys[dst_index].offset < i_size && 4466 inode->last_reflink_trans < trans->transid) 4467 continue; 4468 4469 if (skip_csum) 4470 goto add_to_batch; 4471 4472 /* Only regular extents have checksums. */ 4473 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4474 goto add_to_batch; 4475 4476 /* 4477 * If it's an extent created in a past transaction, then its 4478 * checksums are already accessible from the committed csum tree, 4479 * no need to log them. 4480 */ 4481 if (is_old_extent) 4482 goto add_to_batch; 4483 4484 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4485 /* If it's an explicit hole, there are no checksums. */ 4486 if (disk_bytenr == 0) 4487 goto add_to_batch; 4488 4489 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4490 4491 if (btrfs_file_extent_compression(src, extent)) { 4492 extent_offset = 0; 4493 extent_num_bytes = disk_num_bytes; 4494 } else { 4495 extent_offset = btrfs_file_extent_offset(src, extent); 4496 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4497 } 4498 4499 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4500 disk_bytenr += extent_offset; 4501 ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, 4502 disk_bytenr + extent_num_bytes - 1, 4503 &ordered_sums, 0); 4504 if (ret) 4505 goto out; 4506 4507 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4508 if (!ret) 4509 ret = log_csums(trans, inode, log, sums); 4510 list_del(&sums->list); 4511 kfree(sums); 4512 } 4513 if (ret) 4514 goto out; 4515 4516 add_to_batch: 4517 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4518 batch.total_data_size += ins_sizes[dst_index]; 4519 batch.nr++; 4520 dst_index++; 4521 } 4522 4523 /* 4524 * We have a leaf full of old extent items that don't need to be logged, 4525 * so we don't need to do anything. 4526 */ 4527 if (batch.nr == 0) 4528 goto out; 4529 4530 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4531 if (ret) 4532 goto out; 4533 4534 dst_index = 0; 4535 for (i = 0; i < nr; i++) { 4536 const int src_slot = start_slot + i; 4537 const int dst_slot = dst_path->slots[0] + dst_index; 4538 struct btrfs_key key; 4539 unsigned long src_offset; 4540 unsigned long dst_offset; 4541 4542 /* 4543 * We're done, all the remaining items in the source leaf 4544 * correspond to old file extent items. 4545 */ 4546 if (dst_index >= batch.nr) 4547 break; 4548 4549 btrfs_item_key_to_cpu(src, &key, src_slot); 4550 4551 if (key.type != BTRFS_EXTENT_DATA_KEY) 4552 goto copy_item; 4553 4554 extent = btrfs_item_ptr(src, src_slot, 4555 struct btrfs_file_extent_item); 4556 4557 /* See the comment in the previous loop, same logic. */ 4558 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4559 key.offset < i_size && 4560 inode->last_reflink_trans < trans->transid) 4561 continue; 4562 4563 copy_item: 4564 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4565 src_offset = btrfs_item_ptr_offset(src, src_slot); 4566 4567 if (key.type == BTRFS_INODE_ITEM_KEY) { 4568 struct btrfs_inode_item *inode_item; 4569 4570 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4571 struct btrfs_inode_item); 4572 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4573 &inode->vfs_inode, 4574 inode_only == LOG_INODE_EXISTS, 4575 logged_isize); 4576 } else { 4577 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4578 src_offset, ins_sizes[dst_index]); 4579 } 4580 4581 dst_index++; 4582 } 4583 4584 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4585 btrfs_release_path(dst_path); 4586 out: 4587 kfree(ins_data); 4588 4589 return ret; 4590 } 4591 4592 static int extent_cmp(void *priv, const struct list_head *a, 4593 const struct list_head *b) 4594 { 4595 const struct extent_map *em1, *em2; 4596 4597 em1 = list_entry(a, struct extent_map, list); 4598 em2 = list_entry(b, struct extent_map, list); 4599 4600 if (em1->start < em2->start) 4601 return -1; 4602 else if (em1->start > em2->start) 4603 return 1; 4604 return 0; 4605 } 4606 4607 static int log_extent_csums(struct btrfs_trans_handle *trans, 4608 struct btrfs_inode *inode, 4609 struct btrfs_root *log_root, 4610 const struct extent_map *em, 4611 struct btrfs_log_ctx *ctx) 4612 { 4613 struct btrfs_ordered_extent *ordered; 4614 struct btrfs_root *csum_root; 4615 u64 csum_offset; 4616 u64 csum_len; 4617 u64 mod_start = em->mod_start; 4618 u64 mod_len = em->mod_len; 4619 LIST_HEAD(ordered_sums); 4620 int ret = 0; 4621 4622 if (inode->flags & BTRFS_INODE_NODATASUM || 4623 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4624 em->block_start == EXTENT_MAP_HOLE) 4625 return 0; 4626 4627 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4628 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4629 const u64 mod_end = mod_start + mod_len; 4630 struct btrfs_ordered_sum *sums; 4631 4632 if (mod_len == 0) 4633 break; 4634 4635 if (ordered_end <= mod_start) 4636 continue; 4637 if (mod_end <= ordered->file_offset) 4638 break; 4639 4640 /* 4641 * We are going to copy all the csums on this ordered extent, so 4642 * go ahead and adjust mod_start and mod_len in case this ordered 4643 * extent has already been logged. 4644 */ 4645 if (ordered->file_offset > mod_start) { 4646 if (ordered_end >= mod_end) 4647 mod_len = ordered->file_offset - mod_start; 4648 /* 4649 * If we have this case 4650 * 4651 * |--------- logged extent ---------| 4652 * |----- ordered extent ----| 4653 * 4654 * Just don't mess with mod_start and mod_len, we'll 4655 * just end up logging more csums than we need and it 4656 * will be ok. 4657 */ 4658 } else { 4659 if (ordered_end < mod_end) { 4660 mod_len = mod_end - ordered_end; 4661 mod_start = ordered_end; 4662 } else { 4663 mod_len = 0; 4664 } 4665 } 4666 4667 /* 4668 * To keep us from looping for the above case of an ordered 4669 * extent that falls inside of the logged extent. 4670 */ 4671 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4672 continue; 4673 4674 list_for_each_entry(sums, &ordered->list, list) { 4675 ret = log_csums(trans, inode, log_root, sums); 4676 if (ret) 4677 return ret; 4678 } 4679 } 4680 4681 /* We're done, found all csums in the ordered extents. */ 4682 if (mod_len == 0) 4683 return 0; 4684 4685 /* If we're compressed we have to save the entire range of csums. */ 4686 if (em->compress_type) { 4687 csum_offset = 0; 4688 csum_len = max(em->block_len, em->orig_block_len); 4689 } else { 4690 csum_offset = mod_start - em->start; 4691 csum_len = mod_len; 4692 } 4693 4694 /* block start is already adjusted for the file extent offset. */ 4695 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4696 ret = btrfs_lookup_csums_range(csum_root, 4697 em->block_start + csum_offset, 4698 em->block_start + csum_offset + 4699 csum_len - 1, &ordered_sums, 0); 4700 if (ret) 4701 return ret; 4702 4703 while (!list_empty(&ordered_sums)) { 4704 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4705 struct btrfs_ordered_sum, 4706 list); 4707 if (!ret) 4708 ret = log_csums(trans, inode, log_root, sums); 4709 list_del(&sums->list); 4710 kfree(sums); 4711 } 4712 4713 return ret; 4714 } 4715 4716 static int log_one_extent(struct btrfs_trans_handle *trans, 4717 struct btrfs_inode *inode, 4718 const struct extent_map *em, 4719 struct btrfs_path *path, 4720 struct btrfs_log_ctx *ctx) 4721 { 4722 struct btrfs_drop_extents_args drop_args = { 0 }; 4723 struct btrfs_root *log = inode->root->log_root; 4724 struct btrfs_file_extent_item fi = { 0 }; 4725 struct extent_buffer *leaf; 4726 struct btrfs_key key; 4727 u64 extent_offset = em->start - em->orig_start; 4728 u64 block_len; 4729 int ret; 4730 4731 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4732 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4733 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4734 else 4735 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4736 4737 block_len = max(em->block_len, em->orig_block_len); 4738 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4739 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4740 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4741 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4742 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4743 extent_offset); 4744 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4745 } 4746 4747 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4748 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4749 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4750 btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4751 4752 ret = log_extent_csums(trans, inode, log, em, ctx); 4753 if (ret) 4754 return ret; 4755 4756 /* 4757 * If this is the first time we are logging the inode in the current 4758 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4759 * because it does a deletion search, which always acquires write locks 4760 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4761 * but also adds significant contention in a log tree, since log trees 4762 * are small, with a root at level 2 or 3 at most, due to their short 4763 * life span. 4764 */ 4765 if (ctx->logged_before) { 4766 drop_args.path = path; 4767 drop_args.start = em->start; 4768 drop_args.end = em->start + em->len; 4769 drop_args.replace_extent = true; 4770 drop_args.extent_item_size = sizeof(fi); 4771 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4772 if (ret) 4773 return ret; 4774 } 4775 4776 if (!drop_args.extent_inserted) { 4777 key.objectid = btrfs_ino(inode); 4778 key.type = BTRFS_EXTENT_DATA_KEY; 4779 key.offset = em->start; 4780 4781 ret = btrfs_insert_empty_item(trans, log, path, &key, 4782 sizeof(fi)); 4783 if (ret) 4784 return ret; 4785 } 4786 leaf = path->nodes[0]; 4787 write_extent_buffer(leaf, &fi, 4788 btrfs_item_ptr_offset(leaf, path->slots[0]), 4789 sizeof(fi)); 4790 btrfs_mark_buffer_dirty(leaf); 4791 4792 btrfs_release_path(path); 4793 4794 return ret; 4795 } 4796 4797 /* 4798 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4799 * lose them after doing a full/fast fsync and replaying the log. We scan the 4800 * subvolume's root instead of iterating the inode's extent map tree because 4801 * otherwise we can log incorrect extent items based on extent map conversion. 4802 * That can happen due to the fact that extent maps are merged when they 4803 * are not in the extent map tree's list of modified extents. 4804 */ 4805 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4806 struct btrfs_inode *inode, 4807 struct btrfs_path *path) 4808 { 4809 struct btrfs_root *root = inode->root; 4810 struct btrfs_key key; 4811 const u64 i_size = i_size_read(&inode->vfs_inode); 4812 const u64 ino = btrfs_ino(inode); 4813 struct btrfs_path *dst_path = NULL; 4814 bool dropped_extents = false; 4815 u64 truncate_offset = i_size; 4816 struct extent_buffer *leaf; 4817 int slot; 4818 int ins_nr = 0; 4819 int start_slot; 4820 int ret; 4821 4822 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4823 return 0; 4824 4825 key.objectid = ino; 4826 key.type = BTRFS_EXTENT_DATA_KEY; 4827 key.offset = i_size; 4828 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4829 if (ret < 0) 4830 goto out; 4831 4832 /* 4833 * We must check if there is a prealloc extent that starts before the 4834 * i_size and crosses the i_size boundary. This is to ensure later we 4835 * truncate down to the end of that extent and not to the i_size, as 4836 * otherwise we end up losing part of the prealloc extent after a log 4837 * replay and with an implicit hole if there is another prealloc extent 4838 * that starts at an offset beyond i_size. 4839 */ 4840 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4841 if (ret < 0) 4842 goto out; 4843 4844 if (ret == 0) { 4845 struct btrfs_file_extent_item *ei; 4846 4847 leaf = path->nodes[0]; 4848 slot = path->slots[0]; 4849 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4850 4851 if (btrfs_file_extent_type(leaf, ei) == 4852 BTRFS_FILE_EXTENT_PREALLOC) { 4853 u64 extent_end; 4854 4855 btrfs_item_key_to_cpu(leaf, &key, slot); 4856 extent_end = key.offset + 4857 btrfs_file_extent_num_bytes(leaf, ei); 4858 4859 if (extent_end > i_size) 4860 truncate_offset = extent_end; 4861 } 4862 } else { 4863 ret = 0; 4864 } 4865 4866 while (true) { 4867 leaf = path->nodes[0]; 4868 slot = path->slots[0]; 4869 4870 if (slot >= btrfs_header_nritems(leaf)) { 4871 if (ins_nr > 0) { 4872 ret = copy_items(trans, inode, dst_path, path, 4873 start_slot, ins_nr, 1, 0); 4874 if (ret < 0) 4875 goto out; 4876 ins_nr = 0; 4877 } 4878 ret = btrfs_next_leaf(root, path); 4879 if (ret < 0) 4880 goto out; 4881 if (ret > 0) { 4882 ret = 0; 4883 break; 4884 } 4885 continue; 4886 } 4887 4888 btrfs_item_key_to_cpu(leaf, &key, slot); 4889 if (key.objectid > ino) 4890 break; 4891 if (WARN_ON_ONCE(key.objectid < ino) || 4892 key.type < BTRFS_EXTENT_DATA_KEY || 4893 key.offset < i_size) { 4894 path->slots[0]++; 4895 continue; 4896 } 4897 if (!dropped_extents) { 4898 /* 4899 * Avoid logging extent items logged in past fsync calls 4900 * and leading to duplicate keys in the log tree. 4901 */ 4902 ret = truncate_inode_items(trans, root->log_root, inode, 4903 truncate_offset, 4904 BTRFS_EXTENT_DATA_KEY); 4905 if (ret) 4906 goto out; 4907 dropped_extents = true; 4908 } 4909 if (ins_nr == 0) 4910 start_slot = slot; 4911 ins_nr++; 4912 path->slots[0]++; 4913 if (!dst_path) { 4914 dst_path = btrfs_alloc_path(); 4915 if (!dst_path) { 4916 ret = -ENOMEM; 4917 goto out; 4918 } 4919 } 4920 } 4921 if (ins_nr > 0) 4922 ret = copy_items(trans, inode, dst_path, path, 4923 start_slot, ins_nr, 1, 0); 4924 out: 4925 btrfs_release_path(path); 4926 btrfs_free_path(dst_path); 4927 return ret; 4928 } 4929 4930 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4931 struct btrfs_inode *inode, 4932 struct btrfs_path *path, 4933 struct btrfs_log_ctx *ctx) 4934 { 4935 struct btrfs_ordered_extent *ordered; 4936 struct btrfs_ordered_extent *tmp; 4937 struct extent_map *em, *n; 4938 struct list_head extents; 4939 struct extent_map_tree *tree = &inode->extent_tree; 4940 int ret = 0; 4941 int num = 0; 4942 4943 INIT_LIST_HEAD(&extents); 4944 4945 write_lock(&tree->lock); 4946 4947 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4948 list_del_init(&em->list); 4949 /* 4950 * Just an arbitrary number, this can be really CPU intensive 4951 * once we start getting a lot of extents, and really once we 4952 * have a bunch of extents we just want to commit since it will 4953 * be faster. 4954 */ 4955 if (++num > 32768) { 4956 list_del_init(&tree->modified_extents); 4957 ret = -EFBIG; 4958 goto process; 4959 } 4960 4961 if (em->generation < trans->transid) 4962 continue; 4963 4964 /* We log prealloc extents beyond eof later. */ 4965 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4966 em->start >= i_size_read(&inode->vfs_inode)) 4967 continue; 4968 4969 /* Need a ref to keep it from getting evicted from cache */ 4970 refcount_inc(&em->refs); 4971 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4972 list_add_tail(&em->list, &extents); 4973 num++; 4974 } 4975 4976 list_sort(NULL, &extents, extent_cmp); 4977 process: 4978 while (!list_empty(&extents)) { 4979 em = list_entry(extents.next, struct extent_map, list); 4980 4981 list_del_init(&em->list); 4982 4983 /* 4984 * If we had an error we just need to delete everybody from our 4985 * private list. 4986 */ 4987 if (ret) { 4988 clear_em_logging(tree, em); 4989 free_extent_map(em); 4990 continue; 4991 } 4992 4993 write_unlock(&tree->lock); 4994 4995 ret = log_one_extent(trans, inode, em, path, ctx); 4996 write_lock(&tree->lock); 4997 clear_em_logging(tree, em); 4998 free_extent_map(em); 4999 } 5000 WARN_ON(!list_empty(&extents)); 5001 write_unlock(&tree->lock); 5002 5003 if (!ret) 5004 ret = btrfs_log_prealloc_extents(trans, inode, path); 5005 if (ret) 5006 return ret; 5007 5008 /* 5009 * We have logged all extents successfully, now make sure the commit of 5010 * the current transaction waits for the ordered extents to complete 5011 * before it commits and wipes out the log trees, otherwise we would 5012 * lose data if an ordered extents completes after the transaction 5013 * commits and a power failure happens after the transaction commit. 5014 */ 5015 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 5016 list_del_init(&ordered->log_list); 5017 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 5018 5019 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 5020 spin_lock_irq(&inode->ordered_tree.lock); 5021 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 5022 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 5023 atomic_inc(&trans->transaction->pending_ordered); 5024 } 5025 spin_unlock_irq(&inode->ordered_tree.lock); 5026 } 5027 btrfs_put_ordered_extent(ordered); 5028 } 5029 5030 return 0; 5031 } 5032 5033 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 5034 struct btrfs_path *path, u64 *size_ret) 5035 { 5036 struct btrfs_key key; 5037 int ret; 5038 5039 key.objectid = btrfs_ino(inode); 5040 key.type = BTRFS_INODE_ITEM_KEY; 5041 key.offset = 0; 5042 5043 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 5044 if (ret < 0) { 5045 return ret; 5046 } else if (ret > 0) { 5047 *size_ret = 0; 5048 } else { 5049 struct btrfs_inode_item *item; 5050 5051 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5052 struct btrfs_inode_item); 5053 *size_ret = btrfs_inode_size(path->nodes[0], item); 5054 /* 5055 * If the in-memory inode's i_size is smaller then the inode 5056 * size stored in the btree, return the inode's i_size, so 5057 * that we get a correct inode size after replaying the log 5058 * when before a power failure we had a shrinking truncate 5059 * followed by addition of a new name (rename / new hard link). 5060 * Otherwise return the inode size from the btree, to avoid 5061 * data loss when replaying a log due to previously doing a 5062 * write that expands the inode's size and logging a new name 5063 * immediately after. 5064 */ 5065 if (*size_ret > inode->vfs_inode.i_size) 5066 *size_ret = inode->vfs_inode.i_size; 5067 } 5068 5069 btrfs_release_path(path); 5070 return 0; 5071 } 5072 5073 /* 5074 * At the moment we always log all xattrs. This is to figure out at log replay 5075 * time which xattrs must have their deletion replayed. If a xattr is missing 5076 * in the log tree and exists in the fs/subvol tree, we delete it. This is 5077 * because if a xattr is deleted, the inode is fsynced and a power failure 5078 * happens, causing the log to be replayed the next time the fs is mounted, 5079 * we want the xattr to not exist anymore (same behaviour as other filesystems 5080 * with a journal, ext3/4, xfs, f2fs, etc). 5081 */ 5082 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 5083 struct btrfs_inode *inode, 5084 struct btrfs_path *path, 5085 struct btrfs_path *dst_path) 5086 { 5087 struct btrfs_root *root = inode->root; 5088 int ret; 5089 struct btrfs_key key; 5090 const u64 ino = btrfs_ino(inode); 5091 int ins_nr = 0; 5092 int start_slot = 0; 5093 bool found_xattrs = false; 5094 5095 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5096 return 0; 5097 5098 key.objectid = ino; 5099 key.type = BTRFS_XATTR_ITEM_KEY; 5100 key.offset = 0; 5101 5102 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5103 if (ret < 0) 5104 return ret; 5105 5106 while (true) { 5107 int slot = path->slots[0]; 5108 struct extent_buffer *leaf = path->nodes[0]; 5109 int nritems = btrfs_header_nritems(leaf); 5110 5111 if (slot >= nritems) { 5112 if (ins_nr > 0) { 5113 ret = copy_items(trans, inode, dst_path, path, 5114 start_slot, ins_nr, 1, 0); 5115 if (ret < 0) 5116 return ret; 5117 ins_nr = 0; 5118 } 5119 ret = btrfs_next_leaf(root, path); 5120 if (ret < 0) 5121 return ret; 5122 else if (ret > 0) 5123 break; 5124 continue; 5125 } 5126 5127 btrfs_item_key_to_cpu(leaf, &key, slot); 5128 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5129 break; 5130 5131 if (ins_nr == 0) 5132 start_slot = slot; 5133 ins_nr++; 5134 path->slots[0]++; 5135 found_xattrs = true; 5136 cond_resched(); 5137 } 5138 if (ins_nr > 0) { 5139 ret = copy_items(trans, inode, dst_path, path, 5140 start_slot, ins_nr, 1, 0); 5141 if (ret < 0) 5142 return ret; 5143 } 5144 5145 if (!found_xattrs) 5146 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5147 5148 return 0; 5149 } 5150 5151 /* 5152 * When using the NO_HOLES feature if we punched a hole that causes the 5153 * deletion of entire leafs or all the extent items of the first leaf (the one 5154 * that contains the inode item and references) we may end up not processing 5155 * any extents, because there are no leafs with a generation matching the 5156 * current transaction that have extent items for our inode. So we need to find 5157 * if any holes exist and then log them. We also need to log holes after any 5158 * truncate operation that changes the inode's size. 5159 */ 5160 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5161 struct btrfs_inode *inode, 5162 struct btrfs_path *path) 5163 { 5164 struct btrfs_root *root = inode->root; 5165 struct btrfs_fs_info *fs_info = root->fs_info; 5166 struct btrfs_key key; 5167 const u64 ino = btrfs_ino(inode); 5168 const u64 i_size = i_size_read(&inode->vfs_inode); 5169 u64 prev_extent_end = 0; 5170 int ret; 5171 5172 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5173 return 0; 5174 5175 key.objectid = ino; 5176 key.type = BTRFS_EXTENT_DATA_KEY; 5177 key.offset = 0; 5178 5179 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5180 if (ret < 0) 5181 return ret; 5182 5183 while (true) { 5184 struct extent_buffer *leaf = path->nodes[0]; 5185 5186 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5187 ret = btrfs_next_leaf(root, path); 5188 if (ret < 0) 5189 return ret; 5190 if (ret > 0) { 5191 ret = 0; 5192 break; 5193 } 5194 leaf = path->nodes[0]; 5195 } 5196 5197 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5198 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5199 break; 5200 5201 /* We have a hole, log it. */ 5202 if (prev_extent_end < key.offset) { 5203 const u64 hole_len = key.offset - prev_extent_end; 5204 5205 /* 5206 * Release the path to avoid deadlocks with other code 5207 * paths that search the root while holding locks on 5208 * leafs from the log root. 5209 */ 5210 btrfs_release_path(path); 5211 ret = btrfs_insert_file_extent(trans, root->log_root, 5212 ino, prev_extent_end, 0, 5213 0, hole_len, 0, hole_len, 5214 0, 0, 0); 5215 if (ret < 0) 5216 return ret; 5217 5218 /* 5219 * Search for the same key again in the root. Since it's 5220 * an extent item and we are holding the inode lock, the 5221 * key must still exist. If it doesn't just emit warning 5222 * and return an error to fall back to a transaction 5223 * commit. 5224 */ 5225 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5226 if (ret < 0) 5227 return ret; 5228 if (WARN_ON(ret > 0)) 5229 return -ENOENT; 5230 leaf = path->nodes[0]; 5231 } 5232 5233 prev_extent_end = btrfs_file_extent_end(path); 5234 path->slots[0]++; 5235 cond_resched(); 5236 } 5237 5238 if (prev_extent_end < i_size) { 5239 u64 hole_len; 5240 5241 btrfs_release_path(path); 5242 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5243 ret = btrfs_insert_file_extent(trans, root->log_root, 5244 ino, prev_extent_end, 0, 0, 5245 hole_len, 0, hole_len, 5246 0, 0, 0); 5247 if (ret < 0) 5248 return ret; 5249 } 5250 5251 return 0; 5252 } 5253 5254 /* 5255 * When we are logging a new inode X, check if it doesn't have a reference that 5256 * matches the reference from some other inode Y created in a past transaction 5257 * and that was renamed in the current transaction. If we don't do this, then at 5258 * log replay time we can lose inode Y (and all its files if it's a directory): 5259 * 5260 * mkdir /mnt/x 5261 * echo "hello world" > /mnt/x/foobar 5262 * sync 5263 * mv /mnt/x /mnt/y 5264 * mkdir /mnt/x # or touch /mnt/x 5265 * xfs_io -c fsync /mnt/x 5266 * <power fail> 5267 * mount fs, trigger log replay 5268 * 5269 * After the log replay procedure, we would lose the first directory and all its 5270 * files (file foobar). 5271 * For the case where inode Y is not a directory we simply end up losing it: 5272 * 5273 * echo "123" > /mnt/foo 5274 * sync 5275 * mv /mnt/foo /mnt/bar 5276 * echo "abc" > /mnt/foo 5277 * xfs_io -c fsync /mnt/foo 5278 * <power fail> 5279 * 5280 * We also need this for cases where a snapshot entry is replaced by some other 5281 * entry (file or directory) otherwise we end up with an unreplayable log due to 5282 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5283 * if it were a regular entry: 5284 * 5285 * mkdir /mnt/x 5286 * btrfs subvolume snapshot /mnt /mnt/x/snap 5287 * btrfs subvolume delete /mnt/x/snap 5288 * rmdir /mnt/x 5289 * mkdir /mnt/x 5290 * fsync /mnt/x or fsync some new file inside it 5291 * <power fail> 5292 * 5293 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5294 * the same transaction. 5295 */ 5296 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5297 const int slot, 5298 const struct btrfs_key *key, 5299 struct btrfs_inode *inode, 5300 u64 *other_ino, u64 *other_parent) 5301 { 5302 int ret; 5303 struct btrfs_path *search_path; 5304 char *name = NULL; 5305 u32 name_len = 0; 5306 u32 item_size = btrfs_item_size(eb, slot); 5307 u32 cur_offset = 0; 5308 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5309 5310 search_path = btrfs_alloc_path(); 5311 if (!search_path) 5312 return -ENOMEM; 5313 search_path->search_commit_root = 1; 5314 search_path->skip_locking = 1; 5315 5316 while (cur_offset < item_size) { 5317 u64 parent; 5318 u32 this_name_len; 5319 u32 this_len; 5320 unsigned long name_ptr; 5321 struct btrfs_dir_item *di; 5322 5323 if (key->type == BTRFS_INODE_REF_KEY) { 5324 struct btrfs_inode_ref *iref; 5325 5326 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5327 parent = key->offset; 5328 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5329 name_ptr = (unsigned long)(iref + 1); 5330 this_len = sizeof(*iref) + this_name_len; 5331 } else { 5332 struct btrfs_inode_extref *extref; 5333 5334 extref = (struct btrfs_inode_extref *)(ptr + 5335 cur_offset); 5336 parent = btrfs_inode_extref_parent(eb, extref); 5337 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5338 name_ptr = (unsigned long)&extref->name; 5339 this_len = sizeof(*extref) + this_name_len; 5340 } 5341 5342 if (this_name_len > name_len) { 5343 char *new_name; 5344 5345 new_name = krealloc(name, this_name_len, GFP_NOFS); 5346 if (!new_name) { 5347 ret = -ENOMEM; 5348 goto out; 5349 } 5350 name_len = this_name_len; 5351 name = new_name; 5352 } 5353 5354 read_extent_buffer(eb, name, name_ptr, this_name_len); 5355 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5356 parent, name, this_name_len, 0); 5357 if (di && !IS_ERR(di)) { 5358 struct btrfs_key di_key; 5359 5360 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5361 di, &di_key); 5362 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5363 if (di_key.objectid != key->objectid) { 5364 ret = 1; 5365 *other_ino = di_key.objectid; 5366 *other_parent = parent; 5367 } else { 5368 ret = 0; 5369 } 5370 } else { 5371 ret = -EAGAIN; 5372 } 5373 goto out; 5374 } else if (IS_ERR(di)) { 5375 ret = PTR_ERR(di); 5376 goto out; 5377 } 5378 btrfs_release_path(search_path); 5379 5380 cur_offset += this_len; 5381 } 5382 ret = 0; 5383 out: 5384 btrfs_free_path(search_path); 5385 kfree(name); 5386 return ret; 5387 } 5388 5389 struct btrfs_ino_list { 5390 u64 ino; 5391 u64 parent; 5392 struct list_head list; 5393 }; 5394 5395 static int log_conflicting_inodes(struct btrfs_trans_handle *trans, 5396 struct btrfs_root *root, 5397 struct btrfs_path *path, 5398 struct btrfs_log_ctx *ctx, 5399 u64 ino, u64 parent) 5400 { 5401 struct btrfs_ino_list *ino_elem; 5402 LIST_HEAD(inode_list); 5403 int ret = 0; 5404 5405 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5406 if (!ino_elem) 5407 return -ENOMEM; 5408 ino_elem->ino = ino; 5409 ino_elem->parent = parent; 5410 list_add_tail(&ino_elem->list, &inode_list); 5411 5412 while (!list_empty(&inode_list)) { 5413 struct btrfs_fs_info *fs_info = root->fs_info; 5414 struct btrfs_key key; 5415 struct inode *inode; 5416 5417 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 5418 list); 5419 ino = ino_elem->ino; 5420 parent = ino_elem->parent; 5421 list_del(&ino_elem->list); 5422 kfree(ino_elem); 5423 if (ret) 5424 continue; 5425 5426 btrfs_release_path(path); 5427 5428 inode = btrfs_iget(fs_info->sb, ino, root); 5429 /* 5430 * If the other inode that had a conflicting dir entry was 5431 * deleted in the current transaction, we need to log its parent 5432 * directory. 5433 */ 5434 if (IS_ERR(inode)) { 5435 ret = PTR_ERR(inode); 5436 if (ret == -ENOENT) { 5437 inode = btrfs_iget(fs_info->sb, parent, root); 5438 if (IS_ERR(inode)) { 5439 ret = PTR_ERR(inode); 5440 } else { 5441 ret = btrfs_log_inode(trans, 5442 BTRFS_I(inode), 5443 LOG_OTHER_INODE_ALL, 5444 ctx); 5445 btrfs_add_delayed_iput(inode); 5446 } 5447 } 5448 continue; 5449 } 5450 /* 5451 * If the inode was already logged skip it - otherwise we can 5452 * hit an infinite loop. Example: 5453 * 5454 * From the commit root (previous transaction) we have the 5455 * following inodes: 5456 * 5457 * inode 257 a directory 5458 * inode 258 with references "zz" and "zz_link" on inode 257 5459 * inode 259 with reference "a" on inode 257 5460 * 5461 * And in the current (uncommitted) transaction we have: 5462 * 5463 * inode 257 a directory, unchanged 5464 * inode 258 with references "a" and "a2" on inode 257 5465 * inode 259 with reference "zz_link" on inode 257 5466 * inode 261 with reference "zz" on inode 257 5467 * 5468 * When logging inode 261 the following infinite loop could 5469 * happen if we don't skip already logged inodes: 5470 * 5471 * - we detect inode 258 as a conflicting inode, with inode 261 5472 * on reference "zz", and log it; 5473 * 5474 * - we detect inode 259 as a conflicting inode, with inode 258 5475 * on reference "a", and log it; 5476 * 5477 * - we detect inode 258 as a conflicting inode, with inode 259 5478 * on reference "zz_link", and log it - again! After this we 5479 * repeat the above steps forever. 5480 */ 5481 spin_lock(&BTRFS_I(inode)->lock); 5482 /* 5483 * Check the inode's logged_trans only instead of 5484 * btrfs_inode_in_log(). This is because the last_log_commit of 5485 * the inode is not updated when we only log that it exists (see 5486 * btrfs_log_inode()). 5487 */ 5488 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5489 spin_unlock(&BTRFS_I(inode)->lock); 5490 btrfs_add_delayed_iput(inode); 5491 continue; 5492 } 5493 spin_unlock(&BTRFS_I(inode)->lock); 5494 /* 5495 * We are safe logging the other inode without acquiring its 5496 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5497 * are safe against concurrent renames of the other inode as 5498 * well because during a rename we pin the log and update the 5499 * log with the new name before we unpin it. 5500 */ 5501 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); 5502 if (ret) { 5503 btrfs_add_delayed_iput(inode); 5504 continue; 5505 } 5506 5507 key.objectid = ino; 5508 key.type = BTRFS_INODE_REF_KEY; 5509 key.offset = 0; 5510 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5511 if (ret < 0) { 5512 btrfs_add_delayed_iput(inode); 5513 continue; 5514 } 5515 5516 while (true) { 5517 struct extent_buffer *leaf = path->nodes[0]; 5518 int slot = path->slots[0]; 5519 u64 other_ino = 0; 5520 u64 other_parent = 0; 5521 5522 if (slot >= btrfs_header_nritems(leaf)) { 5523 ret = btrfs_next_leaf(root, path); 5524 if (ret < 0) { 5525 break; 5526 } else if (ret > 0) { 5527 ret = 0; 5528 break; 5529 } 5530 continue; 5531 } 5532 5533 btrfs_item_key_to_cpu(leaf, &key, slot); 5534 if (key.objectid != ino || 5535 (key.type != BTRFS_INODE_REF_KEY && 5536 key.type != BTRFS_INODE_EXTREF_KEY)) { 5537 ret = 0; 5538 break; 5539 } 5540 5541 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5542 BTRFS_I(inode), &other_ino, 5543 &other_parent); 5544 if (ret < 0) 5545 break; 5546 if (ret > 0) { 5547 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5548 if (!ino_elem) { 5549 ret = -ENOMEM; 5550 break; 5551 } 5552 ino_elem->ino = other_ino; 5553 ino_elem->parent = other_parent; 5554 list_add_tail(&ino_elem->list, &inode_list); 5555 ret = 0; 5556 } 5557 path->slots[0]++; 5558 } 5559 btrfs_add_delayed_iput(inode); 5560 } 5561 5562 return ret; 5563 } 5564 5565 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5566 struct btrfs_inode *inode, 5567 struct btrfs_key *min_key, 5568 const struct btrfs_key *max_key, 5569 struct btrfs_path *path, 5570 struct btrfs_path *dst_path, 5571 const u64 logged_isize, 5572 const bool recursive_logging, 5573 const int inode_only, 5574 struct btrfs_log_ctx *ctx, 5575 bool *need_log_inode_item) 5576 { 5577 const u64 i_size = i_size_read(&inode->vfs_inode); 5578 struct btrfs_root *root = inode->root; 5579 int ins_start_slot = 0; 5580 int ins_nr = 0; 5581 int ret; 5582 5583 while (1) { 5584 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5585 if (ret < 0) 5586 return ret; 5587 if (ret > 0) { 5588 ret = 0; 5589 break; 5590 } 5591 again: 5592 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5593 if (min_key->objectid != max_key->objectid) 5594 break; 5595 if (min_key->type > max_key->type) 5596 break; 5597 5598 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5599 *need_log_inode_item = false; 5600 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5601 min_key->offset >= i_size) { 5602 /* 5603 * Extents at and beyond eof are logged with 5604 * btrfs_log_prealloc_extents(). 5605 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5606 * and no keys greater than that, so bail out. 5607 */ 5608 break; 5609 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5610 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5611 inode->generation == trans->transid && 5612 !recursive_logging) { 5613 u64 other_ino = 0; 5614 u64 other_parent = 0; 5615 5616 ret = btrfs_check_ref_name_override(path->nodes[0], 5617 path->slots[0], min_key, inode, 5618 &other_ino, &other_parent); 5619 if (ret < 0) { 5620 return ret; 5621 } else if (ret > 0 && 5622 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5623 if (ins_nr > 0) { 5624 ins_nr++; 5625 } else { 5626 ins_nr = 1; 5627 ins_start_slot = path->slots[0]; 5628 } 5629 ret = copy_items(trans, inode, dst_path, path, 5630 ins_start_slot, ins_nr, 5631 inode_only, logged_isize); 5632 if (ret < 0) 5633 return ret; 5634 ins_nr = 0; 5635 5636 ret = log_conflicting_inodes(trans, root, path, 5637 ctx, other_ino, other_parent); 5638 if (ret) 5639 return ret; 5640 btrfs_release_path(path); 5641 goto next_key; 5642 } 5643 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5644 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5645 if (ins_nr == 0) 5646 goto next_slot; 5647 ret = copy_items(trans, inode, dst_path, path, 5648 ins_start_slot, 5649 ins_nr, inode_only, logged_isize); 5650 if (ret < 0) 5651 return ret; 5652 ins_nr = 0; 5653 goto next_slot; 5654 } 5655 5656 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5657 ins_nr++; 5658 goto next_slot; 5659 } else if (!ins_nr) { 5660 ins_start_slot = path->slots[0]; 5661 ins_nr = 1; 5662 goto next_slot; 5663 } 5664 5665 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5666 ins_nr, inode_only, logged_isize); 5667 if (ret < 0) 5668 return ret; 5669 ins_nr = 1; 5670 ins_start_slot = path->slots[0]; 5671 next_slot: 5672 path->slots[0]++; 5673 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5674 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5675 path->slots[0]); 5676 goto again; 5677 } 5678 if (ins_nr) { 5679 ret = copy_items(trans, inode, dst_path, path, 5680 ins_start_slot, ins_nr, inode_only, 5681 logged_isize); 5682 if (ret < 0) 5683 return ret; 5684 ins_nr = 0; 5685 } 5686 btrfs_release_path(path); 5687 next_key: 5688 if (min_key->offset < (u64)-1) { 5689 min_key->offset++; 5690 } else if (min_key->type < max_key->type) { 5691 min_key->type++; 5692 min_key->offset = 0; 5693 } else { 5694 break; 5695 } 5696 5697 /* 5698 * We may process many leaves full of items for our inode, so 5699 * avoid monopolizing a cpu for too long by rescheduling while 5700 * not holding locks on any tree. 5701 */ 5702 cond_resched(); 5703 } 5704 if (ins_nr) { 5705 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5706 ins_nr, inode_only, logged_isize); 5707 if (ret) 5708 return ret; 5709 } 5710 5711 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5712 /* 5713 * Release the path because otherwise we might attempt to double 5714 * lock the same leaf with btrfs_log_prealloc_extents() below. 5715 */ 5716 btrfs_release_path(path); 5717 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5718 } 5719 5720 return ret; 5721 } 5722 5723 /* log a single inode in the tree log. 5724 * At least one parent directory for this inode must exist in the tree 5725 * or be logged already. 5726 * 5727 * Any items from this inode changed by the current transaction are copied 5728 * to the log tree. An extra reference is taken on any extents in this 5729 * file, allowing us to avoid a whole pile of corner cases around logging 5730 * blocks that have been removed from the tree. 5731 * 5732 * See LOG_INODE_ALL and related defines for a description of what inode_only 5733 * does. 5734 * 5735 * This handles both files and directories. 5736 */ 5737 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5738 struct btrfs_inode *inode, 5739 int inode_only, 5740 struct btrfs_log_ctx *ctx) 5741 { 5742 struct btrfs_path *path; 5743 struct btrfs_path *dst_path; 5744 struct btrfs_key min_key; 5745 struct btrfs_key max_key; 5746 struct btrfs_root *log = inode->root->log_root; 5747 int ret; 5748 bool fast_search = false; 5749 u64 ino = btrfs_ino(inode); 5750 struct extent_map_tree *em_tree = &inode->extent_tree; 5751 u64 logged_isize = 0; 5752 bool need_log_inode_item = true; 5753 bool xattrs_logged = false; 5754 bool recursive_logging = false; 5755 bool inode_item_dropped = true; 5756 const bool orig_logged_before = ctx->logged_before; 5757 5758 path = btrfs_alloc_path(); 5759 if (!path) 5760 return -ENOMEM; 5761 dst_path = btrfs_alloc_path(); 5762 if (!dst_path) { 5763 btrfs_free_path(path); 5764 return -ENOMEM; 5765 } 5766 5767 min_key.objectid = ino; 5768 min_key.type = BTRFS_INODE_ITEM_KEY; 5769 min_key.offset = 0; 5770 5771 max_key.objectid = ino; 5772 5773 5774 /* today the code can only do partial logging of directories */ 5775 if (S_ISDIR(inode->vfs_inode.i_mode) || 5776 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5777 &inode->runtime_flags) && 5778 inode_only >= LOG_INODE_EXISTS)) 5779 max_key.type = BTRFS_XATTR_ITEM_KEY; 5780 else 5781 max_key.type = (u8)-1; 5782 max_key.offset = (u64)-1; 5783 5784 /* 5785 * Only run delayed items if we are a directory. We want to make sure 5786 * all directory indexes hit the fs/subvolume tree so we can find them 5787 * and figure out which index ranges have to be logged. 5788 */ 5789 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5790 ret = btrfs_commit_inode_delayed_items(trans, inode); 5791 if (ret) 5792 goto out; 5793 } 5794 5795 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5796 recursive_logging = true; 5797 if (inode_only == LOG_OTHER_INODE) 5798 inode_only = LOG_INODE_EXISTS; 5799 else 5800 inode_only = LOG_INODE_ALL; 5801 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5802 } else { 5803 mutex_lock(&inode->log_mutex); 5804 } 5805 5806 /* 5807 * Before logging the inode item, cache the value returned by 5808 * inode_logged(), because after that we have the need to figure out if 5809 * the inode was previously logged in this transaction. 5810 */ 5811 ret = inode_logged(trans, inode, path); 5812 if (ret < 0) 5813 goto out_unlock; 5814 ctx->logged_before = (ret == 1); 5815 ret = 0; 5816 5817 /* 5818 * This is for cases where logging a directory could result in losing a 5819 * a file after replaying the log. For example, if we move a file from a 5820 * directory A to a directory B, then fsync directory A, we have no way 5821 * to known the file was moved from A to B, so logging just A would 5822 * result in losing the file after a log replay. 5823 */ 5824 if (S_ISDIR(inode->vfs_inode.i_mode) && 5825 inode_only == LOG_INODE_ALL && 5826 inode->last_unlink_trans >= trans->transid) { 5827 btrfs_set_log_full_commit(trans); 5828 ret = 1; 5829 goto out_unlock; 5830 } 5831 5832 /* 5833 * a brute force approach to making sure we get the most uptodate 5834 * copies of everything. 5835 */ 5836 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5837 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5838 5839 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 5840 if (inode_only == LOG_INODE_EXISTS) 5841 max_key_type = BTRFS_XATTR_ITEM_KEY; 5842 if (ctx->logged_before) 5843 ret = drop_inode_items(trans, log, path, inode, 5844 max_key_type); 5845 } else { 5846 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 5847 /* 5848 * Make sure the new inode item we write to the log has 5849 * the same isize as the current one (if it exists). 5850 * This is necessary to prevent data loss after log 5851 * replay, and also to prevent doing a wrong expanding 5852 * truncate - for e.g. create file, write 4K into offset 5853 * 0, fsync, write 4K into offset 4096, add hard link, 5854 * fsync some other file (to sync log), power fail - if 5855 * we use the inode's current i_size, after log replay 5856 * we get a 8Kb file, with the last 4Kb extent as a hole 5857 * (zeroes), as if an expanding truncate happened, 5858 * instead of getting a file of 4Kb only. 5859 */ 5860 ret = logged_inode_size(log, inode, path, &logged_isize); 5861 if (ret) 5862 goto out_unlock; 5863 } 5864 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5865 &inode->runtime_flags)) { 5866 if (inode_only == LOG_INODE_EXISTS) { 5867 max_key.type = BTRFS_XATTR_ITEM_KEY; 5868 if (ctx->logged_before) 5869 ret = drop_inode_items(trans, log, path, 5870 inode, max_key.type); 5871 } else { 5872 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5873 &inode->runtime_flags); 5874 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5875 &inode->runtime_flags); 5876 if (ctx->logged_before) 5877 ret = truncate_inode_items(trans, log, 5878 inode, 0, 0); 5879 } 5880 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5881 &inode->runtime_flags) || 5882 inode_only == LOG_INODE_EXISTS) { 5883 if (inode_only == LOG_INODE_ALL) 5884 fast_search = true; 5885 max_key.type = BTRFS_XATTR_ITEM_KEY; 5886 if (ctx->logged_before) 5887 ret = drop_inode_items(trans, log, path, inode, 5888 max_key.type); 5889 } else { 5890 if (inode_only == LOG_INODE_ALL) 5891 fast_search = true; 5892 inode_item_dropped = false; 5893 goto log_extents; 5894 } 5895 5896 } 5897 if (ret) 5898 goto out_unlock; 5899 5900 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5901 path, dst_path, logged_isize, 5902 recursive_logging, inode_only, ctx, 5903 &need_log_inode_item); 5904 if (ret) 5905 goto out_unlock; 5906 5907 btrfs_release_path(path); 5908 btrfs_release_path(dst_path); 5909 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5910 if (ret) 5911 goto out_unlock; 5912 xattrs_logged = true; 5913 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5914 btrfs_release_path(path); 5915 btrfs_release_path(dst_path); 5916 ret = btrfs_log_holes(trans, inode, path); 5917 if (ret) 5918 goto out_unlock; 5919 } 5920 log_extents: 5921 btrfs_release_path(path); 5922 btrfs_release_path(dst_path); 5923 if (need_log_inode_item) { 5924 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5925 if (ret) 5926 goto out_unlock; 5927 /* 5928 * If we are doing a fast fsync and the inode was logged before 5929 * in this transaction, we don't need to log the xattrs because 5930 * they were logged before. If xattrs were added, changed or 5931 * deleted since the last time we logged the inode, then we have 5932 * already logged them because the inode had the runtime flag 5933 * BTRFS_INODE_COPY_EVERYTHING set. 5934 */ 5935 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5936 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5937 if (ret) 5938 goto out_unlock; 5939 btrfs_release_path(path); 5940 } 5941 } 5942 if (fast_search) { 5943 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 5944 if (ret) 5945 goto out_unlock; 5946 } else if (inode_only == LOG_INODE_ALL) { 5947 struct extent_map *em, *n; 5948 5949 write_lock(&em_tree->lock); 5950 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5951 list_del_init(&em->list); 5952 write_unlock(&em_tree->lock); 5953 } 5954 5955 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5956 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 5957 if (ret) 5958 goto out_unlock; 5959 } 5960 5961 spin_lock(&inode->lock); 5962 inode->logged_trans = trans->transid; 5963 /* 5964 * Don't update last_log_commit if we logged that an inode exists. 5965 * We do this for three reasons: 5966 * 5967 * 1) We might have had buffered writes to this inode that were 5968 * flushed and had their ordered extents completed in this 5969 * transaction, but we did not previously log the inode with 5970 * LOG_INODE_ALL. Later the inode was evicted and after that 5971 * it was loaded again and this LOG_INODE_EXISTS log operation 5972 * happened. We must make sure that if an explicit fsync against 5973 * the inode is performed later, it logs the new extents, an 5974 * updated inode item, etc, and syncs the log. The same logic 5975 * applies to direct IO writes instead of buffered writes. 5976 * 5977 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 5978 * is logged with an i_size of 0 or whatever value was logged 5979 * before. If later the i_size of the inode is increased by a 5980 * truncate operation, the log is synced through an fsync of 5981 * some other inode and then finally an explicit fsync against 5982 * this inode is made, we must make sure this fsync logs the 5983 * inode with the new i_size, the hole between old i_size and 5984 * the new i_size, and syncs the log. 5985 * 5986 * 3) If we are logging that an ancestor inode exists as part of 5987 * logging a new name from a link or rename operation, don't update 5988 * its last_log_commit - otherwise if an explicit fsync is made 5989 * against an ancestor, the fsync considers the inode in the log 5990 * and doesn't sync the log, resulting in the ancestor missing after 5991 * a power failure unless the log was synced as part of an fsync 5992 * against any other unrelated inode. 5993 */ 5994 if (inode_only != LOG_INODE_EXISTS) 5995 inode->last_log_commit = inode->last_sub_trans; 5996 spin_unlock(&inode->lock); 5997 5998 /* 5999 * Reset the last_reflink_trans so that the next fsync does not need to 6000 * go through the slower path when logging extents and their checksums. 6001 */ 6002 if (inode_only == LOG_INODE_ALL) 6003 inode->last_reflink_trans = 0; 6004 6005 out_unlock: 6006 mutex_unlock(&inode->log_mutex); 6007 out: 6008 btrfs_free_path(path); 6009 btrfs_free_path(dst_path); 6010 6011 if (recursive_logging) 6012 ctx->logged_before = orig_logged_before; 6013 6014 return ret; 6015 } 6016 6017 /* 6018 * Check if we need to log an inode. This is used in contexts where while 6019 * logging an inode we need to log another inode (either that it exists or in 6020 * full mode). This is used instead of btrfs_inode_in_log() because the later 6021 * requires the inode to be in the log and have the log transaction committed, 6022 * while here we do not care if the log transaction was already committed - our 6023 * caller will commit the log later - and we want to avoid logging an inode 6024 * multiple times when multiple tasks have joined the same log transaction. 6025 */ 6026 static bool need_log_inode(struct btrfs_trans_handle *trans, 6027 struct btrfs_inode *inode) 6028 { 6029 /* 6030 * If a directory was not modified, no dentries added or removed, we can 6031 * and should avoid logging it. 6032 */ 6033 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 6034 return false; 6035 6036 /* 6037 * If this inode does not have new/updated/deleted xattrs since the last 6038 * time it was logged and is flagged as logged in the current transaction, 6039 * we can skip logging it. As for new/deleted names, those are updated in 6040 * the log by link/unlink/rename operations. 6041 * In case the inode was logged and then evicted and reloaded, its 6042 * logged_trans will be 0, in which case we have to fully log it since 6043 * logged_trans is a transient field, not persisted. 6044 */ 6045 if (inode->logged_trans == trans->transid && 6046 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 6047 return false; 6048 6049 return true; 6050 } 6051 6052 struct btrfs_dir_list { 6053 u64 ino; 6054 struct list_head list; 6055 }; 6056 6057 /* 6058 * Log the inodes of the new dentries of a directory. See log_dir_items() for 6059 * details about the why it is needed. 6060 * This is a recursive operation - if an existing dentry corresponds to a 6061 * directory, that directory's new entries are logged too (same behaviour as 6062 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 6063 * the dentries point to we do not lock their i_mutex, otherwise lockdep 6064 * complains about the following circular lock dependency / possible deadlock: 6065 * 6066 * CPU0 CPU1 6067 * ---- ---- 6068 * lock(&type->i_mutex_dir_key#3/2); 6069 * lock(sb_internal#2); 6070 * lock(&type->i_mutex_dir_key#3/2); 6071 * lock(&sb->s_type->i_mutex_key#14); 6072 * 6073 * Where sb_internal is the lock (a counter that works as a lock) acquired by 6074 * sb_start_intwrite() in btrfs_start_transaction(). 6075 * Not locking i_mutex of the inodes is still safe because: 6076 * 6077 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 6078 * that while logging the inode new references (names) are added or removed 6079 * from the inode, leaving the logged inode item with a link count that does 6080 * not match the number of logged inode reference items. This is fine because 6081 * at log replay time we compute the real number of links and correct the 6082 * link count in the inode item (see replay_one_buffer() and 6083 * link_to_fixup_dir()); 6084 * 6085 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 6086 * while logging the inode's items new index items (key type 6087 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 6088 * has a size that doesn't match the sum of the lengths of all the logged 6089 * names - this is ok, not a problem, because at log replay time we set the 6090 * directory's i_size to the correct value (see replay_one_name() and 6091 * do_overwrite_item()). 6092 */ 6093 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 6094 struct btrfs_root *root, 6095 struct btrfs_inode *start_inode, 6096 struct btrfs_log_ctx *ctx) 6097 { 6098 struct btrfs_fs_info *fs_info = root->fs_info; 6099 struct btrfs_path *path; 6100 LIST_HEAD(dir_list); 6101 struct btrfs_dir_list *dir_elem; 6102 int ret = 0; 6103 6104 /* 6105 * If we are logging a new name, as part of a link or rename operation, 6106 * don't bother logging new dentries, as we just want to log the names 6107 * of an inode and that any new parents exist. 6108 */ 6109 if (ctx->logging_new_name) 6110 return 0; 6111 6112 path = btrfs_alloc_path(); 6113 if (!path) 6114 return -ENOMEM; 6115 6116 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 6117 if (!dir_elem) { 6118 btrfs_free_path(path); 6119 return -ENOMEM; 6120 } 6121 dir_elem->ino = btrfs_ino(start_inode); 6122 list_add_tail(&dir_elem->list, &dir_list); 6123 6124 while (!list_empty(&dir_list)) { 6125 struct extent_buffer *leaf; 6126 struct btrfs_key min_key; 6127 int nritems; 6128 int i; 6129 6130 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 6131 list); 6132 if (ret) 6133 goto next_dir_inode; 6134 6135 min_key.objectid = dir_elem->ino; 6136 min_key.type = BTRFS_DIR_INDEX_KEY; 6137 min_key.offset = 0; 6138 again: 6139 btrfs_release_path(path); 6140 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 6141 if (ret < 0) { 6142 goto next_dir_inode; 6143 } else if (ret > 0) { 6144 ret = 0; 6145 goto next_dir_inode; 6146 } 6147 6148 leaf = path->nodes[0]; 6149 nritems = btrfs_header_nritems(leaf); 6150 for (i = path->slots[0]; i < nritems; i++) { 6151 struct btrfs_dir_item *di; 6152 struct btrfs_key di_key; 6153 struct inode *di_inode; 6154 struct btrfs_dir_list *new_dir_elem; 6155 int log_mode = LOG_INODE_EXISTS; 6156 int type; 6157 6158 btrfs_item_key_to_cpu(leaf, &min_key, i); 6159 if (min_key.objectid != dir_elem->ino || 6160 min_key.type != BTRFS_DIR_INDEX_KEY) 6161 goto next_dir_inode; 6162 6163 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 6164 type = btrfs_dir_type(leaf, di); 6165 if (btrfs_dir_transid(leaf, di) < trans->transid) 6166 continue; 6167 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 6168 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 6169 continue; 6170 6171 btrfs_release_path(path); 6172 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 6173 if (IS_ERR(di_inode)) { 6174 ret = PTR_ERR(di_inode); 6175 goto next_dir_inode; 6176 } 6177 6178 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6179 btrfs_add_delayed_iput(di_inode); 6180 break; 6181 } 6182 6183 ctx->log_new_dentries = false; 6184 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 6185 log_mode = LOG_INODE_ALL; 6186 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 6187 log_mode, ctx); 6188 btrfs_add_delayed_iput(di_inode); 6189 if (ret) 6190 goto next_dir_inode; 6191 if (ctx->log_new_dentries) { 6192 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 6193 GFP_NOFS); 6194 if (!new_dir_elem) { 6195 ret = -ENOMEM; 6196 goto next_dir_inode; 6197 } 6198 new_dir_elem->ino = di_key.objectid; 6199 list_add_tail(&new_dir_elem->list, &dir_list); 6200 } 6201 break; 6202 } 6203 if (min_key.offset < (u64)-1) { 6204 min_key.offset++; 6205 goto again; 6206 } 6207 next_dir_inode: 6208 list_del(&dir_elem->list); 6209 kfree(dir_elem); 6210 } 6211 6212 btrfs_free_path(path); 6213 return ret; 6214 } 6215 6216 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6217 struct btrfs_inode *inode, 6218 struct btrfs_log_ctx *ctx) 6219 { 6220 struct btrfs_fs_info *fs_info = trans->fs_info; 6221 int ret; 6222 struct btrfs_path *path; 6223 struct btrfs_key key; 6224 struct btrfs_root *root = inode->root; 6225 const u64 ino = btrfs_ino(inode); 6226 6227 path = btrfs_alloc_path(); 6228 if (!path) 6229 return -ENOMEM; 6230 path->skip_locking = 1; 6231 path->search_commit_root = 1; 6232 6233 key.objectid = ino; 6234 key.type = BTRFS_INODE_REF_KEY; 6235 key.offset = 0; 6236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6237 if (ret < 0) 6238 goto out; 6239 6240 while (true) { 6241 struct extent_buffer *leaf = path->nodes[0]; 6242 int slot = path->slots[0]; 6243 u32 cur_offset = 0; 6244 u32 item_size; 6245 unsigned long ptr; 6246 6247 if (slot >= btrfs_header_nritems(leaf)) { 6248 ret = btrfs_next_leaf(root, path); 6249 if (ret < 0) 6250 goto out; 6251 else if (ret > 0) 6252 break; 6253 continue; 6254 } 6255 6256 btrfs_item_key_to_cpu(leaf, &key, slot); 6257 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6258 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6259 break; 6260 6261 item_size = btrfs_item_size(leaf, slot); 6262 ptr = btrfs_item_ptr_offset(leaf, slot); 6263 while (cur_offset < item_size) { 6264 struct btrfs_key inode_key; 6265 struct inode *dir_inode; 6266 6267 inode_key.type = BTRFS_INODE_ITEM_KEY; 6268 inode_key.offset = 0; 6269 6270 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6271 struct btrfs_inode_extref *extref; 6272 6273 extref = (struct btrfs_inode_extref *) 6274 (ptr + cur_offset); 6275 inode_key.objectid = btrfs_inode_extref_parent( 6276 leaf, extref); 6277 cur_offset += sizeof(*extref); 6278 cur_offset += btrfs_inode_extref_name_len(leaf, 6279 extref); 6280 } else { 6281 inode_key.objectid = key.offset; 6282 cur_offset = item_size; 6283 } 6284 6285 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6286 root); 6287 /* 6288 * If the parent inode was deleted, return an error to 6289 * fallback to a transaction commit. This is to prevent 6290 * getting an inode that was moved from one parent A to 6291 * a parent B, got its former parent A deleted and then 6292 * it got fsync'ed, from existing at both parents after 6293 * a log replay (and the old parent still existing). 6294 * Example: 6295 * 6296 * mkdir /mnt/A 6297 * mkdir /mnt/B 6298 * touch /mnt/B/bar 6299 * sync 6300 * mv /mnt/B/bar /mnt/A/bar 6301 * mv -T /mnt/A /mnt/B 6302 * fsync /mnt/B/bar 6303 * <power fail> 6304 * 6305 * If we ignore the old parent B which got deleted, 6306 * after a log replay we would have file bar linked 6307 * at both parents and the old parent B would still 6308 * exist. 6309 */ 6310 if (IS_ERR(dir_inode)) { 6311 ret = PTR_ERR(dir_inode); 6312 goto out; 6313 } 6314 6315 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6316 btrfs_add_delayed_iput(dir_inode); 6317 continue; 6318 } 6319 6320 ctx->log_new_dentries = false; 6321 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6322 LOG_INODE_ALL, ctx); 6323 if (!ret && ctx->log_new_dentries) 6324 ret = log_new_dir_dentries(trans, root, 6325 BTRFS_I(dir_inode), ctx); 6326 btrfs_add_delayed_iput(dir_inode); 6327 if (ret) 6328 goto out; 6329 } 6330 path->slots[0]++; 6331 } 6332 ret = 0; 6333 out: 6334 btrfs_free_path(path); 6335 return ret; 6336 } 6337 6338 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6339 struct btrfs_root *root, 6340 struct btrfs_path *path, 6341 struct btrfs_log_ctx *ctx) 6342 { 6343 struct btrfs_key found_key; 6344 6345 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6346 6347 while (true) { 6348 struct btrfs_fs_info *fs_info = root->fs_info; 6349 struct extent_buffer *leaf = path->nodes[0]; 6350 int slot = path->slots[0]; 6351 struct btrfs_key search_key; 6352 struct inode *inode; 6353 u64 ino; 6354 int ret = 0; 6355 6356 btrfs_release_path(path); 6357 6358 ino = found_key.offset; 6359 6360 search_key.objectid = found_key.offset; 6361 search_key.type = BTRFS_INODE_ITEM_KEY; 6362 search_key.offset = 0; 6363 inode = btrfs_iget(fs_info->sb, ino, root); 6364 if (IS_ERR(inode)) 6365 return PTR_ERR(inode); 6366 6367 if (BTRFS_I(inode)->generation >= trans->transid && 6368 need_log_inode(trans, BTRFS_I(inode))) 6369 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6370 LOG_INODE_EXISTS, ctx); 6371 btrfs_add_delayed_iput(inode); 6372 if (ret) 6373 return ret; 6374 6375 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6376 break; 6377 6378 search_key.type = BTRFS_INODE_REF_KEY; 6379 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6380 if (ret < 0) 6381 return ret; 6382 6383 leaf = path->nodes[0]; 6384 slot = path->slots[0]; 6385 if (slot >= btrfs_header_nritems(leaf)) { 6386 ret = btrfs_next_leaf(root, path); 6387 if (ret < 0) 6388 return ret; 6389 else if (ret > 0) 6390 return -ENOENT; 6391 leaf = path->nodes[0]; 6392 slot = path->slots[0]; 6393 } 6394 6395 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6396 if (found_key.objectid != search_key.objectid || 6397 found_key.type != BTRFS_INODE_REF_KEY) 6398 return -ENOENT; 6399 } 6400 return 0; 6401 } 6402 6403 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6404 struct btrfs_inode *inode, 6405 struct dentry *parent, 6406 struct btrfs_log_ctx *ctx) 6407 { 6408 struct btrfs_root *root = inode->root; 6409 struct dentry *old_parent = NULL; 6410 struct super_block *sb = inode->vfs_inode.i_sb; 6411 int ret = 0; 6412 6413 while (true) { 6414 if (!parent || d_really_is_negative(parent) || 6415 sb != parent->d_sb) 6416 break; 6417 6418 inode = BTRFS_I(d_inode(parent)); 6419 if (root != inode->root) 6420 break; 6421 6422 if (inode->generation >= trans->transid && 6423 need_log_inode(trans, inode)) { 6424 ret = btrfs_log_inode(trans, inode, 6425 LOG_INODE_EXISTS, ctx); 6426 if (ret) 6427 break; 6428 } 6429 if (IS_ROOT(parent)) 6430 break; 6431 6432 parent = dget_parent(parent); 6433 dput(old_parent); 6434 old_parent = parent; 6435 } 6436 dput(old_parent); 6437 6438 return ret; 6439 } 6440 6441 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6442 struct btrfs_inode *inode, 6443 struct dentry *parent, 6444 struct btrfs_log_ctx *ctx) 6445 { 6446 struct btrfs_root *root = inode->root; 6447 const u64 ino = btrfs_ino(inode); 6448 struct btrfs_path *path; 6449 struct btrfs_key search_key; 6450 int ret; 6451 6452 /* 6453 * For a single hard link case, go through a fast path that does not 6454 * need to iterate the fs/subvolume tree. 6455 */ 6456 if (inode->vfs_inode.i_nlink < 2) 6457 return log_new_ancestors_fast(trans, inode, parent, ctx); 6458 6459 path = btrfs_alloc_path(); 6460 if (!path) 6461 return -ENOMEM; 6462 6463 search_key.objectid = ino; 6464 search_key.type = BTRFS_INODE_REF_KEY; 6465 search_key.offset = 0; 6466 again: 6467 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6468 if (ret < 0) 6469 goto out; 6470 if (ret == 0) 6471 path->slots[0]++; 6472 6473 while (true) { 6474 struct extent_buffer *leaf = path->nodes[0]; 6475 int slot = path->slots[0]; 6476 struct btrfs_key found_key; 6477 6478 if (slot >= btrfs_header_nritems(leaf)) { 6479 ret = btrfs_next_leaf(root, path); 6480 if (ret < 0) 6481 goto out; 6482 else if (ret > 0) 6483 break; 6484 continue; 6485 } 6486 6487 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6488 if (found_key.objectid != ino || 6489 found_key.type > BTRFS_INODE_EXTREF_KEY) 6490 break; 6491 6492 /* 6493 * Don't deal with extended references because they are rare 6494 * cases and too complex to deal with (we would need to keep 6495 * track of which subitem we are processing for each item in 6496 * this loop, etc). So just return some error to fallback to 6497 * a transaction commit. 6498 */ 6499 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6500 ret = -EMLINK; 6501 goto out; 6502 } 6503 6504 /* 6505 * Logging ancestors needs to do more searches on the fs/subvol 6506 * tree, so it releases the path as needed to avoid deadlocks. 6507 * Keep track of the last inode ref key and resume from that key 6508 * after logging all new ancestors for the current hard link. 6509 */ 6510 memcpy(&search_key, &found_key, sizeof(search_key)); 6511 6512 ret = log_new_ancestors(trans, root, path, ctx); 6513 if (ret) 6514 goto out; 6515 btrfs_release_path(path); 6516 goto again; 6517 } 6518 ret = 0; 6519 out: 6520 btrfs_free_path(path); 6521 return ret; 6522 } 6523 6524 /* 6525 * helper function around btrfs_log_inode to make sure newly created 6526 * parent directories also end up in the log. A minimal inode and backref 6527 * only logging is done of any parent directories that are older than 6528 * the last committed transaction 6529 */ 6530 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6531 struct btrfs_inode *inode, 6532 struct dentry *parent, 6533 int inode_only, 6534 struct btrfs_log_ctx *ctx) 6535 { 6536 struct btrfs_root *root = inode->root; 6537 struct btrfs_fs_info *fs_info = root->fs_info; 6538 int ret = 0; 6539 bool log_dentries = false; 6540 6541 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6542 ret = 1; 6543 goto end_no_trans; 6544 } 6545 6546 if (btrfs_root_refs(&root->root_item) == 0) { 6547 ret = 1; 6548 goto end_no_trans; 6549 } 6550 6551 /* 6552 * Skip already logged inodes or inodes corresponding to tmpfiles 6553 * (since logging them is pointless, a link count of 0 means they 6554 * will never be accessible). 6555 */ 6556 if ((btrfs_inode_in_log(inode, trans->transid) && 6557 list_empty(&ctx->ordered_extents)) || 6558 inode->vfs_inode.i_nlink == 0) { 6559 ret = BTRFS_NO_LOG_SYNC; 6560 goto end_no_trans; 6561 } 6562 6563 ret = start_log_trans(trans, root, ctx); 6564 if (ret) 6565 goto end_no_trans; 6566 6567 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 6568 if (ret) 6569 goto end_trans; 6570 6571 /* 6572 * for regular files, if its inode is already on disk, we don't 6573 * have to worry about the parents at all. This is because 6574 * we can use the last_unlink_trans field to record renames 6575 * and other fun in this file. 6576 */ 6577 if (S_ISREG(inode->vfs_inode.i_mode) && 6578 inode->generation < trans->transid && 6579 inode->last_unlink_trans < trans->transid) { 6580 ret = 0; 6581 goto end_trans; 6582 } 6583 6584 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 6585 log_dentries = true; 6586 6587 /* 6588 * On unlink we must make sure all our current and old parent directory 6589 * inodes are fully logged. This is to prevent leaving dangling 6590 * directory index entries in directories that were our parents but are 6591 * not anymore. Not doing this results in old parent directory being 6592 * impossible to delete after log replay (rmdir will always fail with 6593 * error -ENOTEMPTY). 6594 * 6595 * Example 1: 6596 * 6597 * mkdir testdir 6598 * touch testdir/foo 6599 * ln testdir/foo testdir/bar 6600 * sync 6601 * unlink testdir/bar 6602 * xfs_io -c fsync testdir/foo 6603 * <power failure> 6604 * mount fs, triggers log replay 6605 * 6606 * If we don't log the parent directory (testdir), after log replay the 6607 * directory still has an entry pointing to the file inode using the bar 6608 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6609 * the file inode has a link count of 1. 6610 * 6611 * Example 2: 6612 * 6613 * mkdir testdir 6614 * touch foo 6615 * ln foo testdir/foo2 6616 * ln foo testdir/foo3 6617 * sync 6618 * unlink testdir/foo3 6619 * xfs_io -c fsync foo 6620 * <power failure> 6621 * mount fs, triggers log replay 6622 * 6623 * Similar as the first example, after log replay the parent directory 6624 * testdir still has an entry pointing to the inode file with name foo3 6625 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6626 * and has a link count of 2. 6627 */ 6628 if (inode->last_unlink_trans >= trans->transid) { 6629 ret = btrfs_log_all_parents(trans, inode, ctx); 6630 if (ret) 6631 goto end_trans; 6632 } 6633 6634 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6635 if (ret) 6636 goto end_trans; 6637 6638 if (log_dentries) 6639 ret = log_new_dir_dentries(trans, root, inode, ctx); 6640 else 6641 ret = 0; 6642 end_trans: 6643 if (ret < 0) { 6644 btrfs_set_log_full_commit(trans); 6645 ret = 1; 6646 } 6647 6648 if (ret) 6649 btrfs_remove_log_ctx(root, ctx); 6650 btrfs_end_log_trans(root); 6651 end_no_trans: 6652 return ret; 6653 } 6654 6655 /* 6656 * it is not safe to log dentry if the chunk root has added new 6657 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6658 * If this returns 1, you must commit the transaction to safely get your 6659 * data on disk. 6660 */ 6661 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6662 struct dentry *dentry, 6663 struct btrfs_log_ctx *ctx) 6664 { 6665 struct dentry *parent = dget_parent(dentry); 6666 int ret; 6667 6668 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6669 LOG_INODE_ALL, ctx); 6670 dput(parent); 6671 6672 return ret; 6673 } 6674 6675 /* 6676 * should be called during mount to recover any replay any log trees 6677 * from the FS 6678 */ 6679 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6680 { 6681 int ret; 6682 struct btrfs_path *path; 6683 struct btrfs_trans_handle *trans; 6684 struct btrfs_key key; 6685 struct btrfs_key found_key; 6686 struct btrfs_root *log; 6687 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6688 struct walk_control wc = { 6689 .process_func = process_one_buffer, 6690 .stage = LOG_WALK_PIN_ONLY, 6691 }; 6692 6693 path = btrfs_alloc_path(); 6694 if (!path) 6695 return -ENOMEM; 6696 6697 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6698 6699 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6700 if (IS_ERR(trans)) { 6701 ret = PTR_ERR(trans); 6702 goto error; 6703 } 6704 6705 wc.trans = trans; 6706 wc.pin = 1; 6707 6708 ret = walk_log_tree(trans, log_root_tree, &wc); 6709 if (ret) { 6710 btrfs_abort_transaction(trans, ret); 6711 goto error; 6712 } 6713 6714 again: 6715 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6716 key.offset = (u64)-1; 6717 key.type = BTRFS_ROOT_ITEM_KEY; 6718 6719 while (1) { 6720 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6721 6722 if (ret < 0) { 6723 btrfs_abort_transaction(trans, ret); 6724 goto error; 6725 } 6726 if (ret > 0) { 6727 if (path->slots[0] == 0) 6728 break; 6729 path->slots[0]--; 6730 } 6731 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6732 path->slots[0]); 6733 btrfs_release_path(path); 6734 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6735 break; 6736 6737 log = btrfs_read_tree_root(log_root_tree, &found_key); 6738 if (IS_ERR(log)) { 6739 ret = PTR_ERR(log); 6740 btrfs_abort_transaction(trans, ret); 6741 goto error; 6742 } 6743 6744 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6745 true); 6746 if (IS_ERR(wc.replay_dest)) { 6747 ret = PTR_ERR(wc.replay_dest); 6748 6749 /* 6750 * We didn't find the subvol, likely because it was 6751 * deleted. This is ok, simply skip this log and go to 6752 * the next one. 6753 * 6754 * We need to exclude the root because we can't have 6755 * other log replays overwriting this log as we'll read 6756 * it back in a few more times. This will keep our 6757 * block from being modified, and we'll just bail for 6758 * each subsequent pass. 6759 */ 6760 if (ret == -ENOENT) 6761 ret = btrfs_pin_extent_for_log_replay(trans, 6762 log->node->start, 6763 log->node->len); 6764 btrfs_put_root(log); 6765 6766 if (!ret) 6767 goto next; 6768 btrfs_abort_transaction(trans, ret); 6769 goto error; 6770 } 6771 6772 wc.replay_dest->log_root = log; 6773 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 6774 if (ret) 6775 /* The loop needs to continue due to the root refs */ 6776 btrfs_abort_transaction(trans, ret); 6777 else 6778 ret = walk_log_tree(trans, log, &wc); 6779 6780 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6781 ret = fixup_inode_link_counts(trans, wc.replay_dest, 6782 path); 6783 if (ret) 6784 btrfs_abort_transaction(trans, ret); 6785 } 6786 6787 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6788 struct btrfs_root *root = wc.replay_dest; 6789 6790 btrfs_release_path(path); 6791 6792 /* 6793 * We have just replayed everything, and the highest 6794 * objectid of fs roots probably has changed in case 6795 * some inode_item's got replayed. 6796 * 6797 * root->objectid_mutex is not acquired as log replay 6798 * could only happen during mount. 6799 */ 6800 ret = btrfs_init_root_free_objectid(root); 6801 if (ret) 6802 btrfs_abort_transaction(trans, ret); 6803 } 6804 6805 wc.replay_dest->log_root = NULL; 6806 btrfs_put_root(wc.replay_dest); 6807 btrfs_put_root(log); 6808 6809 if (ret) 6810 goto error; 6811 next: 6812 if (found_key.offset == 0) 6813 break; 6814 key.offset = found_key.offset - 1; 6815 } 6816 btrfs_release_path(path); 6817 6818 /* step one is to pin it all, step two is to replay just inodes */ 6819 if (wc.pin) { 6820 wc.pin = 0; 6821 wc.process_func = replay_one_buffer; 6822 wc.stage = LOG_WALK_REPLAY_INODES; 6823 goto again; 6824 } 6825 /* step three is to replay everything */ 6826 if (wc.stage < LOG_WALK_REPLAY_ALL) { 6827 wc.stage++; 6828 goto again; 6829 } 6830 6831 btrfs_free_path(path); 6832 6833 /* step 4: commit the transaction, which also unpins the blocks */ 6834 ret = btrfs_commit_transaction(trans); 6835 if (ret) 6836 return ret; 6837 6838 log_root_tree->log_root = NULL; 6839 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6840 btrfs_put_root(log_root_tree); 6841 6842 return 0; 6843 error: 6844 if (wc.trans) 6845 btrfs_end_transaction(wc.trans); 6846 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6847 btrfs_free_path(path); 6848 return ret; 6849 } 6850 6851 /* 6852 * there are some corner cases where we want to force a full 6853 * commit instead of allowing a directory to be logged. 6854 * 6855 * They revolve around files there were unlinked from the directory, and 6856 * this function updates the parent directory so that a full commit is 6857 * properly done if it is fsync'd later after the unlinks are done. 6858 * 6859 * Must be called before the unlink operations (updates to the subvolume tree, 6860 * inodes, etc) are done. 6861 */ 6862 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 6863 struct btrfs_inode *dir, struct btrfs_inode *inode, 6864 int for_rename) 6865 { 6866 /* 6867 * when we're logging a file, if it hasn't been renamed 6868 * or unlinked, and its inode is fully committed on disk, 6869 * we don't have to worry about walking up the directory chain 6870 * to log its parents. 6871 * 6872 * So, we use the last_unlink_trans field to put this transid 6873 * into the file. When the file is logged we check it and 6874 * don't log the parents if the file is fully on disk. 6875 */ 6876 mutex_lock(&inode->log_mutex); 6877 inode->last_unlink_trans = trans->transid; 6878 mutex_unlock(&inode->log_mutex); 6879 6880 /* 6881 * if this directory was already logged any new 6882 * names for this file/dir will get recorded 6883 */ 6884 if (dir->logged_trans == trans->transid) 6885 return; 6886 6887 /* 6888 * if the inode we're about to unlink was logged, 6889 * the log will be properly updated for any new names 6890 */ 6891 if (inode->logged_trans == trans->transid) 6892 return; 6893 6894 /* 6895 * when renaming files across directories, if the directory 6896 * there we're unlinking from gets fsync'd later on, there's 6897 * no way to find the destination directory later and fsync it 6898 * properly. So, we have to be conservative and force commits 6899 * so the new name gets discovered. 6900 */ 6901 if (for_rename) 6902 goto record; 6903 6904 /* we can safely do the unlink without any special recording */ 6905 return; 6906 6907 record: 6908 mutex_lock(&dir->log_mutex); 6909 dir->last_unlink_trans = trans->transid; 6910 mutex_unlock(&dir->log_mutex); 6911 } 6912 6913 /* 6914 * Make sure that if someone attempts to fsync the parent directory of a deleted 6915 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6916 * that after replaying the log tree of the parent directory's root we will not 6917 * see the snapshot anymore and at log replay time we will not see any log tree 6918 * corresponding to the deleted snapshot's root, which could lead to replaying 6919 * it after replaying the log tree of the parent directory (which would replay 6920 * the snapshot delete operation). 6921 * 6922 * Must be called before the actual snapshot destroy operation (updates to the 6923 * parent root and tree of tree roots trees, etc) are done. 6924 */ 6925 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6926 struct btrfs_inode *dir) 6927 { 6928 mutex_lock(&dir->log_mutex); 6929 dir->last_unlink_trans = trans->transid; 6930 mutex_unlock(&dir->log_mutex); 6931 } 6932 6933 /** 6934 * Update the log after adding a new name for an inode. 6935 * 6936 * @trans: Transaction handle. 6937 * @old_dentry: The dentry associated with the old name and the old 6938 * parent directory. 6939 * @old_dir: The inode of the previous parent directory for the case 6940 * of a rename. For a link operation, it must be NULL. 6941 * @old_dir_index: The index number associated with the old name, meaningful 6942 * only for rename operations (when @old_dir is not NULL). 6943 * Ignored for link operations. 6944 * @parent: The dentry associated with the directory under which the 6945 * new name is located. 6946 * 6947 * Call this after adding a new name for an inode, as a result of a link or 6948 * rename operation, and it will properly update the log to reflect the new name. 6949 */ 6950 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 6951 struct dentry *old_dentry, struct btrfs_inode *old_dir, 6952 u64 old_dir_index, struct dentry *parent) 6953 { 6954 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 6955 struct btrfs_root *root = inode->root; 6956 struct btrfs_log_ctx ctx; 6957 bool log_pinned = false; 6958 int ret; 6959 6960 /* 6961 * this will force the logging code to walk the dentry chain 6962 * up for the file 6963 */ 6964 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6965 inode->last_unlink_trans = trans->transid; 6966 6967 /* 6968 * if this inode hasn't been logged and directory we're renaming it 6969 * from hasn't been logged, we don't need to log it 6970 */ 6971 ret = inode_logged(trans, inode, NULL); 6972 if (ret < 0) { 6973 goto out; 6974 } else if (ret == 0) { 6975 if (!old_dir) 6976 return; 6977 /* 6978 * If the inode was not logged and we are doing a rename (old_dir is not 6979 * NULL), check if old_dir was logged - if it was not we can return and 6980 * do nothing. 6981 */ 6982 ret = inode_logged(trans, old_dir, NULL); 6983 if (ret < 0) 6984 goto out; 6985 else if (ret == 0) 6986 return; 6987 } 6988 ret = 0; 6989 6990 /* 6991 * If we are doing a rename (old_dir is not NULL) from a directory that 6992 * was previously logged, make sure that on log replay we get the old 6993 * dir entry deleted. This is needed because we will also log the new 6994 * name of the renamed inode, so we need to make sure that after log 6995 * replay we don't end up with both the new and old dir entries existing. 6996 */ 6997 if (old_dir && old_dir->logged_trans == trans->transid) { 6998 struct btrfs_root *log = old_dir->root->log_root; 6999 struct btrfs_path *path; 7000 7001 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7002 7003 /* 7004 * We have two inodes to update in the log, the old directory and 7005 * the inode that got renamed, so we must pin the log to prevent 7006 * anyone from syncing the log until we have updated both inodes 7007 * in the log. 7008 */ 7009 log_pinned = true; 7010 btrfs_pin_log_trans(root); 7011 7012 path = btrfs_alloc_path(); 7013 if (!path) { 7014 ret = -ENOMEM; 7015 goto out; 7016 } 7017 7018 /* 7019 * Other concurrent task might be logging the old directory, 7020 * as it can be triggered when logging other inode that had or 7021 * still has a dentry in the old directory. So take the old 7022 * directory's log_mutex to prevent getting an -EEXIST when 7023 * logging a key to record the deletion, or having that other 7024 * task logging the old directory get an -EEXIST if it attempts 7025 * to log the same key after we just did it. In both cases that 7026 * would result in falling back to a transaction commit. 7027 */ 7028 mutex_lock(&old_dir->log_mutex); 7029 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7030 old_dentry->d_name.name, 7031 old_dentry->d_name.len, old_dir_index); 7032 if (ret > 0) { 7033 /* 7034 * The dentry does not exist in the log, so record its 7035 * deletion. 7036 */ 7037 btrfs_release_path(path); 7038 ret = insert_dir_log_key(trans, log, path, 7039 btrfs_ino(old_dir), 7040 old_dir_index, old_dir_index); 7041 } 7042 mutex_unlock(&old_dir->log_mutex); 7043 7044 btrfs_free_path(path); 7045 if (ret < 0) 7046 goto out; 7047 } 7048 7049 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7050 ctx.logging_new_name = true; 7051 /* 7052 * We don't care about the return value. If we fail to log the new name 7053 * then we know the next attempt to sync the log will fallback to a full 7054 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7055 * we don't need to worry about getting a log committed that has an 7056 * inconsistent state after a rename operation. 7057 */ 7058 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7059 out: 7060 /* 7061 * If an error happened mark the log for a full commit because it's not 7062 * consistent and up to date or we couldn't find out if one of the 7063 * inodes was logged before in this transaction. Do it before unpinning 7064 * the log, to avoid any races with someone else trying to commit it. 7065 */ 7066 if (ret < 0) 7067 btrfs_set_log_full_commit(trans); 7068 if (log_pinned) 7069 btrfs_end_log_trans(root); 7070 } 7071 7072