// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2). After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant. With the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1 was fully removed from the FS, but fsync was never
 * called on f1, only its parent dir. After a crash the rm -rf must
 * be replayed. This must be able to recurse down the entire
 * directory tree. The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking. The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction. Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree. Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make the people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
	 */
	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
		mutex_lock(&tree_root->log_mutex);
		if (!fs_info->log_root_tree) {
			ret = btrfs_init_log_root_tree(trans, fs_info);
			if (!ret) {
				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
				created = true;
			}
		}
		mutex_unlock(&tree_root->log_mutex);
		if (ret)
			return ret;
	}

	mutex_lock(&root->log_mutex);

again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		/*
		 * This means fs_info->log_root_tree was already created
		 * for some other FS trees. Do the full commit not to mix
		 * nodes from multiple log transactions to do sequential
		 * writing.
		 */
		if (zoned && !created) {
			ret = -EAGAIN;
			goto out;
		}

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_writers);
	if (!ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there were no transactions
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	const bool zoned = btrfs_is_zoned(root->fs_info);
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
again:
	if (root->log_root) {
		int index = (root->log_transid + 1) % 2;

		ret = 0;
		if (zoned && atomic_read(&root->log_commit[index])) {
			wait_log_commit(root, root->log_transid - 1);
			goto again;
		}
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

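/*
 * Wait for writeback of a single tree block to complete, by waiting on the
 * page range covered by the extent buffer.
 */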
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
				buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree. The stage field tells us which part
 * of the log tree processing we are currently doing. The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done? This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree. Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin) {
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);
		if (ret)
			return ret;

		if (btrfs_buffer_uptodate(eb, gen, 0) &&
		    btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
	}
	return ret;
}

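/*
 * Copy a single item from the log tree leaf 'eb' at 'slot' into the
 * destination root, at 'key'. The caller must have already searched 'path'
 * in the destination tree to the key's position (see overwrite_item() below
 * for the variant that does the search itself).
 */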
static int do_overwrite_item(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path,
			     struct extent_buffer *eb, int slot,
			     struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* Our caller must have done a search for the key for us. */
	ASSERT(path->nodes[0] != NULL);

	/*
	 * And the slot must point to the exact key or the slot where the key
	 * should be at (the first item with a key greater than 'key')
	 */
	if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
		struct btrfs_key found_key;

		btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
		ret = btrfs_comp_cpu_keys(&found_key, key);
		ASSERT(ret >= 0);
	} else {
		ret = 1;
	}

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size(path->nodes[0],
					       path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size(path->nodes[0],
					     path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero. This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * Item overwrite used by replay and tree logging. eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten. If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static int overwrite_item(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  struct btrfs_path *path,
			  struct extent_buffer *eb, int slot,
			  struct btrfs_key *key)
{
	int ret;

	/* Look for the key in the destination tree. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	return do_overwrite_item(trans, root, path, eb, slot, key);
}

/*
 * simple helper to read an inode off the disk from a given root
 * This can only be called for subvolume roots and not for the log
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'. path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet. So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_drop_extents_args drop_args = { 0 };
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inodes nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file. This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
				       btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	drop_args.start = start;
	drop_args.end = extent_end;
	drop_args.drop_cache = true;
	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				   (unsigned long)item, sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skip normal backref update,
		 * but modifying extent tree all by ourselves.
		 * So need to manually record dirty extent for qgroup,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree? If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						       ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset, 0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 * extent data disk byte 12845056 nr 102400
			 * extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 * extent data disk byte 12845056 nr 102400
			 * extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				struct btrfs_root *csum_root;

				sums = list_entry(ordered_sums.next,
						  struct btrfs_ordered_sum,
						  list);
				csum_root = btrfs_csum_root(fs_info,
							    sums->bytenr);
				if (!ret)
					ret = btrfs_del_csums(trans, csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
								     csum_root,
								     sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

update_inode:
	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
	if (inode)
		iput(inode);
	return ret;
}

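/*
 * Unlink a name during log replay and then run the delayed items, so that
 * later name lookups in the fs/subvolume tree see the removal.
 */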
static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
				       struct btrfs_inode *dir,
				       struct btrfs_inode *inode,
				       const char *name,
				       int name_len)
{
	int ret;

	ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
	if (ret)
		return ret;
	/*
	 * Whenever we need to check if a name exists or not, we check the
	 * fs/subvolume tree. So after an unlink we must run delayed items, so
	 * that future checks for a name during log replay see that the name
	 * does not exist anymore.
	 */
	return btrfs_run_delayed_items(trans);
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct btrfs_root *root = dir->root;
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
					  name_len);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode. This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

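/*
 * Resolve conflicts before adding an inode reference during log replay:
 * unlink any name for this inode/parent pair found in the subvolume tree
 * but not in the log, and drop directory entries that conflict with the new
 * name's index or name. Returns 1 if the ref is for the root directory
 * itself (nothing to do), 0 on success and a negative errno on failure.
 */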
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log. if so, we allow them to stay
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = unlink_inode_for_log_replay(trans, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, it does not need to be checked again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (!IS_ERR_OR_NULL(extref)) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
							       parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = unlink_inode_for_log_replay(trans,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

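/*
 * Helpers to copy the name (and optionally the index and parent objectid)
 * out of an inode extref or inode ref item at 'ref_ptr'. The name buffer is
 * allocated with kmalloc() and must be freed by the caller.
 */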
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
out:
	btrfs_release_path(path);
	return ret;
}

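/*
 * Check whether the subvolume tree already has an inode ref (or extref,
 * depending on 'ref_type') for the given name connecting 'inode' to 'dir'.
 * Returns 1 if the reference exists, 0 if not and a negative errno on error.
 */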
static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

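/*
 * Add a new name for an inode in a directory during log replay. If the name
 * already exists and points to a different inode (one that sorts after the
 * current one and was therefore not replayed yet), unlink that other inode
 * first and then add the link.
 */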
static int add_link(struct btrfs_trans_handle *trans,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
					  name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function. (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode. If we don't find the dir, just don't
	 * copy the back ref in. The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link. Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = unlink_inode_for_log_replay(trans,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
			if (ret)
				goto out;
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

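/*
 * Count how many names (hard links) the subvolume tree records for an inode
 * in its extended reference items. Returns the count or a negative errno.
 */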
static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

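/*
 * Count how many names the subvolume tree records for an inode in its
 * regular (old style) inode reference items.
 */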
static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size(path->nodes[0],
						path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay. So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found. If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
		if (ret)
			goto out;
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = btrfs_insert_orphan_item(trans, root, ino);
		if (ret == -EEXIST)
			ret = 0;
	}

out:
	btrfs_free_path(path);
	return ret;
}

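/*
 * Walk all fixup entries (orphan items under BTRFS_TREE_LOG_FIXUP_OBJECTID)
 * created while replaying the log and correct the link count of each
 * referenced inode.
 */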
The link count is incremented here 1875 * so the inode won't go away until we check it 1876 */ 1877 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1878 struct btrfs_root *root, 1879 struct btrfs_path *path, 1880 u64 objectid) 1881 { 1882 struct btrfs_key key; 1883 int ret = 0; 1884 struct inode *inode; 1885 1886 inode = read_one_inode(root, objectid); 1887 if (!inode) 1888 return -EIO; 1889 1890 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1891 key.type = BTRFS_ORPHAN_ITEM_KEY; 1892 key.offset = objectid; 1893 1894 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1895 1896 btrfs_release_path(path); 1897 if (ret == 0) { 1898 if (!inode->i_nlink) 1899 set_nlink(inode, 1); 1900 else 1901 inc_nlink(inode); 1902 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1903 } else if (ret == -EEXIST) { 1904 ret = 0; 1905 } 1906 iput(inode); 1907 1908 return ret; 1909 } 1910 1911 /* 1912 * when replaying the log for a directory, we only insert names 1913 * for inodes that actually exist. This means an fsync on a directory 1914 * does not implicitly fsync all the new files in it 1915 */ 1916 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1917 struct btrfs_root *root, 1918 u64 dirid, u64 index, 1919 char *name, int name_len, 1920 struct btrfs_key *location) 1921 { 1922 struct inode *inode; 1923 struct inode *dir; 1924 int ret; 1925 1926 inode = read_one_inode(root, location->objectid); 1927 if (!inode) 1928 return -ENOENT; 1929 1930 dir = read_one_inode(root, dirid); 1931 if (!dir) { 1932 iput(inode); 1933 return -EIO; 1934 } 1935 1936 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1937 name_len, 1, index); 1938 1939 /* FIXME, put inode into FIXUP list */ 1940 1941 iput(inode); 1942 iput(dir); 1943 return ret; 1944 } 1945 1946 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, 1947 struct btrfs_inode *dir, 1948 struct btrfs_path *path, 1949 struct btrfs_dir_item *dst_di, 1950 const struct btrfs_key *log_key, 1951 u8 log_type, 1952 bool exists) 1953 { 1954 struct btrfs_key found_key; 1955 1956 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1957 /* The existing dentry points to the same inode, don't delete it. */ 1958 if (found_key.objectid == log_key->objectid && 1959 found_key.type == log_key->type && 1960 found_key.offset == log_key->offset && 1961 btrfs_dir_type(path->nodes[0], dst_di) == log_type) 1962 return 1; 1963 1964 /* 1965 * Don't drop the conflicting directory entry if the inode for the new 1966 * entry doesn't exist. 1967 */ 1968 if (!exists) 1969 return 0; 1970 1971 return drop_one_dir_item(trans, path, dir, dst_di); 1972 } 1973 1974 /* 1975 * take a single entry in a log directory item and replay it into 1976 * the subvolume. 1977 * 1978 * if a conflicting item exists in the subdirectory already, 1979 * the inode it points to is unlinked and put into the link count 1980 * fix up tree. 1981 * 1982 * If a name from the log points to a file or directory that does 1983 * not exist in the FS, it is skipped. fsyncs on directories 1984 * do not force down inodes inside that directory, just changes to the 1985 * names or unlinks in a directory. 1986 * 1987 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1988 * non-existing inode) and 1 if the name was replayed. 
1989 */ 1990 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1991 struct btrfs_root *root, 1992 struct btrfs_path *path, 1993 struct extent_buffer *eb, 1994 struct btrfs_dir_item *di, 1995 struct btrfs_key *key) 1996 { 1997 char *name; 1998 int name_len; 1999 struct btrfs_dir_item *dir_dst_di; 2000 struct btrfs_dir_item *index_dst_di; 2001 bool dir_dst_matches = false; 2002 bool index_dst_matches = false; 2003 struct btrfs_key log_key; 2004 struct btrfs_key search_key; 2005 struct inode *dir; 2006 u8 log_type; 2007 bool exists; 2008 int ret; 2009 bool update_size = true; 2010 bool name_added = false; 2011 2012 dir = read_one_inode(root, key->objectid); 2013 if (!dir) 2014 return -EIO; 2015 2016 name_len = btrfs_dir_name_len(eb, di); 2017 name = kmalloc(name_len, GFP_NOFS); 2018 if (!name) { 2019 ret = -ENOMEM; 2020 goto out; 2021 } 2022 2023 log_type = btrfs_dir_type(eb, di); 2024 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2025 name_len); 2026 2027 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 2028 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 2029 btrfs_release_path(path); 2030 if (ret < 0) 2031 goto out; 2032 exists = (ret == 0); 2033 ret = 0; 2034 2035 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 2036 name, name_len, 1); 2037 if (IS_ERR(dir_dst_di)) { 2038 ret = PTR_ERR(dir_dst_di); 2039 goto out; 2040 } else if (dir_dst_di) { 2041 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2042 dir_dst_di, &log_key, log_type, 2043 exists); 2044 if (ret < 0) 2045 goto out; 2046 dir_dst_matches = (ret == 1); 2047 } 2048 2049 btrfs_release_path(path); 2050 2051 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 2052 key->objectid, key->offset, 2053 name, name_len, 1); 2054 if (IS_ERR(index_dst_di)) { 2055 ret = PTR_ERR(index_dst_di); 2056 goto out; 2057 } else if (index_dst_di) { 2058 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2059 index_dst_di, &log_key, 2060 log_type, exists); 2061 if (ret < 0) 2062 goto out; 2063 index_dst_matches = (ret == 1); 2064 } 2065 2066 btrfs_release_path(path); 2067 2068 if (dir_dst_matches && index_dst_matches) { 2069 ret = 0; 2070 update_size = false; 2071 goto out; 2072 } 2073 2074 /* 2075 * Check if the inode reference exists in the log for the given name, 2076 * inode and parent inode 2077 */ 2078 search_key.objectid = log_key.objectid; 2079 search_key.type = BTRFS_INODE_REF_KEY; 2080 search_key.offset = key->objectid; 2081 ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); 2082 if (ret < 0) { 2083 goto out; 2084 } else if (ret) { 2085 /* The dentry will be added later. */ 2086 ret = 0; 2087 update_size = false; 2088 goto out; 2089 } 2090 2091 search_key.objectid = log_key.objectid; 2092 search_key.type = BTRFS_INODE_EXTREF_KEY; 2093 search_key.offset = key->objectid; 2094 ret = backref_in_log(root->log_root, &search_key, key->objectid, name, 2095 name_len); 2096 if (ret < 0) { 2097 goto out; 2098 } else if (ret) { 2099 /* The dentry will be added later. 
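It is expected to be added when the matching inode ref/extref item from the log is replayed (see add_inode_ref(), which also creates the directory entry).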
*/ 2100 ret = 0; 2101 update_size = false; 2102 goto out; 2103 } 2104 btrfs_release_path(path); 2105 ret = insert_one_name(trans, root, key->objectid, key->offset, 2106 name, name_len, &log_key); 2107 if (ret && ret != -ENOENT && ret != -EEXIST) 2108 goto out; 2109 if (!ret) 2110 name_added = true; 2111 update_size = false; 2112 ret = 0; 2113 2114 out: 2115 if (!ret && update_size) { 2116 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 2117 ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); 2118 } 2119 kfree(name); 2120 iput(dir); 2121 if (!ret && name_added) 2122 ret = 1; 2123 return ret; 2124 } 2125 2126 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ 2127 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 2128 struct btrfs_root *root, 2129 struct btrfs_path *path, 2130 struct extent_buffer *eb, int slot, 2131 struct btrfs_key *key) 2132 { 2133 int ret; 2134 struct btrfs_dir_item *di; 2135 2136 /* We only log dir index keys, which only contain a single dir item. */ 2137 ASSERT(key->type == BTRFS_DIR_INDEX_KEY); 2138 2139 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2140 ret = replay_one_name(trans, root, path, eb, di, key); 2141 if (ret < 0) 2142 return ret; 2143 2144 /* 2145 * If this entry refers to a non-directory (directories can not have a 2146 * link count > 1) and it was added in the transaction that was not 2147 * committed, make sure we fixup the link count of the inode the entry 2148 * points to. Otherwise something like the following would result in a 2149 * directory pointing to an inode with a wrong link that does not account 2150 * for this dir entry: 2151 * 2152 * mkdir testdir 2153 * touch testdir/foo 2154 * touch testdir/bar 2155 * sync 2156 * 2157 * ln testdir/bar testdir/bar_link 2158 * ln testdir/foo testdir/foo_link 2159 * xfs_io -c "fsync" testdir/bar 2160 * 2161 * <power failure> 2162 * 2163 * mount fs, log replay happens 2164 * 2165 * File foo would remain with a link count of 1 when it has two entries 2166 * pointing to it in the directory testdir. This would make it impossible 2167 * to ever delete the parent directory has it would result in stale 2168 * dentries that can never be deleted. 2169 */ 2170 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2171 struct btrfs_path *fixup_path; 2172 struct btrfs_key di_key; 2173 2174 fixup_path = btrfs_alloc_path(); 2175 if (!fixup_path) 2176 return -ENOMEM; 2177 2178 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2179 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); 2180 btrfs_free_path(fixup_path); 2181 } 2182 2183 return ret; 2184 } 2185 2186 /* 2187 * directory replay has two parts. There are the standard directory 2188 * items in the log copied from the subvolume, and range items 2189 * created in the log while the subvolume was logged. 2190 * 2191 * The range items tell us which parts of the key space the log 2192 * is authoritative for. During replay, if a key in the subvolume 2193 * directory is in a logged range item, but not actually in the log 2194 * that means it was deleted from the directory before the fsync 2195 * and should be removed. 
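 *
 * For example (hypothetical numbers): if the log has a range item
 * (BTRFS_DIR_LOG_INDEX_KEY) covering index range [3, 10] for this
 * directory, and the subvolume directory still contains an entry at
 * index 7 that is not in the log, that entry was deleted before the
 * fsync and must be removed during replay.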
2196 */ 2197 static noinline int find_dir_range(struct btrfs_root *root, 2198 struct btrfs_path *path, 2199 u64 dirid, 2200 u64 *start_ret, u64 *end_ret) 2201 { 2202 struct btrfs_key key; 2203 u64 found_end; 2204 struct btrfs_dir_log_item *item; 2205 int ret; 2206 int nritems; 2207 2208 if (*start_ret == (u64)-1) 2209 return 1; 2210 2211 key.objectid = dirid; 2212 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2213 key.offset = *start_ret; 2214 2215 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2216 if (ret < 0) 2217 goto out; 2218 if (ret > 0) { 2219 if (path->slots[0] == 0) 2220 goto out; 2221 path->slots[0]--; 2222 } 2223 if (ret != 0) 2224 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2225 2226 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2227 ret = 1; 2228 goto next; 2229 } 2230 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2231 struct btrfs_dir_log_item); 2232 found_end = btrfs_dir_log_end(path->nodes[0], item); 2233 2234 if (*start_ret >= key.offset && *start_ret <= found_end) { 2235 ret = 0; 2236 *start_ret = key.offset; 2237 *end_ret = found_end; 2238 goto out; 2239 } 2240 ret = 1; 2241 next: 2242 /* check the next slot in the tree to see if it is a valid item */ 2243 nritems = btrfs_header_nritems(path->nodes[0]); 2244 path->slots[0]++; 2245 if (path->slots[0] >= nritems) { 2246 ret = btrfs_next_leaf(root, path); 2247 if (ret) 2248 goto out; 2249 } 2250 2251 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2252 2253 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2254 ret = 1; 2255 goto out; 2256 } 2257 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2258 struct btrfs_dir_log_item); 2259 found_end = btrfs_dir_log_end(path->nodes[0], item); 2260 *start_ret = key.offset; 2261 *end_ret = found_end; 2262 ret = 0; 2263 out: 2264 btrfs_release_path(path); 2265 return ret; 2266 } 2267 2268 /* 2269 * this looks for a given directory item in the log. If the directory 2270 * item is not in the log, the item is removed and the inode it points 2271 * to is unlinked 2272 */ 2273 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2274 struct btrfs_root *log, 2275 struct btrfs_path *path, 2276 struct btrfs_path *log_path, 2277 struct inode *dir, 2278 struct btrfs_key *dir_key) 2279 { 2280 struct btrfs_root *root = BTRFS_I(dir)->root; 2281 int ret; 2282 struct extent_buffer *eb; 2283 int slot; 2284 struct btrfs_dir_item *di; 2285 int name_len; 2286 char *name; 2287 struct inode *inode = NULL; 2288 struct btrfs_key location; 2289 2290 /* 2291 * Currenly we only log dir index keys. Even if we replay a log created 2292 * by an older kernel that logged both dir index and dir item keys, all 2293 * we need to do is process the dir index keys, we (and our caller) can 2294 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
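 * Every directory entry has both a dir item and a dir index key in the
 * subvolume tree, so walking only the index keys is still enough to
 * visit every name in the directory.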
2295 */ 2296 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2297 2298 eb = path->nodes[0]; 2299 slot = path->slots[0]; 2300 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2301 name_len = btrfs_dir_name_len(eb, di); 2302 name = kmalloc(name_len, GFP_NOFS); 2303 if (!name) { 2304 ret = -ENOMEM; 2305 goto out; 2306 } 2307 2308 read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); 2309 2310 if (log) { 2311 struct btrfs_dir_item *log_di; 2312 2313 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2314 dir_key->objectid, 2315 dir_key->offset, 2316 name, name_len, 0); 2317 if (IS_ERR(log_di)) { 2318 ret = PTR_ERR(log_di); 2319 goto out; 2320 } else if (log_di) { 2321 /* The dentry exists in the log, we have nothing to do. */ 2322 ret = 0; 2323 goto out; 2324 } 2325 } 2326 2327 btrfs_dir_item_key_to_cpu(eb, di, &location); 2328 btrfs_release_path(path); 2329 btrfs_release_path(log_path); 2330 inode = read_one_inode(root, location.objectid); 2331 if (!inode) { 2332 ret = -EIO; 2333 goto out; 2334 } 2335 2336 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2337 if (ret) 2338 goto out; 2339 2340 inc_nlink(inode); 2341 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2342 name, name_len); 2343 /* 2344 * Unlike dir item keys, dir index keys can only have one name (entry) in 2345 * them, as there are no key collisions since each key has a unique offset 2346 * (an index number), so we're done. 2347 */ 2348 out: 2349 btrfs_release_path(path); 2350 btrfs_release_path(log_path); 2351 kfree(name); 2352 iput(inode); 2353 return ret; 2354 } 2355 2356 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2357 struct btrfs_root *root, 2358 struct btrfs_root *log, 2359 struct btrfs_path *path, 2360 const u64 ino) 2361 { 2362 struct btrfs_key search_key; 2363 struct btrfs_path *log_path; 2364 int i; 2365 int nritems; 2366 int ret; 2367 2368 log_path = btrfs_alloc_path(); 2369 if (!log_path) 2370 return -ENOMEM; 2371 2372 search_key.objectid = ino; 2373 search_key.type = BTRFS_XATTR_ITEM_KEY; 2374 search_key.offset = 0; 2375 again: 2376 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2377 if (ret < 0) 2378 goto out; 2379 process_leaf: 2380 nritems = btrfs_header_nritems(path->nodes[0]); 2381 for (i = path->slots[0]; i < nritems; i++) { 2382 struct btrfs_key key; 2383 struct btrfs_dir_item *di; 2384 struct btrfs_dir_item *log_di; 2385 u32 total_size; 2386 u32 cur; 2387 2388 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2389 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2390 ret = 0; 2391 goto out; 2392 } 2393 2394 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2395 total_size = btrfs_item_size(path->nodes[0], i); 2396 cur = 0; 2397 while (cur < total_size) { 2398 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2399 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2400 u32 this_len = sizeof(*di) + name_len + data_len; 2401 char *name; 2402 2403 name = kmalloc(name_len, GFP_NOFS); 2404 if (!name) { 2405 ret = -ENOMEM; 2406 goto out; 2407 } 2408 read_extent_buffer(path->nodes[0], name, 2409 (unsigned long)(di + 1), name_len); 2410 2411 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2412 name, name_len, 0); 2413 btrfs_release_path(log_path); 2414 if (!log_di) { 2415 /* Doesn't exist in log tree, so delete it. 
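The xattr was removed after the inode was last logged, so delete it from the subvolume tree to match the state recorded in the log.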
*/ 2416 btrfs_release_path(path); 2417 di = btrfs_lookup_xattr(trans, root, path, ino, 2418 name, name_len, -1); 2419 kfree(name); 2420 if (IS_ERR(di)) { 2421 ret = PTR_ERR(di); 2422 goto out; 2423 } 2424 ASSERT(di); 2425 ret = btrfs_delete_one_dir_name(trans, root, 2426 path, di); 2427 if (ret) 2428 goto out; 2429 btrfs_release_path(path); 2430 search_key = key; 2431 goto again; 2432 } 2433 kfree(name); 2434 if (IS_ERR(log_di)) { 2435 ret = PTR_ERR(log_di); 2436 goto out; 2437 } 2438 cur += this_len; 2439 di = (struct btrfs_dir_item *)((char *)di + this_len); 2440 } 2441 } 2442 ret = btrfs_next_leaf(root, path); 2443 if (ret > 0) 2444 ret = 0; 2445 else if (ret == 0) 2446 goto process_leaf; 2447 out: 2448 btrfs_free_path(log_path); 2449 btrfs_release_path(path); 2450 return ret; 2451 } 2452 2453 2454 /* 2455 * deletion replay happens before we copy any new directory items 2456 * out of the log or out of backreferences from inodes. It 2457 * scans the log to find ranges of keys that log is authoritative for, 2458 * and then scans the directory to find items in those ranges that are 2459 * not present in the log. 2460 * 2461 * Anything we don't find in the log is unlinked and removed from the 2462 * directory. 2463 */ 2464 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2465 struct btrfs_root *root, 2466 struct btrfs_root *log, 2467 struct btrfs_path *path, 2468 u64 dirid, int del_all) 2469 { 2470 u64 range_start; 2471 u64 range_end; 2472 int ret = 0; 2473 struct btrfs_key dir_key; 2474 struct btrfs_key found_key; 2475 struct btrfs_path *log_path; 2476 struct inode *dir; 2477 2478 dir_key.objectid = dirid; 2479 dir_key.type = BTRFS_DIR_INDEX_KEY; 2480 log_path = btrfs_alloc_path(); 2481 if (!log_path) 2482 return -ENOMEM; 2483 2484 dir = read_one_inode(root, dirid); 2485 /* it isn't an error if the inode isn't there, that can happen 2486 * because we replay the deletes before we copy in the inode item 2487 * from the log 2488 */ 2489 if (!dir) { 2490 btrfs_free_path(log_path); 2491 return 0; 2492 } 2493 2494 range_start = 0; 2495 range_end = 0; 2496 while (1) { 2497 if (del_all) 2498 range_end = (u64)-1; 2499 else { 2500 ret = find_dir_range(log, path, dirid, 2501 &range_start, &range_end); 2502 if (ret < 0) 2503 goto out; 2504 else if (ret > 0) 2505 break; 2506 } 2507 2508 dir_key.offset = range_start; 2509 while (1) { 2510 int nritems; 2511 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2512 0, 0); 2513 if (ret < 0) 2514 goto out; 2515 2516 nritems = btrfs_header_nritems(path->nodes[0]); 2517 if (path->slots[0] >= nritems) { 2518 ret = btrfs_next_leaf(root, path); 2519 if (ret == 1) 2520 break; 2521 else if (ret < 0) 2522 goto out; 2523 } 2524 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2525 path->slots[0]); 2526 if (found_key.objectid != dirid || 2527 found_key.type != dir_key.type) { 2528 ret = 0; 2529 goto out; 2530 } 2531 2532 if (found_key.offset > range_end) 2533 break; 2534 2535 ret = check_item_in_log(trans, log, path, 2536 log_path, dir, 2537 &found_key); 2538 if (ret) 2539 goto out; 2540 if (found_key.offset == (u64)-1) 2541 break; 2542 dir_key.offset = found_key.offset + 1; 2543 } 2544 btrfs_release_path(path); 2545 if (range_end == (u64)-1) 2546 break; 2547 range_start = range_end + 1; 2548 } 2549 ret = 0; 2550 out: 2551 btrfs_release_path(path); 2552 btrfs_free_path(log_path); 2553 iput(dir); 2554 return ret; 2555 } 2556 2557 /* 2558 * the process_func used to replay items from the log tree. This 2559 * gets called in two different stages. 
The first stage just looks 2560 * for inodes and makes sure they are all copied into the subvolume. 2561 * 2562 * The second stage copies all the other item types from the log into 2563 * the subvolume. The two stage approach is slower, but gets rid of 2564 * lots of complexity around inodes referencing other inodes that exist 2565 * only in the log (references come from either directory items or inode 2566 * back refs). 2567 */ 2568 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2569 struct walk_control *wc, u64 gen, int level) 2570 { 2571 int nritems; 2572 struct btrfs_path *path; 2573 struct btrfs_root *root = wc->replay_dest; 2574 struct btrfs_key key; 2575 int i; 2576 int ret; 2577 2578 ret = btrfs_read_buffer(eb, gen, level, NULL); 2579 if (ret) 2580 return ret; 2581 2582 level = btrfs_header_level(eb); 2583 2584 if (level != 0) 2585 return 0; 2586 2587 path = btrfs_alloc_path(); 2588 if (!path) 2589 return -ENOMEM; 2590 2591 nritems = btrfs_header_nritems(eb); 2592 for (i = 0; i < nritems; i++) { 2593 btrfs_item_key_to_cpu(eb, &key, i); 2594 2595 /* inode keys are done during the first stage */ 2596 if (key.type == BTRFS_INODE_ITEM_KEY && 2597 wc->stage == LOG_WALK_REPLAY_INODES) { 2598 struct btrfs_inode_item *inode_item; 2599 u32 mode; 2600 2601 inode_item = btrfs_item_ptr(eb, i, 2602 struct btrfs_inode_item); 2603 /* 2604 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2605 * and never got linked before the fsync, skip it, as 2606 * replaying it is pointless since it would be deleted 2607 * later. We skip logging tmpfiles, but it's always 2608 * possible we are replaying a log created with a kernel 2609 * that used to log tmpfiles. 2610 */ 2611 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2612 wc->ignore_cur_inode = true; 2613 continue; 2614 } else { 2615 wc->ignore_cur_inode = false; 2616 } 2617 ret = replay_xattr_deletes(wc->trans, root, log, 2618 path, key.objectid); 2619 if (ret) 2620 break; 2621 mode = btrfs_inode_mode(eb, inode_item); 2622 if (S_ISDIR(mode)) { 2623 ret = replay_dir_deletes(wc->trans, 2624 root, log, path, key.objectid, 0); 2625 if (ret) 2626 break; 2627 } 2628 ret = overwrite_item(wc->trans, root, path, 2629 eb, i, &key); 2630 if (ret) 2631 break; 2632 2633 /* 2634 * Before replaying extents, truncate the inode to its 2635 * size. We need to do it now and not after log replay 2636 * because before an fsync we can have prealloc extents 2637 * added beyond the inode's i_size. If we did it after, 2638 * through orphan cleanup for example, we would drop 2639 * those prealloc extents just after replaying them. 2640 */ 2641 if (S_ISREG(mode)) { 2642 struct btrfs_drop_extents_args drop_args = { 0 }; 2643 struct inode *inode; 2644 u64 from; 2645 2646 inode = read_one_inode(root, key.objectid); 2647 if (!inode) { 2648 ret = -EIO; 2649 break; 2650 } 2651 from = ALIGN(i_size_read(inode), 2652 root->fs_info->sectorsize); 2653 drop_args.start = from; 2654 drop_args.end = (u64)-1; 2655 drop_args.drop_cache = true; 2656 ret = btrfs_drop_extents(wc->trans, root, 2657 BTRFS_I(inode), 2658 &drop_args); 2659 if (!ret) { 2660 inode_sub_bytes(inode, 2661 drop_args.bytes_found); 2662 /* Update the inode's nbytes. 
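btrfs_drop_extents() reported how many bytes the dropped extents covered in drop_args.bytes_found, which was subtracted just above; persist the new value in the inode item.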
*/ 2663 ret = btrfs_update_inode(wc->trans, 2664 root, BTRFS_I(inode)); 2665 } 2666 iput(inode); 2667 if (ret) 2668 break; 2669 } 2670 2671 ret = link_to_fixup_dir(wc->trans, root, 2672 path, key.objectid); 2673 if (ret) 2674 break; 2675 } 2676 2677 if (wc->ignore_cur_inode) 2678 continue; 2679 2680 if (key.type == BTRFS_DIR_INDEX_KEY && 2681 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2682 ret = replay_one_dir_item(wc->trans, root, path, 2683 eb, i, &key); 2684 if (ret) 2685 break; 2686 } 2687 2688 if (wc->stage < LOG_WALK_REPLAY_ALL) 2689 continue; 2690 2691 /* these keys are simply copied */ 2692 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2693 ret = overwrite_item(wc->trans, root, path, 2694 eb, i, &key); 2695 if (ret) 2696 break; 2697 } else if (key.type == BTRFS_INODE_REF_KEY || 2698 key.type == BTRFS_INODE_EXTREF_KEY) { 2699 ret = add_inode_ref(wc->trans, root, log, path, 2700 eb, i, &key); 2701 if (ret && ret != -ENOENT) 2702 break; 2703 ret = 0; 2704 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2705 ret = replay_one_extent(wc->trans, root, path, 2706 eb, i, &key); 2707 if (ret) 2708 break; 2709 } 2710 /* 2711 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2712 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2713 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2714 * older kernel with such keys, ignore them. 2715 */ 2716 } 2717 btrfs_free_path(path); 2718 return ret; 2719 } 2720 2721 /* 2722 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2723 */ 2724 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2725 { 2726 struct btrfs_block_group *cache; 2727 2728 cache = btrfs_lookup_block_group(fs_info, start); 2729 if (!cache) { 2730 btrfs_err(fs_info, "unable to find block group for %llu", start); 2731 return; 2732 } 2733 2734 spin_lock(&cache->space_info->lock); 2735 spin_lock(&cache->lock); 2736 cache->reserved -= fs_info->nodesize; 2737 cache->space_info->bytes_reserved -= fs_info->nodesize; 2738 spin_unlock(&cache->lock); 2739 spin_unlock(&cache->space_info->lock); 2740 2741 btrfs_put_block_group(cache); 2742 } 2743 2744 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2745 struct btrfs_root *root, 2746 struct btrfs_path *path, int *level, 2747 struct walk_control *wc) 2748 { 2749 struct btrfs_fs_info *fs_info = root->fs_info; 2750 u64 bytenr; 2751 u64 ptr_gen; 2752 struct extent_buffer *next; 2753 struct extent_buffer *cur; 2754 u32 blocksize; 2755 int ret = 0; 2756 2757 while (*level > 0) { 2758 struct btrfs_key first_key; 2759 2760 cur = path->nodes[*level]; 2761 2762 WARN_ON(btrfs_header_level(cur) != *level); 2763 2764 if (path->slots[*level] >= 2765 btrfs_header_nritems(cur)) 2766 break; 2767 2768 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2769 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2770 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2771 blocksize = fs_info->nodesize; 2772 2773 next = btrfs_find_create_tree_block(fs_info, bytenr, 2774 btrfs_header_owner(cur), 2775 *level - 1); 2776 if (IS_ERR(next)) 2777 return PTR_ERR(next); 2778 2779 if (*level == 1) { 2780 ret = wc->process_func(root, next, wc, ptr_gen, 2781 *level - 1); 2782 if (ret) { 2783 free_extent_buffer(next); 2784 return ret; 2785 } 2786 2787 path->slots[*level]++; 2788 if (wc->free) { 2789 ret = btrfs_read_buffer(next, ptr_gen, 2790 *level - 1, &first_key); 2791 if (ret) { 2792 free_extent_buffer(next); 2793 return ret; 2794 } 2795 2796 if (trans) { 2797 
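/*
 * With a transaction handle (log tree being freed at transaction
 * commit time) the block is cleaned and its extent pinned so the
 * space is not reused until the transaction commits; in the
 * no-transaction case below we only clear dirty state and drop the
 * block's space reservation.
 */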
btrfs_tree_lock(next); 2798 btrfs_clean_tree_block(next); 2799 btrfs_wait_tree_block_writeback(next); 2800 btrfs_tree_unlock(next); 2801 ret = btrfs_pin_reserved_extent(trans, 2802 bytenr, blocksize); 2803 if (ret) { 2804 free_extent_buffer(next); 2805 return ret; 2806 } 2807 btrfs_redirty_list_add( 2808 trans->transaction, next); 2809 } else { 2810 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2811 clear_extent_buffer_dirty(next); 2812 unaccount_log_buffer(fs_info, bytenr); 2813 } 2814 } 2815 free_extent_buffer(next); 2816 continue; 2817 } 2818 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); 2819 if (ret) { 2820 free_extent_buffer(next); 2821 return ret; 2822 } 2823 2824 if (path->nodes[*level-1]) 2825 free_extent_buffer(path->nodes[*level-1]); 2826 path->nodes[*level-1] = next; 2827 *level = btrfs_header_level(next); 2828 path->slots[*level] = 0; 2829 cond_resched(); 2830 } 2831 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2832 2833 cond_resched(); 2834 return 0; 2835 } 2836 2837 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2838 struct btrfs_root *root, 2839 struct btrfs_path *path, int *level, 2840 struct walk_control *wc) 2841 { 2842 struct btrfs_fs_info *fs_info = root->fs_info; 2843 int i; 2844 int slot; 2845 int ret; 2846 2847 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2848 slot = path->slots[i]; 2849 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2850 path->slots[i]++; 2851 *level = i; 2852 WARN_ON(*level == 0); 2853 return 0; 2854 } else { 2855 ret = wc->process_func(root, path->nodes[*level], wc, 2856 btrfs_header_generation(path->nodes[*level]), 2857 *level); 2858 if (ret) 2859 return ret; 2860 2861 if (wc->free) { 2862 struct extent_buffer *next; 2863 2864 next = path->nodes[*level]; 2865 2866 if (trans) { 2867 btrfs_tree_lock(next); 2868 btrfs_clean_tree_block(next); 2869 btrfs_wait_tree_block_writeback(next); 2870 btrfs_tree_unlock(next); 2871 ret = btrfs_pin_reserved_extent(trans, 2872 path->nodes[*level]->start, 2873 path->nodes[*level]->len); 2874 if (ret) 2875 return ret; 2876 btrfs_redirty_list_add(trans->transaction, 2877 next); 2878 } else { 2879 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2880 clear_extent_buffer_dirty(next); 2881 2882 unaccount_log_buffer(fs_info, 2883 path->nodes[*level]->start); 2884 } 2885 } 2886 free_extent_buffer(path->nodes[*level]); 2887 path->nodes[*level] = NULL; 2888 *level = i + 1; 2889 } 2890 } 2891 return 1; 2892 } 2893 2894 /* 2895 * drop the reference count on the tree rooted at 'snap'. This traverses 2896 * the tree freeing any blocks that have a ref count of zero after being 2897 * decremented. 
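 *
 * The walk is iterative: walk_down_log_tree() descends towards the
 * leaves, handing each block to wc->process_func, walk_up_log_tree()
 * climbs back up through the parents, and the root block itself is
 * handled last by walk_log_tree() once both helpers are done.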
2898 */ 2899 static int walk_log_tree(struct btrfs_trans_handle *trans, 2900 struct btrfs_root *log, struct walk_control *wc) 2901 { 2902 struct btrfs_fs_info *fs_info = log->fs_info; 2903 int ret = 0; 2904 int wret; 2905 int level; 2906 struct btrfs_path *path; 2907 int orig_level; 2908 2909 path = btrfs_alloc_path(); 2910 if (!path) 2911 return -ENOMEM; 2912 2913 level = btrfs_header_level(log->node); 2914 orig_level = level; 2915 path->nodes[level] = log->node; 2916 atomic_inc(&log->node->refs); 2917 path->slots[level] = 0; 2918 2919 while (1) { 2920 wret = walk_down_log_tree(trans, log, path, &level, wc); 2921 if (wret > 0) 2922 break; 2923 if (wret < 0) { 2924 ret = wret; 2925 goto out; 2926 } 2927 2928 wret = walk_up_log_tree(trans, log, path, &level, wc); 2929 if (wret > 0) 2930 break; 2931 if (wret < 0) { 2932 ret = wret; 2933 goto out; 2934 } 2935 } 2936 2937 /* was the root node processed? if not, catch it here */ 2938 if (path->nodes[orig_level]) { 2939 ret = wc->process_func(log, path->nodes[orig_level], wc, 2940 btrfs_header_generation(path->nodes[orig_level]), 2941 orig_level); 2942 if (ret) 2943 goto out; 2944 if (wc->free) { 2945 struct extent_buffer *next; 2946 2947 next = path->nodes[orig_level]; 2948 2949 if (trans) { 2950 btrfs_tree_lock(next); 2951 btrfs_clean_tree_block(next); 2952 btrfs_wait_tree_block_writeback(next); 2953 btrfs_tree_unlock(next); 2954 ret = btrfs_pin_reserved_extent(trans, 2955 next->start, next->len); 2956 if (ret) 2957 goto out; 2958 btrfs_redirty_list_add(trans->transaction, next); 2959 } else { 2960 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2961 clear_extent_buffer_dirty(next); 2962 unaccount_log_buffer(fs_info, next->start); 2963 } 2964 } 2965 } 2966 2967 out: 2968 btrfs_free_path(path); 2969 return ret; 2970 } 2971 2972 /* 2973 * helper function to update the item for a given subvolumes log root 2974 * in the tree of log roots 2975 */ 2976 static int update_log_root(struct btrfs_trans_handle *trans, 2977 struct btrfs_root *log, 2978 struct btrfs_root_item *root_item) 2979 { 2980 struct btrfs_fs_info *fs_info = log->fs_info; 2981 int ret; 2982 2983 if (log->log_transid == 1) { 2984 /* insert root item on the first sync */ 2985 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2986 &log->root_key, root_item); 2987 } else { 2988 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2989 &log->root_key, root_item); 2990 } 2991 return ret; 2992 } 2993 2994 static void wait_log_commit(struct btrfs_root *root, int transid) 2995 { 2996 DEFINE_WAIT(wait); 2997 int index = transid % 2; 2998 2999 /* 3000 * we only allow two pending log transactions at a time, 3001 * so we know that if ours is more than 2 older than the 3002 * current transaction, we're done 3003 */ 3004 for (;;) { 3005 prepare_to_wait(&root->log_commit_wait[index], 3006 &wait, TASK_UNINTERRUPTIBLE); 3007 3008 if (!(root->log_transid_committed < transid && 3009 atomic_read(&root->log_commit[index]))) 3010 break; 3011 3012 mutex_unlock(&root->log_mutex); 3013 schedule(); 3014 mutex_lock(&root->log_mutex); 3015 } 3016 finish_wait(&root->log_commit_wait[index], &wait); 3017 } 3018 3019 static void wait_for_writer(struct btrfs_root *root) 3020 { 3021 DEFINE_WAIT(wait); 3022 3023 for (;;) { 3024 prepare_to_wait(&root->log_writer_wait, &wait, 3025 TASK_UNINTERRUPTIBLE); 3026 if (!atomic_read(&root->log_writers)) 3027 break; 3028 3029 mutex_unlock(&root->log_mutex); 3030 schedule(); 3031 mutex_lock(&root->log_mutex); 3032 } 3033 finish_wait(&root->log_writer_wait, 
&wait); 3034 } 3035 3036 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 3037 struct btrfs_log_ctx *ctx) 3038 { 3039 mutex_lock(&root->log_mutex); 3040 list_del_init(&ctx->list); 3041 mutex_unlock(&root->log_mutex); 3042 } 3043 3044 /* 3045 * Invoked in log mutex context, or be sure there is no other task which 3046 * can access the list. 3047 */ 3048 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 3049 int index, int error) 3050 { 3051 struct btrfs_log_ctx *ctx; 3052 struct btrfs_log_ctx *safe; 3053 3054 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 3055 list_del_init(&ctx->list); 3056 ctx->log_ret = error; 3057 } 3058 } 3059 3060 /* 3061 * btrfs_sync_log does sends a given tree log down to the disk and 3062 * updates the super blocks to record it. When this call is done, 3063 * you know that any inodes previously logged are safely on disk only 3064 * if it returns 0. 3065 * 3066 * Any other return value means you need to call btrfs_commit_transaction. 3067 * Some of the edge cases for fsyncing directories that have had unlinks 3068 * or renames done in the past mean that sometimes the only safe 3069 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3070 * that has happened. 3071 */ 3072 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3073 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3074 { 3075 int index1; 3076 int index2; 3077 int mark; 3078 int ret; 3079 struct btrfs_fs_info *fs_info = root->fs_info; 3080 struct btrfs_root *log = root->log_root; 3081 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3082 struct btrfs_root_item new_root_item; 3083 int log_transid = 0; 3084 struct btrfs_log_ctx root_log_ctx; 3085 struct blk_plug plug; 3086 u64 log_root_start; 3087 u64 log_root_level; 3088 3089 mutex_lock(&root->log_mutex); 3090 log_transid = ctx->log_transid; 3091 if (root->log_transid_committed >= log_transid) { 3092 mutex_unlock(&root->log_mutex); 3093 return ctx->log_ret; 3094 } 3095 3096 index1 = log_transid % 2; 3097 if (atomic_read(&root->log_commit[index1])) { 3098 wait_log_commit(root, log_transid); 3099 mutex_unlock(&root->log_mutex); 3100 return ctx->log_ret; 3101 } 3102 ASSERT(log_transid == root->log_transid); 3103 atomic_set(&root->log_commit[index1], 1); 3104 3105 /* wait for previous tree log sync to complete */ 3106 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3107 wait_log_commit(root, log_transid - 1); 3108 3109 while (1) { 3110 int batch = atomic_read(&root->log_batch); 3111 /* when we're on an ssd, just kick the log commit out */ 3112 if (!btrfs_test_opt(fs_info, SSD) && 3113 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3114 mutex_unlock(&root->log_mutex); 3115 schedule_timeout_uninterruptible(1); 3116 mutex_lock(&root->log_mutex); 3117 } 3118 wait_for_writer(root); 3119 if (batch == atomic_read(&root->log_batch)) 3120 break; 3121 } 3122 3123 /* bail out if we need to do a full commit */ 3124 if (btrfs_need_log_full_commit(trans)) { 3125 ret = -EAGAIN; 3126 mutex_unlock(&root->log_mutex); 3127 goto out; 3128 } 3129 3130 if (log_transid % 2 == 0) 3131 mark = EXTENT_DIRTY; 3132 else 3133 mark = EXTENT_NEW; 3134 3135 /* we start IO on all the marked extents here, but we don't actually 3136 * wait for them until later. 
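 * The writes are issued under a block plug so the bios can be merged;
 * the wait happens further below via btrfs_wait_tree_log_extents(),
 * once the log root tree has been updated.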
3137 */ 3138 blk_start_plug(&plug); 3139 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3140 /* 3141 * -EAGAIN happens when someone, e.g., a concurrent transaction 3142 * commit, writes a dirty extent in this tree-log commit. This 3143 * concurrent write will create a hole writing out the extents, 3144 * and we cannot proceed on a zoned filesystem, requiring 3145 * sequential writing. While we can bail out to a full commit 3146 * here, but we can continue hoping the concurrent writing fills 3147 * the hole. 3148 */ 3149 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 3150 ret = 0; 3151 if (ret) { 3152 blk_finish_plug(&plug); 3153 btrfs_abort_transaction(trans, ret); 3154 btrfs_set_log_full_commit(trans); 3155 mutex_unlock(&root->log_mutex); 3156 goto out; 3157 } 3158 3159 /* 3160 * We _must_ update under the root->log_mutex in order to make sure we 3161 * have a consistent view of the log root we are trying to commit at 3162 * this moment. 3163 * 3164 * We _must_ copy this into a local copy, because we are not holding the 3165 * log_root_tree->log_mutex yet. This is important because when we 3166 * commit the log_root_tree we must have a consistent view of the 3167 * log_root_tree when we update the super block to point at the 3168 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3169 * with the commit and possibly point at the new block which we may not 3170 * have written out. 3171 */ 3172 btrfs_set_root_node(&log->root_item, log->node); 3173 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3174 3175 root->log_transid++; 3176 log->log_transid = root->log_transid; 3177 root->log_start_pid = 0; 3178 /* 3179 * IO has been started, blocks of the log tree have WRITTEN flag set 3180 * in their headers. new modifications of the log will be written to 3181 * new positions. so it's safe to allow log writers to go in. 3182 */ 3183 mutex_unlock(&root->log_mutex); 3184 3185 if (btrfs_is_zoned(fs_info)) { 3186 mutex_lock(&fs_info->tree_root->log_mutex); 3187 if (!log_root_tree->node) { 3188 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3189 if (ret) { 3190 mutex_unlock(&fs_info->tree_root->log_mutex); 3191 blk_finish_plug(&plug); 3192 goto out; 3193 } 3194 } 3195 mutex_unlock(&fs_info->tree_root->log_mutex); 3196 } 3197 3198 btrfs_init_log_ctx(&root_log_ctx, NULL); 3199 3200 mutex_lock(&log_root_tree->log_mutex); 3201 3202 index2 = log_root_tree->log_transid % 2; 3203 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3204 root_log_ctx.log_transid = log_root_tree->log_transid; 3205 3206 /* 3207 * Now we are safe to update the log_root_tree because we're under the 3208 * log_mutex, and we're a current writer so we're holding the commit 3209 * open until we drop the log_mutex. 
3210 */ 3211 ret = update_log_root(trans, log, &new_root_item); 3212 if (ret) { 3213 if (!list_empty(&root_log_ctx.list)) 3214 list_del_init(&root_log_ctx.list); 3215 3216 blk_finish_plug(&plug); 3217 btrfs_set_log_full_commit(trans); 3218 3219 if (ret != -ENOSPC) { 3220 btrfs_abort_transaction(trans, ret); 3221 mutex_unlock(&log_root_tree->log_mutex); 3222 goto out; 3223 } 3224 btrfs_wait_tree_log_extents(log, mark); 3225 mutex_unlock(&log_root_tree->log_mutex); 3226 ret = -EAGAIN; 3227 goto out; 3228 } 3229 3230 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3231 blk_finish_plug(&plug); 3232 list_del_init(&root_log_ctx.list); 3233 mutex_unlock(&log_root_tree->log_mutex); 3234 ret = root_log_ctx.log_ret; 3235 goto out; 3236 } 3237 3238 index2 = root_log_ctx.log_transid % 2; 3239 if (atomic_read(&log_root_tree->log_commit[index2])) { 3240 blk_finish_plug(&plug); 3241 ret = btrfs_wait_tree_log_extents(log, mark); 3242 wait_log_commit(log_root_tree, 3243 root_log_ctx.log_transid); 3244 mutex_unlock(&log_root_tree->log_mutex); 3245 if (!ret) 3246 ret = root_log_ctx.log_ret; 3247 goto out; 3248 } 3249 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3250 atomic_set(&log_root_tree->log_commit[index2], 1); 3251 3252 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3253 wait_log_commit(log_root_tree, 3254 root_log_ctx.log_transid - 1); 3255 } 3256 3257 /* 3258 * now that we've moved on to the tree of log tree roots, 3259 * check the full commit flag again 3260 */ 3261 if (btrfs_need_log_full_commit(trans)) { 3262 blk_finish_plug(&plug); 3263 btrfs_wait_tree_log_extents(log, mark); 3264 mutex_unlock(&log_root_tree->log_mutex); 3265 ret = -EAGAIN; 3266 goto out_wake_log_root; 3267 } 3268 3269 ret = btrfs_write_marked_extents(fs_info, 3270 &log_root_tree->dirty_log_pages, 3271 EXTENT_DIRTY | EXTENT_NEW); 3272 blk_finish_plug(&plug); 3273 /* 3274 * As described above, -EAGAIN indicates a hole in the extents. We 3275 * cannot wait for these write outs since the waiting cause a 3276 * deadlock. Bail out to the full commit instead. 
3277 */ 3278 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3279 btrfs_set_log_full_commit(trans); 3280 btrfs_wait_tree_log_extents(log, mark); 3281 mutex_unlock(&log_root_tree->log_mutex); 3282 goto out_wake_log_root; 3283 } else if (ret) { 3284 btrfs_set_log_full_commit(trans); 3285 btrfs_abort_transaction(trans, ret); 3286 mutex_unlock(&log_root_tree->log_mutex); 3287 goto out_wake_log_root; 3288 } 3289 ret = btrfs_wait_tree_log_extents(log, mark); 3290 if (!ret) 3291 ret = btrfs_wait_tree_log_extents(log_root_tree, 3292 EXTENT_NEW | EXTENT_DIRTY); 3293 if (ret) { 3294 btrfs_set_log_full_commit(trans); 3295 mutex_unlock(&log_root_tree->log_mutex); 3296 goto out_wake_log_root; 3297 } 3298 3299 log_root_start = log_root_tree->node->start; 3300 log_root_level = btrfs_header_level(log_root_tree->node); 3301 log_root_tree->log_transid++; 3302 mutex_unlock(&log_root_tree->log_mutex); 3303 3304 /* 3305 * Here we are guaranteed that nobody is going to write the superblock 3306 * for the current transaction before us, and that we do not write our 3307 * superblock before the previous transaction finishes its commit 3308 * and writes its superblock, because: 3309 * 3310 * 1) We are holding a handle on the current transaction, so nobody 3311 * can commit it until we release the handle; 3312 * 3313 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3314 * if the previous transaction is still committing, and hasn't yet 3315 * written its superblock, we wait for it to do it, because a 3316 * transaction commit acquires the tree_log_mutex when the commit 3317 * begins and releases it only after writing its superblock. 3318 */ 3319 mutex_lock(&fs_info->tree_log_mutex); 3320 3321 /* 3322 * The previous transaction writeout phase could have failed, and thus 3323 * marked the fs in an error state. We must not commit here, as we 3324 * could have updated our generation in the super_for_commit and 3325 * writing the super here would result in transid mismatches. If there 3326 * is an error here just bail. 3327 */ 3328 if (BTRFS_FS_ERROR(fs_info)) { 3329 ret = -EIO; 3330 btrfs_set_log_full_commit(trans); 3331 btrfs_abort_transaction(trans, ret); 3332 mutex_unlock(&fs_info->tree_log_mutex); 3333 goto out_wake_log_root; 3334 } 3335 3336 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3337 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3338 ret = write_all_supers(fs_info, 1); 3339 mutex_unlock(&fs_info->tree_log_mutex); 3340 if (ret) { 3341 btrfs_set_log_full_commit(trans); 3342 btrfs_abort_transaction(trans, ret); 3343 goto out_wake_log_root; 3344 } 3345 3346 /* 3347 * We know there can only be one task here, since we have not yet set 3348 * root->log_commit[index1] to 0 and any task attempting to sync the 3349 * log must wait for the previous log transaction to commit if it's 3350 * still in progress or wait for the current log transaction commit if 3351 * someone else already started it. We use <= and not < because the 3352 * first log transaction has an ID of 0.
3353 */ 3354 ASSERT(root->last_log_commit <= log_transid); 3355 root->last_log_commit = log_transid; 3356 3357 out_wake_log_root: 3358 mutex_lock(&log_root_tree->log_mutex); 3359 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3360 3361 log_root_tree->log_transid_committed++; 3362 atomic_set(&log_root_tree->log_commit[index2], 0); 3363 mutex_unlock(&log_root_tree->log_mutex); 3364 3365 /* 3366 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3367 * all the updates above are seen by the woken threads. It might not be 3368 * necessary, but proving that seems to be hard. 3369 */ 3370 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3371 out: 3372 mutex_lock(&root->log_mutex); 3373 btrfs_remove_all_log_ctxs(root, index1, ret); 3374 root->log_transid_committed++; 3375 atomic_set(&root->log_commit[index1], 0); 3376 mutex_unlock(&root->log_mutex); 3377 3378 /* 3379 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3380 * all the updates above are seen by the woken threads. It might not be 3381 * necessary, but proving that seems to be hard. 3382 */ 3383 cond_wake_up(&root->log_commit_wait[index1]); 3384 return ret; 3385 } 3386 3387 static void free_log_tree(struct btrfs_trans_handle *trans, 3388 struct btrfs_root *log) 3389 { 3390 int ret; 3391 struct walk_control wc = { 3392 .free = 1, 3393 .process_func = process_one_buffer 3394 }; 3395 3396 if (log->node) { 3397 ret = walk_log_tree(trans, log, &wc); 3398 if (ret) { 3399 /* 3400 * We weren't able to traverse the entire log tree, the 3401 * typical scenario is getting an -EIO when reading an 3402 * extent buffer of the tree, due to a previous writeback 3403 * failure of it. 3404 */ 3405 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3406 &log->fs_info->fs_state); 3407 3408 /* 3409 * Some extent buffers of the log tree may still be dirty 3410 * and not yet written back to storage, because we may 3411 * have updates to a log tree without syncing a log tree, 3412 * such as during rename and link operations. So flush 3413 * them out and wait for their writeback to complete, so 3414 * that we properly cleanup their state and pages. 3415 */ 3416 btrfs_write_marked_extents(log->fs_info, 3417 &log->dirty_log_pages, 3418 EXTENT_DIRTY | EXTENT_NEW); 3419 btrfs_wait_tree_log_extents(log, 3420 EXTENT_DIRTY | EXTENT_NEW); 3421 3422 if (trans) 3423 btrfs_abort_transaction(trans, ret); 3424 else 3425 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3426 } 3427 } 3428 3429 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3430 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3431 extent_io_tree_release(&log->log_csum_range); 3432 3433 btrfs_put_root(log); 3434 } 3435 3436 /* 3437 * free all the extents used by the tree log. This should be called 3438 * at commit time of the full transaction 3439 */ 3440 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3441 { 3442 if (root->log_root) { 3443 free_log_tree(trans, root->log_root); 3444 root->log_root = NULL; 3445 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3446 } 3447 return 0; 3448 } 3449 3450 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3451 struct btrfs_fs_info *fs_info) 3452 { 3453 if (fs_info->log_root_tree) { 3454 free_log_tree(trans, fs_info->log_root_tree); 3455 fs_info->log_root_tree = NULL; 3456 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3457 } 3458 return 0; 3459 } 3460 3461 /* 3462 * Check if an inode was logged in the current transaction. 
This correctly deals 3463 * with the case where the inode was logged but has a logged_trans of 0, which 3464 * happens if the inode is evicted and loaded again, as logged_trans is an in 3465 * memory only field (not persisted). 3466 * 3467 * Returns 1 if the inode was logged before in the transaction, 0 if it was not, 3468 * and < 0 on error. 3469 */ 3470 static int inode_logged(struct btrfs_trans_handle *trans, 3471 struct btrfs_inode *inode, 3472 struct btrfs_path *path_in) 3473 { 3474 struct btrfs_path *path = path_in; 3475 struct btrfs_key key; 3476 int ret; 3477 3478 if (inode->logged_trans == trans->transid) 3479 return 1; 3480 3481 /* 3482 * If logged_trans is not 0, then we know the inode logged was not logged 3483 * in this transaction, so we can return false right away. 3484 */ 3485 if (inode->logged_trans > 0) 3486 return 0; 3487 3488 /* 3489 * If no log tree was created for this root in this transaction, then 3490 * the inode can not have been logged in this transaction. In that case 3491 * set logged_trans to anything greater than 0 and less than the current 3492 * transaction's ID, to avoid the search below in a future call in case 3493 * a log tree gets created after this. 3494 */ 3495 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3496 inode->logged_trans = trans->transid - 1; 3497 return 0; 3498 } 3499 3500 /* 3501 * We have a log tree and the inode's logged_trans is 0. We can't tell 3502 * for sure if the inode was logged before in this transaction by looking 3503 * only at logged_trans. We could be pessimistic and assume it was, but 3504 * that can lead to unnecessarily logging an inode during rename and link 3505 * operations, and then further updating the log in followup rename and 3506 * link operations, specially if it's a directory, which adds latency 3507 * visible to applications doing a series of rename or link operations. 3508 * 3509 * A logged_trans of 0 here can mean several things: 3510 * 3511 * 1) The inode was never logged since the filesystem was mounted, and may 3512 * or may have not been evicted and loaded again; 3513 * 3514 * 2) The inode was logged in a previous transaction, then evicted and 3515 * then loaded again; 3516 * 3517 * 3) The inode was logged in the current transaction, then evicted and 3518 * then loaded again. 3519 * 3520 * For cases 1) and 2) we don't want to return true, but we need to detect 3521 * case 3) and return true. So we do a search in the log root for the inode 3522 * item. 3523 */ 3524 key.objectid = btrfs_ino(inode); 3525 key.type = BTRFS_INODE_ITEM_KEY; 3526 key.offset = 0; 3527 3528 if (!path) { 3529 path = btrfs_alloc_path(); 3530 if (!path) 3531 return -ENOMEM; 3532 } 3533 3534 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3535 3536 if (path_in) 3537 btrfs_release_path(path); 3538 else 3539 btrfs_free_path(path); 3540 3541 /* 3542 * Logging an inode always results in logging its inode item. So if we 3543 * did not find the item we know the inode was not logged for sure. 3544 */ 3545 if (ret < 0) { 3546 return ret; 3547 } else if (ret > 0) { 3548 /* 3549 * Set logged_trans to a value greater than 0 and less then the 3550 * current transaction to avoid doing the search in future calls. 3551 */ 3552 inode->logged_trans = trans->transid - 1; 3553 return 0; 3554 } 3555 3556 /* 3557 * The inode was previously logged and then evicted, set logged_trans to 3558 * the current transacion's ID, to avoid future tree searches as long as 3559 * the inode is not evicted again. 
3560 */ 3561 inode->logged_trans = trans->transid; 3562 3563 /* 3564 * If it's a directory, then we must set last_dir_index_offset to the 3565 * maximum possible value, so that the next attempt to log the inode does 3566 * not skip checking if dir index keys found in modified subvolume tree 3567 * leaves have been logged before, otherwise it would result in attempts 3568 * to insert duplicate dir index keys in the log tree. This must be done 3569 * because last_dir_index_offset is an in-memory only field, not persisted 3570 * in the inode item or any other on-disk structure, so its value is lost 3571 * once the inode is evicted. 3572 */ 3573 if (S_ISDIR(inode->vfs_inode.i_mode)) 3574 inode->last_dir_index_offset = (u64)-1; 3575 3576 return 1; 3577 } 3578 3579 /* 3580 * Delete a directory entry from the log if it exists. 3581 * 3582 * Returns < 0 on error 3583 * 1 if the entry does not exist 3584 * 0 if the entry existed and was successfully deleted 3585 */ 3586 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3587 struct btrfs_root *log, 3588 struct btrfs_path *path, 3589 u64 dir_ino, 3590 const char *name, int name_len, 3591 u64 index) 3592 { 3593 struct btrfs_dir_item *di; 3594 3595 /* 3596 * We only log dir index items of a directory, so we don't need to look 3597 * for dir item keys. 3598 */ 3599 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3600 index, name, name_len, -1); 3601 if (IS_ERR(di)) 3602 return PTR_ERR(di); 3603 else if (!di) 3604 return 1; 3605 3606 /* 3607 * We do not need to update the size field of the directory's 3608 * inode item because on log replay we update the field to reflect 3609 * all existing entries in the directory (see overwrite_item()). 3610 */ 3611 return btrfs_delete_one_dir_name(trans, log, path, di); 3612 } 3613 3614 /* 3615 * If both a file and directory are logged, and unlinks or renames are 3616 * mixed in, we have a few interesting corners: 3617 * 3618 * create file X in dir Y 3619 * link file X to X.link in dir Y 3620 * fsync file X 3621 * unlink file X but leave X.link 3622 * fsync dir Y 3623 * 3624 * After a crash we would expect only X.link to exist. But file X 3625 * didn't get fsync'd again so the log has back refs for X and X.link. 3626 * 3627 * We solve this by removing directory entries and inode backrefs from the 3628 * log when a file that was logged in the current transaction is 3629 * unlinked. Any later fsync will include the updated log entries, and 3630 * we'll be able to reconstruct the proper directory items from backrefs. 3631 * 3632 * This optimization allows us to avoid relogging the entire inode 3633 * or the entire directory.
3634 */ 3635 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3636 struct btrfs_root *root, 3637 const char *name, int name_len, 3638 struct btrfs_inode *dir, u64 index) 3639 { 3640 struct btrfs_path *path; 3641 int ret; 3642 3643 ret = inode_logged(trans, dir, NULL); 3644 if (ret == 0) 3645 return; 3646 else if (ret < 0) { 3647 btrfs_set_log_full_commit(trans); 3648 return; 3649 } 3650 3651 ret = join_running_log_trans(root); 3652 if (ret) 3653 return; 3654 3655 mutex_lock(&dir->log_mutex); 3656 3657 path = btrfs_alloc_path(); 3658 if (!path) { 3659 ret = -ENOMEM; 3660 goto out_unlock; 3661 } 3662 3663 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3664 name, name_len, index); 3665 btrfs_free_path(path); 3666 out_unlock: 3667 mutex_unlock(&dir->log_mutex); 3668 if (ret < 0) 3669 btrfs_set_log_full_commit(trans); 3670 btrfs_end_log_trans(root); 3671 } 3672 3673 /* see comments for btrfs_del_dir_entries_in_log */ 3674 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3675 struct btrfs_root *root, 3676 const char *name, int name_len, 3677 struct btrfs_inode *inode, u64 dirid) 3678 { 3679 struct btrfs_root *log; 3680 u64 index; 3681 int ret; 3682 3683 ret = inode_logged(trans, inode, NULL); 3684 if (ret == 0) 3685 return; 3686 else if (ret < 0) { 3687 btrfs_set_log_full_commit(trans); 3688 return; 3689 } 3690 3691 ret = join_running_log_trans(root); 3692 if (ret) 3693 return; 3694 log = root->log_root; 3695 mutex_lock(&inode->log_mutex); 3696 3697 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3698 dirid, &index); 3699 mutex_unlock(&inode->log_mutex); 3700 if (ret < 0 && ret != -ENOENT) 3701 btrfs_set_log_full_commit(trans); 3702 btrfs_end_log_trans(root); 3703 } 3704 3705 /* 3706 * creates a range item in the log for 'dirid'. first_offset and 3707 * last_offset tell us which parts of the key space the log should 3708 * be considered authoritative for. 
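 *
 * For example (hypothetical values): a BTRFS_DIR_LOG_INDEX_KEY item with
 * key offset 100 and an end offset (btrfs_dir_log_end()) of 199 tells
 * replay that the log is authoritative for dir index keys 100..199 of
 * this directory, so any index in that range found in the subvolume but
 * missing from the log was deleted before the fsync.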
3709 */ 3710 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3711 struct btrfs_root *log, 3712 struct btrfs_path *path, 3713 u64 dirid, 3714 u64 first_offset, u64 last_offset) 3715 { 3716 int ret; 3717 struct btrfs_key key; 3718 struct btrfs_dir_log_item *item; 3719 3720 key.objectid = dirid; 3721 key.offset = first_offset; 3722 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3723 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3724 if (ret) 3725 return ret; 3726 3727 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3728 struct btrfs_dir_log_item); 3729 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3730 btrfs_mark_buffer_dirty(path->nodes[0]); 3731 btrfs_release_path(path); 3732 return 0; 3733 } 3734 3735 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3736 struct btrfs_root *log, 3737 struct extent_buffer *src, 3738 struct btrfs_path *dst_path, 3739 int start_slot, 3740 int count) 3741 { 3742 char *ins_data = NULL; 3743 struct btrfs_item_batch batch; 3744 struct extent_buffer *dst; 3745 unsigned long src_offset; 3746 unsigned long dst_offset; 3747 struct btrfs_key key; 3748 u32 item_size; 3749 int ret; 3750 int i; 3751 3752 ASSERT(count > 0); 3753 batch.nr = count; 3754 3755 if (count == 1) { 3756 btrfs_item_key_to_cpu(src, &key, start_slot); 3757 item_size = btrfs_item_size(src, start_slot); 3758 batch.keys = &key; 3759 batch.data_sizes = &item_size; 3760 batch.total_data_size = item_size; 3761 } else { 3762 struct btrfs_key *ins_keys; 3763 u32 *ins_sizes; 3764 3765 ins_data = kmalloc(count * sizeof(u32) + 3766 count * sizeof(struct btrfs_key), GFP_NOFS); 3767 if (!ins_data) 3768 return -ENOMEM; 3769 3770 ins_sizes = (u32 *)ins_data; 3771 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3772 batch.keys = ins_keys; 3773 batch.data_sizes = ins_sizes; 3774 batch.total_data_size = 0; 3775 3776 for (i = 0; i < count; i++) { 3777 const int slot = start_slot + i; 3778 3779 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3780 ins_sizes[i] = btrfs_item_size(src, slot); 3781 batch.total_data_size += ins_sizes[i]; 3782 } 3783 } 3784 3785 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3786 if (ret) 3787 goto out; 3788 3789 dst = dst_path->nodes[0]; 3790 /* 3791 * Copy all the items in bulk, in a single copy operation. Item data is 3792 * organized such that it's placed at the end of a leaf and from right 3793 * to left. For example, the data for the second item ends at an offset 3794 * that matches the offset where the data for the first item starts, the 3795 * data for the third item ends at an offset that matches the offset 3796 * where the data of the second items starts, and so on. 3797 * Therefore our source and destination start offsets for copy match the 3798 * offsets of the last items (highest slots). 
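 *
 * Worked example with made up sizes: for three items of data size 10, 20
 * and 30 whose data area ends at leaf offset 4000, the data sits at
 * [3990, 4000), [3970, 3990) and [3940, 3970) respectively; copying
 * total_data_size (60) bytes starting at the offset of the last item
 * (3940) transfers all three in a single call.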
3799 */ 3800 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3801 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3802 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3803 btrfs_release_path(dst_path); 3804 out: 3805 kfree(ins_data); 3806 3807 return ret; 3808 } 3809 3810 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3811 struct btrfs_inode *inode, 3812 struct btrfs_path *path, 3813 struct btrfs_path *dst_path, 3814 struct btrfs_log_ctx *ctx, 3815 u64 *last_old_dentry_offset) 3816 { 3817 struct btrfs_root *log = inode->root->log_root; 3818 struct extent_buffer *src = path->nodes[0]; 3819 const int nritems = btrfs_header_nritems(src); 3820 const u64 ino = btrfs_ino(inode); 3821 bool last_found = false; 3822 int batch_start = 0; 3823 int batch_size = 0; 3824 int i; 3825 3826 for (i = path->slots[0]; i < nritems; i++) { 3827 struct btrfs_dir_item *di; 3828 struct btrfs_key key; 3829 int ret; 3830 3831 btrfs_item_key_to_cpu(src, &key, i); 3832 3833 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3834 last_found = true; 3835 break; 3836 } 3837 3838 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3839 ctx->last_dir_item_offset = key.offset; 3840 3841 /* 3842 * Skip ranges of items that consist only of dir item keys created 3843 * in past transactions. However if we find a gap, we must log a 3844 * dir index range item for that gap, so that index keys in that 3845 * gap are deleted during log replay. 3846 */ 3847 if (btrfs_dir_transid(src, di) < trans->transid) { 3848 if (key.offset > *last_old_dentry_offset + 1) { 3849 ret = insert_dir_log_key(trans, log, dst_path, 3850 ino, *last_old_dentry_offset + 1, 3851 key.offset - 1); 3852 /* 3853 * -EEXIST should never happen because when we 3854 * log a directory in full mode (LOG_INODE_ALL) 3855 * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from 3856 * the log tree. 3857 */ 3858 ASSERT(ret != -EEXIST); 3859 if (ret < 0) 3860 return ret; 3861 } 3862 3863 *last_old_dentry_offset = key.offset; 3864 continue; 3865 } 3866 /* 3867 * We must make sure that when we log a directory entry, the 3868 * corresponding inode, after log replay, has a matching link 3869 * count. For example: 3870 * 3871 * touch foo 3872 * mkdir mydir 3873 * sync 3874 * ln foo mydir/bar 3875 * xfs_io -c "fsync" mydir 3876 * <crash> 3877 * <mount fs and log replay> 3878 * 3879 * Would result in a fsync log that when replayed, our file inode 3880 * would have a link count of 1, but we get two directory entries 3881 * pointing to the same inode. After removing one of the names, 3882 * it would not be possible to remove the other name, which 3883 * resulted always in stale file handle errors, and would not be 3884 * possible to rmdir the parent directory, since its i_size could 3885 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3886 * resulting in -ENOTEMPTY errors. 3887 */ 3888 if (!ctx->log_new_dentries) { 3889 struct btrfs_key di_key; 3890 3891 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3892 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3893 ctx->log_new_dentries = true; 3894 } 3895 3896 if (!ctx->logged_before) 3897 goto add_to_batch; 3898 3899 /* 3900 * If we were logged before and have logged dir items, we can skip 3901 * checking if any item with a key offset larger than the last one 3902 * we logged is in the log tree, saving time and avoiding adding 3903 * contention on the log tree. 
We can only rely on the value of 3904 * last_dir_index_offset when we know for sure that the inode was 3905 * previously logged in the current transaction. 3906 */ 3907 if (key.offset > inode->last_dir_index_offset) 3908 goto add_to_batch; 3909 /* 3910 * Check if the key was already logged before. If not we can add 3911 * it to a batch for bulk insertion. 3912 */ 3913 ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); 3914 if (ret < 0) { 3915 return ret; 3916 } else if (ret > 0) { 3917 btrfs_release_path(dst_path); 3918 goto add_to_batch; 3919 } 3920 3921 /* 3922 * Item exists in the log. Overwrite the item in the log if it 3923 * has different content or do nothing if it has exactly the same 3924 * content. And then flush the current batch if any - do it after 3925 * overwriting the current item, or we would deadlock otherwise, 3926 * since we are holding a path for the existing item. 3927 */ 3928 ret = do_overwrite_item(trans, log, dst_path, src, i, &key); 3929 if (ret < 0) 3930 return ret; 3931 3932 if (batch_size > 0) { 3933 ret = flush_dir_items_batch(trans, log, src, dst_path, 3934 batch_start, batch_size); 3935 if (ret < 0) 3936 return ret; 3937 batch_size = 0; 3938 } 3939 continue; 3940 add_to_batch: 3941 if (batch_size == 0) 3942 batch_start = i; 3943 batch_size++; 3944 } 3945 3946 if (batch_size > 0) { 3947 int ret; 3948 3949 ret = flush_dir_items_batch(trans, log, src, dst_path, 3950 batch_start, batch_size); 3951 if (ret < 0) 3952 return ret; 3953 } 3954 3955 return last_found ? 1 : 0; 3956 } 3957 3958 /* 3959 * log all the items included in the current transaction for a given 3960 * directory. This also creates the range items in the log tree required 3961 * to replay anything deleted before the fsync 3962 */ 3963 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3964 struct btrfs_inode *inode, 3965 struct btrfs_path *path, 3966 struct btrfs_path *dst_path, 3967 struct btrfs_log_ctx *ctx, 3968 u64 min_offset, u64 *last_offset_ret) 3969 { 3970 struct btrfs_key min_key; 3971 struct btrfs_root *root = inode->root; 3972 struct btrfs_root *log = root->log_root; 3973 int err = 0; 3974 int ret; 3975 u64 last_old_dentry_offset = min_offset - 1; 3976 u64 last_offset = (u64)-1; 3977 u64 ino = btrfs_ino(inode); 3978 3979 min_key.objectid = ino; 3980 min_key.type = BTRFS_DIR_INDEX_KEY; 3981 min_key.offset = min_offset; 3982 3983 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3984 3985 /* 3986 * we didn't find anything from this transaction, see if there 3987 * is anything at all 3988 */ 3989 if (ret != 0 || min_key.objectid != ino || 3990 min_key.type != BTRFS_DIR_INDEX_KEY) { 3991 min_key.objectid = ino; 3992 min_key.type = BTRFS_DIR_INDEX_KEY; 3993 min_key.offset = (u64)-1; 3994 btrfs_release_path(path); 3995 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 3996 if (ret < 0) { 3997 btrfs_release_path(path); 3998 return ret; 3999 } 4000 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4001 4002 /* if ret == 0 there are items for this type, 4003 * create a range to tell us the last key of this type. 4004 * otherwise, there are no items in this directory after 4005 * *min_offset, and we create a range to indicate that. 
4006 */ 4007 if (ret == 0) { 4008 struct btrfs_key tmp; 4009 4010 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 4011 path->slots[0]); 4012 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4013 last_old_dentry_offset = tmp.offset; 4014 } 4015 goto done; 4016 } 4017 4018 /* go backward to find any previous key */ 4019 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4020 if (ret == 0) { 4021 struct btrfs_key tmp; 4022 4023 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 4024 /* 4025 * The dir index key before the first one we found that needs to 4026 * be logged might be in a previous leaf, and there might be a 4027 * gap between these keys, meaning that we had deletions that 4028 * happened. So the key range item we log (key type 4029 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 4030 * previous key's offset plus 1, so that those deletes are replayed. 4031 */ 4032 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4033 last_old_dentry_offset = tmp.offset; 4034 } 4035 btrfs_release_path(path); 4036 4037 /* 4038 * Find the first key from this transaction again. See the note for 4039 * log_new_dir_dentries, if we're logging a directory recursively we 4040 * won't be holding its i_mutex, which means we can modify the directory 4041 * while we're logging it. If we remove an entry between our first 4042 * search and this search we'll not find the key again and can just 4043 * bail. 4044 */ 4045 search: 4046 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 4047 if (ret != 0) 4048 goto done; 4049 4050 /* 4051 * we have a block from this transaction, log every item in it 4052 * from our directory 4053 */ 4054 while (1) { 4055 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 4056 &last_old_dentry_offset); 4057 if (ret != 0) { 4058 if (ret < 0) 4059 err = ret; 4060 goto done; 4061 } 4062 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 4063 4064 /* 4065 * look ahead to the next item and see if it is also 4066 * from this directory and from this transaction 4067 */ 4068 ret = btrfs_next_leaf(root, path); 4069 if (ret) { 4070 if (ret == 1) 4071 last_offset = (u64)-1; 4072 else 4073 err = ret; 4074 goto done; 4075 } 4076 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 4077 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 4078 last_offset = (u64)-1; 4079 goto done; 4080 } 4081 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 4082 /* 4083 * The next leaf was not changed in the current transaction 4084 * and has at least one dir index key. 4085 * We check for the next key because there might have been 4086 * one or more deletions between the last key we logged and 4087 * that next key. So the key range item we log (key type 4088 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 4089 * offset minus 1, so that those deletes are replayed. 
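 * As an illustration (the index numbers are arbitrary): if the last dir
 * index we logged is 20 and the first index key in that next, unchanged
 * leaf is 25, then indexes 21 to 24 may have been deleted in the current
 * transaction, so the logged range must end at 24 (25 - 1) for log
 * replay to remove any such entries.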
4090 */
4091 last_offset = min_key.offset - 1;
4092 goto done;
4093 }
4094 if (need_resched()) {
4095 btrfs_release_path(path);
4096 cond_resched();
4097 goto search;
4098 }
4099 }
4100 done:
4101 btrfs_release_path(path);
4102 btrfs_release_path(dst_path);
4103
4104 if (err == 0) {
4105 *last_offset_ret = last_offset;
4106 /*
4107 * If the leaf was changed in the current transaction but all its
4108 * dir items are from a past transaction, the last item in the leaf
4109 * is a dir item and there is no gap between that last dir item and
4110 * the first one on the next leaf (which did not change in the
4111 * current transaction), then we don't need to log a range:
4112 * last_old_dentry_offset is equal to last_offset.
4113 */
4114 ASSERT(last_old_dentry_offset <= last_offset);
4115 if (last_old_dentry_offset < last_offset) {
4116 ret = insert_dir_log_key(trans, log, path, ino,
4117 last_old_dentry_offset + 1,
4118 last_offset);
4119 if (ret)
4120 err = ret;
4121 }
4122 }
4123 return err;
4124 }
4125
4126 /*
4127 * logging directories is very similar to logging inodes. We find all the items
4128 * from the current transaction and write them to the log.
4129 *
4130 * The recovery code scans the directory in the subvolume, and if it finds a
4131 * key in the range logged that is not present in the log tree, then it means
4132 * that dir entry was unlinked during the transaction.
4133 *
4134 * In order for that scan to work, we must include one key smaller than
4135 * the smallest key logged by this transaction and one key larger than the
4136 * largest key logged by this transaction.
4137 */
4138 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4139 struct btrfs_inode *inode,
4140 struct btrfs_path *path,
4141 struct btrfs_path *dst_path,
4142 struct btrfs_log_ctx *ctx)
4143 {
4144 u64 min_key;
4145 u64 max_key;
4146 int ret;
4147
4148 min_key = BTRFS_DIR_START_INDEX;
4149 max_key = 0;
4150 ctx->last_dir_item_offset = inode->last_dir_index_offset;
4151
4152 while (1) {
4153 ret = log_dir_items(trans, inode, path, dst_path,
4154 ctx, min_key, &max_key);
4155 if (ret)
4156 return ret;
4157 if (max_key == (u64)-1)
4158 break;
4159 min_key = max_key + 1;
4160 }
4161
4162 inode->last_dir_index_offset = ctx->last_dir_item_offset;
4163
4164 return 0;
4165 }
4166
4167 /*
4168 * a helper function to drop items from the log before we relog an
4169 * inode. max_key_type indicates the highest item type to remove.
4170 * This cannot be run for file data extents because it does not
4171 * free the extents they point to.
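 * For example, the callers in btrfs_log_inode() below pass
 * BTRFS_XATTR_ITEM_KEY or BTRFS_DIR_LOG_INDEX_KEY as max_key_type, both
 * of which sort before BTRFS_EXTENT_DATA_KEY, so any file extent items
 * already in the log are left untouched.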
4172 */ 4173 static int drop_inode_items(struct btrfs_trans_handle *trans, 4174 struct btrfs_root *log, 4175 struct btrfs_path *path, 4176 struct btrfs_inode *inode, 4177 int max_key_type) 4178 { 4179 int ret; 4180 struct btrfs_key key; 4181 struct btrfs_key found_key; 4182 int start_slot; 4183 4184 key.objectid = btrfs_ino(inode); 4185 key.type = max_key_type; 4186 key.offset = (u64)-1; 4187 4188 while (1) { 4189 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4190 BUG_ON(ret == 0); /* Logic error */ 4191 if (ret < 0) 4192 break; 4193 4194 if (path->slots[0] == 0) 4195 break; 4196 4197 path->slots[0]--; 4198 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4199 path->slots[0]); 4200 4201 if (found_key.objectid != key.objectid) 4202 break; 4203 4204 found_key.offset = 0; 4205 found_key.type = 0; 4206 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 4207 if (ret < 0) 4208 break; 4209 4210 ret = btrfs_del_items(trans, log, path, start_slot, 4211 path->slots[0] - start_slot + 1); 4212 /* 4213 * If start slot isn't 0 then we don't need to re-search, we've 4214 * found the last guy with the objectid in this tree. 4215 */ 4216 if (ret || start_slot != 0) 4217 break; 4218 btrfs_release_path(path); 4219 } 4220 btrfs_release_path(path); 4221 if (ret > 0) 4222 ret = 0; 4223 return ret; 4224 } 4225 4226 static int truncate_inode_items(struct btrfs_trans_handle *trans, 4227 struct btrfs_root *log_root, 4228 struct btrfs_inode *inode, 4229 u64 new_size, u32 min_type) 4230 { 4231 struct btrfs_truncate_control control = { 4232 .new_size = new_size, 4233 .ino = btrfs_ino(inode), 4234 .min_type = min_type, 4235 .skip_ref_updates = true, 4236 }; 4237 4238 return btrfs_truncate_inode_items(trans, log_root, &control); 4239 } 4240 4241 static void fill_inode_item(struct btrfs_trans_handle *trans, 4242 struct extent_buffer *leaf, 4243 struct btrfs_inode_item *item, 4244 struct inode *inode, int log_inode_only, 4245 u64 logged_isize) 4246 { 4247 struct btrfs_map_token token; 4248 u64 flags; 4249 4250 btrfs_init_map_token(&token, leaf); 4251 4252 if (log_inode_only) { 4253 /* set the generation to zero so the recover code 4254 * can tell the difference between an logging 4255 * just to say 'this inode exists' and a logging 4256 * to say 'update this inode with these values' 4257 */ 4258 btrfs_set_token_inode_generation(&token, item, 0); 4259 btrfs_set_token_inode_size(&token, item, logged_isize); 4260 } else { 4261 btrfs_set_token_inode_generation(&token, item, 4262 BTRFS_I(inode)->generation); 4263 btrfs_set_token_inode_size(&token, item, inode->i_size); 4264 } 4265 4266 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 4267 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 4268 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 4269 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 4270 4271 btrfs_set_token_timespec_sec(&token, &item->atime, 4272 inode->i_atime.tv_sec); 4273 btrfs_set_token_timespec_nsec(&token, &item->atime, 4274 inode->i_atime.tv_nsec); 4275 4276 btrfs_set_token_timespec_sec(&token, &item->mtime, 4277 inode->i_mtime.tv_sec); 4278 btrfs_set_token_timespec_nsec(&token, &item->mtime, 4279 inode->i_mtime.tv_nsec); 4280 4281 btrfs_set_token_timespec_sec(&token, &item->ctime, 4282 inode->i_ctime.tv_sec); 4283 btrfs_set_token_timespec_nsec(&token, &item->ctime, 4284 inode->i_ctime.tv_nsec); 4285 4286 /* 4287 * We do not need to set the nbytes field, in fact during a fast fsync 4288 * its value may not even be correct, since a fast fsync 
does not wait 4289 * for ordered extent completion, which is where we update nbytes, it 4290 * only waits for writeback to complete. During log replay as we find 4291 * file extent items and replay them, we adjust the nbytes field of the 4292 * inode item in subvolume tree as needed (see overwrite_item()). 4293 */ 4294 4295 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4296 btrfs_set_token_inode_transid(&token, item, trans->transid); 4297 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4298 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4299 BTRFS_I(inode)->ro_flags); 4300 btrfs_set_token_inode_flags(&token, item, flags); 4301 btrfs_set_token_inode_block_group(&token, item, 0); 4302 } 4303 4304 static int log_inode_item(struct btrfs_trans_handle *trans, 4305 struct btrfs_root *log, struct btrfs_path *path, 4306 struct btrfs_inode *inode, bool inode_item_dropped) 4307 { 4308 struct btrfs_inode_item *inode_item; 4309 int ret; 4310 4311 /* 4312 * If we are doing a fast fsync and the inode was logged before in the 4313 * current transaction, then we know the inode was previously logged and 4314 * it exists in the log tree. For performance reasons, in this case use 4315 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4316 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4317 * contention in case there are concurrent fsyncs for other inodes of the 4318 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4319 * already exists can also result in unnecessarily splitting a leaf. 4320 */ 4321 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4322 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4323 ASSERT(ret <= 0); 4324 if (ret > 0) 4325 ret = -ENOENT; 4326 } else { 4327 /* 4328 * This means it is the first fsync in the current transaction, 4329 * so the inode item is not in the log and we need to insert it. 4330 * We can never get -EEXIST because we are only called for a fast 4331 * fsync and in case an inode eviction happens after the inode was 4332 * logged before in the current transaction, when we load again 4333 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4334 * flags and set ->logged_trans to 0. 4335 */ 4336 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4337 sizeof(*inode_item)); 4338 ASSERT(ret != -EEXIST); 4339 } 4340 if (ret) 4341 return ret; 4342 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4343 struct btrfs_inode_item); 4344 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4345 0, 0); 4346 btrfs_release_path(path); 4347 return 0; 4348 } 4349 4350 static int log_csums(struct btrfs_trans_handle *trans, 4351 struct btrfs_inode *inode, 4352 struct btrfs_root *log_root, 4353 struct btrfs_ordered_sum *sums) 4354 { 4355 const u64 lock_end = sums->bytenr + sums->len - 1; 4356 struct extent_state *cached_state = NULL; 4357 int ret; 4358 4359 /* 4360 * If this inode was not used for reflink operations in the current 4361 * transaction with new extents, then do the fast path, no need to 4362 * worry about logging checksum items with overlapping ranges. 4363 */ 4364 if (inode->last_reflink_trans < trans->transid) 4365 return btrfs_csum_file_blocks(trans, log_root, sums); 4366 4367 /* 4368 * Serialize logging for checksums. 
This is to avoid racing with the 4369 * same checksum being logged by another task that is logging another 4370 * file which happens to refer to the same extent as well. Such races 4371 * can leave checksum items in the log with overlapping ranges. 4372 */ 4373 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 4374 lock_end, &cached_state); 4375 if (ret) 4376 return ret; 4377 /* 4378 * Due to extent cloning, we might have logged a csum item that covers a 4379 * subrange of a cloned extent, and later we can end up logging a csum 4380 * item for a larger subrange of the same extent or the entire range. 4381 * This would leave csum items in the log tree that cover the same range 4382 * and break the searches for checksums in the log tree, resulting in 4383 * some checksums missing in the fs/subvolume tree. So just delete (or 4384 * trim and adjust) any existing csum items in the log for this range. 4385 */ 4386 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4387 if (!ret) 4388 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4389 4390 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 4391 &cached_state); 4392 4393 return ret; 4394 } 4395 4396 static noinline int copy_items(struct btrfs_trans_handle *trans, 4397 struct btrfs_inode *inode, 4398 struct btrfs_path *dst_path, 4399 struct btrfs_path *src_path, 4400 int start_slot, int nr, int inode_only, 4401 u64 logged_isize) 4402 { 4403 struct btrfs_root *log = inode->root->log_root; 4404 struct btrfs_file_extent_item *extent; 4405 struct extent_buffer *src = src_path->nodes[0]; 4406 int ret = 0; 4407 struct btrfs_key *ins_keys; 4408 u32 *ins_sizes; 4409 struct btrfs_item_batch batch; 4410 char *ins_data; 4411 int i; 4412 int dst_index; 4413 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4414 const u64 i_size = i_size_read(&inode->vfs_inode); 4415 4416 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4417 nr * sizeof(u32), GFP_NOFS); 4418 if (!ins_data) 4419 return -ENOMEM; 4420 4421 ins_sizes = (u32 *)ins_data; 4422 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4423 batch.keys = ins_keys; 4424 batch.data_sizes = ins_sizes; 4425 batch.total_data_size = 0; 4426 batch.nr = 0; 4427 4428 dst_index = 0; 4429 for (i = 0; i < nr; i++) { 4430 const int src_slot = start_slot + i; 4431 struct btrfs_root *csum_root; 4432 struct btrfs_ordered_sum *sums; 4433 struct btrfs_ordered_sum *sums_next; 4434 LIST_HEAD(ordered_sums); 4435 u64 disk_bytenr; 4436 u64 disk_num_bytes; 4437 u64 extent_offset; 4438 u64 extent_num_bytes; 4439 bool is_old_extent; 4440 4441 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4442 4443 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4444 goto add_to_batch; 4445 4446 extent = btrfs_item_ptr(src, src_slot, 4447 struct btrfs_file_extent_item); 4448 4449 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4450 trans->transid); 4451 4452 /* 4453 * Don't copy extents from past generations. That would make us 4454 * log a lot more metadata for common cases like doing only a 4455 * few random writes into a file and then fsync it for the first 4456 * time or after the full sync flag is set on the inode. We can 4457 * get leaves full of extent items, most of which are from past 4458 * generations, so we can skip them - as long as the inode has 4459 * not been the target of a reflink operation in this transaction, 4460 * as in that case it might have had file extent items with old 4461 * generations copied into it. 
We also must always log prealloc 4462 * extents that start at or beyond eof, otherwise we would lose 4463 * them on log replay. 4464 */ 4465 if (is_old_extent && 4466 ins_keys[dst_index].offset < i_size && 4467 inode->last_reflink_trans < trans->transid) 4468 continue; 4469 4470 if (skip_csum) 4471 goto add_to_batch; 4472 4473 /* Only regular extents have checksums. */ 4474 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4475 goto add_to_batch; 4476 4477 /* 4478 * If it's an extent created in a past transaction, then its 4479 * checksums are already accessible from the committed csum tree, 4480 * no need to log them. 4481 */ 4482 if (is_old_extent) 4483 goto add_to_batch; 4484 4485 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4486 /* If it's an explicit hole, there are no checksums. */ 4487 if (disk_bytenr == 0) 4488 goto add_to_batch; 4489 4490 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4491 4492 if (btrfs_file_extent_compression(src, extent)) { 4493 extent_offset = 0; 4494 extent_num_bytes = disk_num_bytes; 4495 } else { 4496 extent_offset = btrfs_file_extent_offset(src, extent); 4497 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4498 } 4499 4500 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4501 disk_bytenr += extent_offset; 4502 ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, 4503 disk_bytenr + extent_num_bytes - 1, 4504 &ordered_sums, 0); 4505 if (ret) 4506 goto out; 4507 4508 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4509 if (!ret) 4510 ret = log_csums(trans, inode, log, sums); 4511 list_del(&sums->list); 4512 kfree(sums); 4513 } 4514 if (ret) 4515 goto out; 4516 4517 add_to_batch: 4518 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4519 batch.total_data_size += ins_sizes[dst_index]; 4520 batch.nr++; 4521 dst_index++; 4522 } 4523 4524 /* 4525 * We have a leaf full of old extent items that don't need to be logged, 4526 * so we don't need to do anything. 4527 */ 4528 if (batch.nr == 0) 4529 goto out; 4530 4531 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4532 if (ret) 4533 goto out; 4534 4535 dst_index = 0; 4536 for (i = 0; i < nr; i++) { 4537 const int src_slot = start_slot + i; 4538 const int dst_slot = dst_path->slots[0] + dst_index; 4539 struct btrfs_key key; 4540 unsigned long src_offset; 4541 unsigned long dst_offset; 4542 4543 /* 4544 * We're done, all the remaining items in the source leaf 4545 * correspond to old file extent items. 4546 */ 4547 if (dst_index >= batch.nr) 4548 break; 4549 4550 btrfs_item_key_to_cpu(src, &key, src_slot); 4551 4552 if (key.type != BTRFS_EXTENT_DATA_KEY) 4553 goto copy_item; 4554 4555 extent = btrfs_item_ptr(src, src_slot, 4556 struct btrfs_file_extent_item); 4557 4558 /* See the comment in the previous loop, same logic. 
*/ 4559 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4560 key.offset < i_size && 4561 inode->last_reflink_trans < trans->transid) 4562 continue; 4563 4564 copy_item: 4565 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4566 src_offset = btrfs_item_ptr_offset(src, src_slot); 4567 4568 if (key.type == BTRFS_INODE_ITEM_KEY) { 4569 struct btrfs_inode_item *inode_item; 4570 4571 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4572 struct btrfs_inode_item); 4573 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4574 &inode->vfs_inode, 4575 inode_only == LOG_INODE_EXISTS, 4576 logged_isize); 4577 } else { 4578 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4579 src_offset, ins_sizes[dst_index]); 4580 } 4581 4582 dst_index++; 4583 } 4584 4585 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4586 btrfs_release_path(dst_path); 4587 out: 4588 kfree(ins_data); 4589 4590 return ret; 4591 } 4592 4593 static int extent_cmp(void *priv, const struct list_head *a, 4594 const struct list_head *b) 4595 { 4596 const struct extent_map *em1, *em2; 4597 4598 em1 = list_entry(a, struct extent_map, list); 4599 em2 = list_entry(b, struct extent_map, list); 4600 4601 if (em1->start < em2->start) 4602 return -1; 4603 else if (em1->start > em2->start) 4604 return 1; 4605 return 0; 4606 } 4607 4608 static int log_extent_csums(struct btrfs_trans_handle *trans, 4609 struct btrfs_inode *inode, 4610 struct btrfs_root *log_root, 4611 const struct extent_map *em, 4612 struct btrfs_log_ctx *ctx) 4613 { 4614 struct btrfs_ordered_extent *ordered; 4615 struct btrfs_root *csum_root; 4616 u64 csum_offset; 4617 u64 csum_len; 4618 u64 mod_start = em->mod_start; 4619 u64 mod_len = em->mod_len; 4620 LIST_HEAD(ordered_sums); 4621 int ret = 0; 4622 4623 if (inode->flags & BTRFS_INODE_NODATASUM || 4624 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4625 em->block_start == EXTENT_MAP_HOLE) 4626 return 0; 4627 4628 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4629 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4630 const u64 mod_end = mod_start + mod_len; 4631 struct btrfs_ordered_sum *sums; 4632 4633 if (mod_len == 0) 4634 break; 4635 4636 if (ordered_end <= mod_start) 4637 continue; 4638 if (mod_end <= ordered->file_offset) 4639 break; 4640 4641 /* 4642 * We are going to copy all the csums on this ordered extent, so 4643 * go ahead and adjust mod_start and mod_len in case this ordered 4644 * extent has already been logged. 4645 */ 4646 if (ordered->file_offset > mod_start) { 4647 if (ordered_end >= mod_end) 4648 mod_len = ordered->file_offset - mod_start; 4649 /* 4650 * If we have this case 4651 * 4652 * |--------- logged extent ---------| 4653 * |----- ordered extent ----| 4654 * 4655 * Just don't mess with mod_start and mod_len, we'll 4656 * just end up logging more csums than we need and it 4657 * will be ok. 4658 */ 4659 } else { 4660 if (ordered_end < mod_end) { 4661 mod_len = mod_end - ordered_end; 4662 mod_start = ordered_end; 4663 } else { 4664 mod_len = 0; 4665 } 4666 } 4667 4668 /* 4669 * To keep us from looping for the above case of an ordered 4670 * extent that falls inside of the logged extent. 4671 */ 4672 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4673 continue; 4674 4675 list_for_each_entry(sums, &ordered->list, list) { 4676 ret = log_csums(trans, inode, log_root, sums); 4677 if (ret) 4678 return ret; 4679 } 4680 } 4681 4682 /* We're done, found all csums in the ordered extents. 
*/ 4683 if (mod_len == 0) 4684 return 0; 4685 4686 /* If we're compressed we have to save the entire range of csums. */ 4687 if (em->compress_type) { 4688 csum_offset = 0; 4689 csum_len = max(em->block_len, em->orig_block_len); 4690 } else { 4691 csum_offset = mod_start - em->start; 4692 csum_len = mod_len; 4693 } 4694 4695 /* block start is already adjusted for the file extent offset. */ 4696 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4697 ret = btrfs_lookup_csums_range(csum_root, 4698 em->block_start + csum_offset, 4699 em->block_start + csum_offset + 4700 csum_len - 1, &ordered_sums, 0); 4701 if (ret) 4702 return ret; 4703 4704 while (!list_empty(&ordered_sums)) { 4705 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4706 struct btrfs_ordered_sum, 4707 list); 4708 if (!ret) 4709 ret = log_csums(trans, inode, log_root, sums); 4710 list_del(&sums->list); 4711 kfree(sums); 4712 } 4713 4714 return ret; 4715 } 4716 4717 static int log_one_extent(struct btrfs_trans_handle *trans, 4718 struct btrfs_inode *inode, 4719 const struct extent_map *em, 4720 struct btrfs_path *path, 4721 struct btrfs_log_ctx *ctx) 4722 { 4723 struct btrfs_drop_extents_args drop_args = { 0 }; 4724 struct btrfs_root *log = inode->root->log_root; 4725 struct btrfs_file_extent_item fi = { 0 }; 4726 struct extent_buffer *leaf; 4727 struct btrfs_key key; 4728 u64 extent_offset = em->start - em->orig_start; 4729 u64 block_len; 4730 int ret; 4731 4732 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4733 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4734 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4735 else 4736 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4737 4738 block_len = max(em->block_len, em->orig_block_len); 4739 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4740 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4741 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4742 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4743 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4744 extent_offset); 4745 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4746 } 4747 4748 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4749 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4750 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4751 btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4752 4753 ret = log_extent_csums(trans, inode, log, em, ctx); 4754 if (ret) 4755 return ret; 4756 4757 /* 4758 * If this is the first time we are logging the inode in the current 4759 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4760 * because it does a deletion search, which always acquires write locks 4761 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4762 * but also adds significant contention in a log tree, since log trees 4763 * are small, with a root at level 2 or 3 at most, due to their short 4764 * life span. 
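 * For example, the first fsync of an inode in the current transaction
 * starts with no items for it in this transaction's log tree, so there
 * is nothing to drop and we go straight to inserting the new file extent
 * item below.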
4765 */ 4766 if (ctx->logged_before) { 4767 drop_args.path = path; 4768 drop_args.start = em->start; 4769 drop_args.end = em->start + em->len; 4770 drop_args.replace_extent = true; 4771 drop_args.extent_item_size = sizeof(fi); 4772 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4773 if (ret) 4774 return ret; 4775 } 4776 4777 if (!drop_args.extent_inserted) { 4778 key.objectid = btrfs_ino(inode); 4779 key.type = BTRFS_EXTENT_DATA_KEY; 4780 key.offset = em->start; 4781 4782 ret = btrfs_insert_empty_item(trans, log, path, &key, 4783 sizeof(fi)); 4784 if (ret) 4785 return ret; 4786 } 4787 leaf = path->nodes[0]; 4788 write_extent_buffer(leaf, &fi, 4789 btrfs_item_ptr_offset(leaf, path->slots[0]), 4790 sizeof(fi)); 4791 btrfs_mark_buffer_dirty(leaf); 4792 4793 btrfs_release_path(path); 4794 4795 return ret; 4796 } 4797 4798 /* 4799 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4800 * lose them after doing a full/fast fsync and replaying the log. We scan the 4801 * subvolume's root instead of iterating the inode's extent map tree because 4802 * otherwise we can log incorrect extent items based on extent map conversion. 4803 * That can happen due to the fact that extent maps are merged when they 4804 * are not in the extent map tree's list of modified extents. 4805 */ 4806 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4807 struct btrfs_inode *inode, 4808 struct btrfs_path *path) 4809 { 4810 struct btrfs_root *root = inode->root; 4811 struct btrfs_key key; 4812 const u64 i_size = i_size_read(&inode->vfs_inode); 4813 const u64 ino = btrfs_ino(inode); 4814 struct btrfs_path *dst_path = NULL; 4815 bool dropped_extents = false; 4816 u64 truncate_offset = i_size; 4817 struct extent_buffer *leaf; 4818 int slot; 4819 int ins_nr = 0; 4820 int start_slot; 4821 int ret; 4822 4823 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4824 return 0; 4825 4826 key.objectid = ino; 4827 key.type = BTRFS_EXTENT_DATA_KEY; 4828 key.offset = i_size; 4829 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4830 if (ret < 0) 4831 goto out; 4832 4833 /* 4834 * We must check if there is a prealloc extent that starts before the 4835 * i_size and crosses the i_size boundary. This is to ensure later we 4836 * truncate down to the end of that extent and not to the i_size, as 4837 * otherwise we end up losing part of the prealloc extent after a log 4838 * replay and with an implicit hole if there is another prealloc extent 4839 * that starts at an offset beyond i_size. 
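 * As a purely illustrative example (the offsets are arbitrary): with an
 * i_size of 8K, a prealloc extent covering [4K, 16K) and another covering
 * [16K, 32K), truncating the log copy at 8K would leave [8K, 16K) as an
 * implicit hole after replay, so we truncate at 16K, the end of the
 * extent that crosses i_size, instead.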
4840 */ 4841 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4842 if (ret < 0) 4843 goto out; 4844 4845 if (ret == 0) { 4846 struct btrfs_file_extent_item *ei; 4847 4848 leaf = path->nodes[0]; 4849 slot = path->slots[0]; 4850 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4851 4852 if (btrfs_file_extent_type(leaf, ei) == 4853 BTRFS_FILE_EXTENT_PREALLOC) { 4854 u64 extent_end; 4855 4856 btrfs_item_key_to_cpu(leaf, &key, slot); 4857 extent_end = key.offset + 4858 btrfs_file_extent_num_bytes(leaf, ei); 4859 4860 if (extent_end > i_size) 4861 truncate_offset = extent_end; 4862 } 4863 } else { 4864 ret = 0; 4865 } 4866 4867 while (true) { 4868 leaf = path->nodes[0]; 4869 slot = path->slots[0]; 4870 4871 if (slot >= btrfs_header_nritems(leaf)) { 4872 if (ins_nr > 0) { 4873 ret = copy_items(trans, inode, dst_path, path, 4874 start_slot, ins_nr, 1, 0); 4875 if (ret < 0) 4876 goto out; 4877 ins_nr = 0; 4878 } 4879 ret = btrfs_next_leaf(root, path); 4880 if (ret < 0) 4881 goto out; 4882 if (ret > 0) { 4883 ret = 0; 4884 break; 4885 } 4886 continue; 4887 } 4888 4889 btrfs_item_key_to_cpu(leaf, &key, slot); 4890 if (key.objectid > ino) 4891 break; 4892 if (WARN_ON_ONCE(key.objectid < ino) || 4893 key.type < BTRFS_EXTENT_DATA_KEY || 4894 key.offset < i_size) { 4895 path->slots[0]++; 4896 continue; 4897 } 4898 if (!dropped_extents) { 4899 /* 4900 * Avoid logging extent items logged in past fsync calls 4901 * and leading to duplicate keys in the log tree. 4902 */ 4903 ret = truncate_inode_items(trans, root->log_root, inode, 4904 truncate_offset, 4905 BTRFS_EXTENT_DATA_KEY); 4906 if (ret) 4907 goto out; 4908 dropped_extents = true; 4909 } 4910 if (ins_nr == 0) 4911 start_slot = slot; 4912 ins_nr++; 4913 path->slots[0]++; 4914 if (!dst_path) { 4915 dst_path = btrfs_alloc_path(); 4916 if (!dst_path) { 4917 ret = -ENOMEM; 4918 goto out; 4919 } 4920 } 4921 } 4922 if (ins_nr > 0) 4923 ret = copy_items(trans, inode, dst_path, path, 4924 start_slot, ins_nr, 1, 0); 4925 out: 4926 btrfs_release_path(path); 4927 btrfs_free_path(dst_path); 4928 return ret; 4929 } 4930 4931 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4932 struct btrfs_inode *inode, 4933 struct btrfs_path *path, 4934 struct btrfs_log_ctx *ctx) 4935 { 4936 struct btrfs_ordered_extent *ordered; 4937 struct btrfs_ordered_extent *tmp; 4938 struct extent_map *em, *n; 4939 struct list_head extents; 4940 struct extent_map_tree *tree = &inode->extent_tree; 4941 int ret = 0; 4942 int num = 0; 4943 4944 INIT_LIST_HEAD(&extents); 4945 4946 write_lock(&tree->lock); 4947 4948 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4949 list_del_init(&em->list); 4950 /* 4951 * Just an arbitrary number, this can be really CPU intensive 4952 * once we start getting a lot of extents, and really once we 4953 * have a bunch of extents we just want to commit since it will 4954 * be faster. 4955 */ 4956 if (++num > 32768) { 4957 list_del_init(&tree->modified_extents); 4958 ret = -EFBIG; 4959 goto process; 4960 } 4961 4962 if (em->generation < trans->transid) 4963 continue; 4964 4965 /* We log prealloc extents beyond eof later. 
*/ 4966 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4967 em->start >= i_size_read(&inode->vfs_inode)) 4968 continue; 4969 4970 /* Need a ref to keep it from getting evicted from cache */ 4971 refcount_inc(&em->refs); 4972 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4973 list_add_tail(&em->list, &extents); 4974 num++; 4975 } 4976 4977 list_sort(NULL, &extents, extent_cmp); 4978 process: 4979 while (!list_empty(&extents)) { 4980 em = list_entry(extents.next, struct extent_map, list); 4981 4982 list_del_init(&em->list); 4983 4984 /* 4985 * If we had an error we just need to delete everybody from our 4986 * private list. 4987 */ 4988 if (ret) { 4989 clear_em_logging(tree, em); 4990 free_extent_map(em); 4991 continue; 4992 } 4993 4994 write_unlock(&tree->lock); 4995 4996 ret = log_one_extent(trans, inode, em, path, ctx); 4997 write_lock(&tree->lock); 4998 clear_em_logging(tree, em); 4999 free_extent_map(em); 5000 } 5001 WARN_ON(!list_empty(&extents)); 5002 write_unlock(&tree->lock); 5003 5004 if (!ret) 5005 ret = btrfs_log_prealloc_extents(trans, inode, path); 5006 if (ret) 5007 return ret; 5008 5009 /* 5010 * We have logged all extents successfully, now make sure the commit of 5011 * the current transaction waits for the ordered extents to complete 5012 * before it commits and wipes out the log trees, otherwise we would 5013 * lose data if an ordered extents completes after the transaction 5014 * commits and a power failure happens after the transaction commit. 5015 */ 5016 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 5017 list_del_init(&ordered->log_list); 5018 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 5019 5020 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 5021 spin_lock_irq(&inode->ordered_tree.lock); 5022 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 5023 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 5024 atomic_inc(&trans->transaction->pending_ordered); 5025 } 5026 spin_unlock_irq(&inode->ordered_tree.lock); 5027 } 5028 btrfs_put_ordered_extent(ordered); 5029 } 5030 5031 return 0; 5032 } 5033 5034 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 5035 struct btrfs_path *path, u64 *size_ret) 5036 { 5037 struct btrfs_key key; 5038 int ret; 5039 5040 key.objectid = btrfs_ino(inode); 5041 key.type = BTRFS_INODE_ITEM_KEY; 5042 key.offset = 0; 5043 5044 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 5045 if (ret < 0) { 5046 return ret; 5047 } else if (ret > 0) { 5048 *size_ret = 0; 5049 } else { 5050 struct btrfs_inode_item *item; 5051 5052 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5053 struct btrfs_inode_item); 5054 *size_ret = btrfs_inode_size(path->nodes[0], item); 5055 /* 5056 * If the in-memory inode's i_size is smaller then the inode 5057 * size stored in the btree, return the inode's i_size, so 5058 * that we get a correct inode size after replaying the log 5059 * when before a power failure we had a shrinking truncate 5060 * followed by addition of a new name (rename / new hard link). 5061 * Otherwise return the inode size from the btree, to avoid 5062 * data loss when replaying a log due to previously doing a 5063 * write that expands the inode's size and logging a new name 5064 * immediately after. 5065 */ 5066 if (*size_ret > inode->vfs_inode.i_size) 5067 *size_ret = inode->vfs_inode.i_size; 5068 } 5069 5070 btrfs_release_path(path); 5071 return 0; 5072 } 5073 5074 /* 5075 * At the moment we always log all xattrs. 
This is to figure out at log replay 5076 * time which xattrs must have their deletion replayed. If a xattr is missing 5077 * in the log tree and exists in the fs/subvol tree, we delete it. This is 5078 * because if a xattr is deleted, the inode is fsynced and a power failure 5079 * happens, causing the log to be replayed the next time the fs is mounted, 5080 * we want the xattr to not exist anymore (same behaviour as other filesystems 5081 * with a journal, ext3/4, xfs, f2fs, etc). 5082 */ 5083 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 5084 struct btrfs_inode *inode, 5085 struct btrfs_path *path, 5086 struct btrfs_path *dst_path) 5087 { 5088 struct btrfs_root *root = inode->root; 5089 int ret; 5090 struct btrfs_key key; 5091 const u64 ino = btrfs_ino(inode); 5092 int ins_nr = 0; 5093 int start_slot = 0; 5094 bool found_xattrs = false; 5095 5096 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5097 return 0; 5098 5099 key.objectid = ino; 5100 key.type = BTRFS_XATTR_ITEM_KEY; 5101 key.offset = 0; 5102 5103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5104 if (ret < 0) 5105 return ret; 5106 5107 while (true) { 5108 int slot = path->slots[0]; 5109 struct extent_buffer *leaf = path->nodes[0]; 5110 int nritems = btrfs_header_nritems(leaf); 5111 5112 if (slot >= nritems) { 5113 if (ins_nr > 0) { 5114 ret = copy_items(trans, inode, dst_path, path, 5115 start_slot, ins_nr, 1, 0); 5116 if (ret < 0) 5117 return ret; 5118 ins_nr = 0; 5119 } 5120 ret = btrfs_next_leaf(root, path); 5121 if (ret < 0) 5122 return ret; 5123 else if (ret > 0) 5124 break; 5125 continue; 5126 } 5127 5128 btrfs_item_key_to_cpu(leaf, &key, slot); 5129 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5130 break; 5131 5132 if (ins_nr == 0) 5133 start_slot = slot; 5134 ins_nr++; 5135 path->slots[0]++; 5136 found_xattrs = true; 5137 cond_resched(); 5138 } 5139 if (ins_nr > 0) { 5140 ret = copy_items(trans, inode, dst_path, path, 5141 start_slot, ins_nr, 1, 0); 5142 if (ret < 0) 5143 return ret; 5144 } 5145 5146 if (!found_xattrs) 5147 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5148 5149 return 0; 5150 } 5151 5152 /* 5153 * When using the NO_HOLES feature if we punched a hole that causes the 5154 * deletion of entire leafs or all the extent items of the first leaf (the one 5155 * that contains the inode item and references) we may end up not processing 5156 * any extents, because there are no leafs with a generation matching the 5157 * current transaction that have extent items for our inode. So we need to find 5158 * if any holes exist and then log them. We also need to log holes after any 5159 * truncate operation that changes the inode's size. 
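 * For example (the offsets are arbitrary): if the extent items we find
 * for the inode cover [0, 4K) and [8K, 12K) and i_size is 12K, we insert
 * a file extent item with a zero disk_bytenr covering the [4K, 8K) gap
 * in the log, so that log replay recreates the hole.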
5160 */ 5161 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5162 struct btrfs_inode *inode, 5163 struct btrfs_path *path) 5164 { 5165 struct btrfs_root *root = inode->root; 5166 struct btrfs_fs_info *fs_info = root->fs_info; 5167 struct btrfs_key key; 5168 const u64 ino = btrfs_ino(inode); 5169 const u64 i_size = i_size_read(&inode->vfs_inode); 5170 u64 prev_extent_end = 0; 5171 int ret; 5172 5173 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5174 return 0; 5175 5176 key.objectid = ino; 5177 key.type = BTRFS_EXTENT_DATA_KEY; 5178 key.offset = 0; 5179 5180 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5181 if (ret < 0) 5182 return ret; 5183 5184 while (true) { 5185 struct extent_buffer *leaf = path->nodes[0]; 5186 5187 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5188 ret = btrfs_next_leaf(root, path); 5189 if (ret < 0) 5190 return ret; 5191 if (ret > 0) { 5192 ret = 0; 5193 break; 5194 } 5195 leaf = path->nodes[0]; 5196 } 5197 5198 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5199 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5200 break; 5201 5202 /* We have a hole, log it. */ 5203 if (prev_extent_end < key.offset) { 5204 const u64 hole_len = key.offset - prev_extent_end; 5205 5206 /* 5207 * Release the path to avoid deadlocks with other code 5208 * paths that search the root while holding locks on 5209 * leafs from the log root. 5210 */ 5211 btrfs_release_path(path); 5212 ret = btrfs_insert_file_extent(trans, root->log_root, 5213 ino, prev_extent_end, 0, 5214 0, hole_len, 0, hole_len, 5215 0, 0, 0); 5216 if (ret < 0) 5217 return ret; 5218 5219 /* 5220 * Search for the same key again in the root. Since it's 5221 * an extent item and we are holding the inode lock, the 5222 * key must still exist. If it doesn't just emit warning 5223 * and return an error to fall back to a transaction 5224 * commit. 5225 */ 5226 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5227 if (ret < 0) 5228 return ret; 5229 if (WARN_ON(ret > 0)) 5230 return -ENOENT; 5231 leaf = path->nodes[0]; 5232 } 5233 5234 prev_extent_end = btrfs_file_extent_end(path); 5235 path->slots[0]++; 5236 cond_resched(); 5237 } 5238 5239 if (prev_extent_end < i_size) { 5240 u64 hole_len; 5241 5242 btrfs_release_path(path); 5243 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5244 ret = btrfs_insert_file_extent(trans, root->log_root, 5245 ino, prev_extent_end, 0, 0, 5246 hole_len, 0, hole_len, 5247 0, 0, 0); 5248 if (ret < 0) 5249 return ret; 5250 } 5251 5252 return 0; 5253 } 5254 5255 /* 5256 * When we are logging a new inode X, check if it doesn't have a reference that 5257 * matches the reference from some other inode Y created in a past transaction 5258 * and that was renamed in the current transaction. If we don't do this, then at 5259 * log replay time we can lose inode Y (and all its files if it's a directory): 5260 * 5261 * mkdir /mnt/x 5262 * echo "hello world" > /mnt/x/foobar 5263 * sync 5264 * mv /mnt/x /mnt/y 5265 * mkdir /mnt/x # or touch /mnt/x 5266 * xfs_io -c fsync /mnt/x 5267 * <power fail> 5268 * mount fs, trigger log replay 5269 * 5270 * After the log replay procedure, we would lose the first directory and all its 5271 * files (file foobar). 
5272 * For the case where inode Y is not a directory we simply end up losing it: 5273 * 5274 * echo "123" > /mnt/foo 5275 * sync 5276 * mv /mnt/foo /mnt/bar 5277 * echo "abc" > /mnt/foo 5278 * xfs_io -c fsync /mnt/foo 5279 * <power fail> 5280 * 5281 * We also need this for cases where a snapshot entry is replaced by some other 5282 * entry (file or directory) otherwise we end up with an unreplayable log due to 5283 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5284 * if it were a regular entry: 5285 * 5286 * mkdir /mnt/x 5287 * btrfs subvolume snapshot /mnt /mnt/x/snap 5288 * btrfs subvolume delete /mnt/x/snap 5289 * rmdir /mnt/x 5290 * mkdir /mnt/x 5291 * fsync /mnt/x or fsync some new file inside it 5292 * <power fail> 5293 * 5294 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5295 * the same transaction. 5296 */ 5297 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5298 const int slot, 5299 const struct btrfs_key *key, 5300 struct btrfs_inode *inode, 5301 u64 *other_ino, u64 *other_parent) 5302 { 5303 int ret; 5304 struct btrfs_path *search_path; 5305 char *name = NULL; 5306 u32 name_len = 0; 5307 u32 item_size = btrfs_item_size(eb, slot); 5308 u32 cur_offset = 0; 5309 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5310 5311 search_path = btrfs_alloc_path(); 5312 if (!search_path) 5313 return -ENOMEM; 5314 search_path->search_commit_root = 1; 5315 search_path->skip_locking = 1; 5316 5317 while (cur_offset < item_size) { 5318 u64 parent; 5319 u32 this_name_len; 5320 u32 this_len; 5321 unsigned long name_ptr; 5322 struct btrfs_dir_item *di; 5323 5324 if (key->type == BTRFS_INODE_REF_KEY) { 5325 struct btrfs_inode_ref *iref; 5326 5327 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5328 parent = key->offset; 5329 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5330 name_ptr = (unsigned long)(iref + 1); 5331 this_len = sizeof(*iref) + this_name_len; 5332 } else { 5333 struct btrfs_inode_extref *extref; 5334 5335 extref = (struct btrfs_inode_extref *)(ptr + 5336 cur_offset); 5337 parent = btrfs_inode_extref_parent(eb, extref); 5338 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5339 name_ptr = (unsigned long)&extref->name; 5340 this_len = sizeof(*extref) + this_name_len; 5341 } 5342 5343 if (this_name_len > name_len) { 5344 char *new_name; 5345 5346 new_name = krealloc(name, this_name_len, GFP_NOFS); 5347 if (!new_name) { 5348 ret = -ENOMEM; 5349 goto out; 5350 } 5351 name_len = this_name_len; 5352 name = new_name; 5353 } 5354 5355 read_extent_buffer(eb, name, name_ptr, this_name_len); 5356 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5357 parent, name, this_name_len, 0); 5358 if (di && !IS_ERR(di)) { 5359 struct btrfs_key di_key; 5360 5361 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5362 di, &di_key); 5363 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5364 if (di_key.objectid != key->objectid) { 5365 ret = 1; 5366 *other_ino = di_key.objectid; 5367 *other_parent = parent; 5368 } else { 5369 ret = 0; 5370 } 5371 } else { 5372 ret = -EAGAIN; 5373 } 5374 goto out; 5375 } else if (IS_ERR(di)) { 5376 ret = PTR_ERR(di); 5377 goto out; 5378 } 5379 btrfs_release_path(search_path); 5380 5381 cur_offset += this_len; 5382 } 5383 ret = 0; 5384 out: 5385 btrfs_free_path(search_path); 5386 kfree(name); 5387 return ret; 5388 } 5389 5390 struct btrfs_ino_list { 5391 u64 ino; 5392 u64 parent; 5393 struct list_head list; 5394 }; 5395 5396 static int log_conflicting_inodes(struct 
btrfs_trans_handle *trans, 5397 struct btrfs_root *root, 5398 struct btrfs_path *path, 5399 struct btrfs_log_ctx *ctx, 5400 u64 ino, u64 parent) 5401 { 5402 struct btrfs_ino_list *ino_elem; 5403 LIST_HEAD(inode_list); 5404 int ret = 0; 5405 5406 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5407 if (!ino_elem) 5408 return -ENOMEM; 5409 ino_elem->ino = ino; 5410 ino_elem->parent = parent; 5411 list_add_tail(&ino_elem->list, &inode_list); 5412 5413 while (!list_empty(&inode_list)) { 5414 struct btrfs_fs_info *fs_info = root->fs_info; 5415 struct btrfs_key key; 5416 struct inode *inode; 5417 5418 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 5419 list); 5420 ino = ino_elem->ino; 5421 parent = ino_elem->parent; 5422 list_del(&ino_elem->list); 5423 kfree(ino_elem); 5424 if (ret) 5425 continue; 5426 5427 btrfs_release_path(path); 5428 5429 inode = btrfs_iget(fs_info->sb, ino, root); 5430 /* 5431 * If the other inode that had a conflicting dir entry was 5432 * deleted in the current transaction, we need to log its parent 5433 * directory. 5434 */ 5435 if (IS_ERR(inode)) { 5436 ret = PTR_ERR(inode); 5437 if (ret == -ENOENT) { 5438 inode = btrfs_iget(fs_info->sb, parent, root); 5439 if (IS_ERR(inode)) { 5440 ret = PTR_ERR(inode); 5441 } else { 5442 ret = btrfs_log_inode(trans, 5443 BTRFS_I(inode), 5444 LOG_OTHER_INODE_ALL, 5445 ctx); 5446 btrfs_add_delayed_iput(inode); 5447 } 5448 } 5449 continue; 5450 } 5451 /* 5452 * If the inode was already logged skip it - otherwise we can 5453 * hit an infinite loop. Example: 5454 * 5455 * From the commit root (previous transaction) we have the 5456 * following inodes: 5457 * 5458 * inode 257 a directory 5459 * inode 258 with references "zz" and "zz_link" on inode 257 5460 * inode 259 with reference "a" on inode 257 5461 * 5462 * And in the current (uncommitted) transaction we have: 5463 * 5464 * inode 257 a directory, unchanged 5465 * inode 258 with references "a" and "a2" on inode 257 5466 * inode 259 with reference "zz_link" on inode 257 5467 * inode 261 with reference "zz" on inode 257 5468 * 5469 * When logging inode 261 the following infinite loop could 5470 * happen if we don't skip already logged inodes: 5471 * 5472 * - we detect inode 258 as a conflicting inode, with inode 261 5473 * on reference "zz", and log it; 5474 * 5475 * - we detect inode 259 as a conflicting inode, with inode 258 5476 * on reference "a", and log it; 5477 * 5478 * - we detect inode 258 as a conflicting inode, with inode 259 5479 * on reference "zz_link", and log it - again! After this we 5480 * repeat the above steps forever. 5481 */ 5482 spin_lock(&BTRFS_I(inode)->lock); 5483 /* 5484 * Check the inode's logged_trans only instead of 5485 * btrfs_inode_in_log(). This is because the last_log_commit of 5486 * the inode is not updated when we only log that it exists (see 5487 * btrfs_log_inode()). 5488 */ 5489 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5490 spin_unlock(&BTRFS_I(inode)->lock); 5491 btrfs_add_delayed_iput(inode); 5492 continue; 5493 } 5494 spin_unlock(&BTRFS_I(inode)->lock); 5495 /* 5496 * We are safe logging the other inode without acquiring its 5497 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5498 * are safe against concurrent renames of the other inode as 5499 * well because during a rename we pin the log and update the 5500 * log with the new name before we unpin it. 
5501 */ 5502 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); 5503 if (ret) { 5504 btrfs_add_delayed_iput(inode); 5505 continue; 5506 } 5507 5508 key.objectid = ino; 5509 key.type = BTRFS_INODE_REF_KEY; 5510 key.offset = 0; 5511 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5512 if (ret < 0) { 5513 btrfs_add_delayed_iput(inode); 5514 continue; 5515 } 5516 5517 while (true) { 5518 struct extent_buffer *leaf = path->nodes[0]; 5519 int slot = path->slots[0]; 5520 u64 other_ino = 0; 5521 u64 other_parent = 0; 5522 5523 if (slot >= btrfs_header_nritems(leaf)) { 5524 ret = btrfs_next_leaf(root, path); 5525 if (ret < 0) { 5526 break; 5527 } else if (ret > 0) { 5528 ret = 0; 5529 break; 5530 } 5531 continue; 5532 } 5533 5534 btrfs_item_key_to_cpu(leaf, &key, slot); 5535 if (key.objectid != ino || 5536 (key.type != BTRFS_INODE_REF_KEY && 5537 key.type != BTRFS_INODE_EXTREF_KEY)) { 5538 ret = 0; 5539 break; 5540 } 5541 5542 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5543 BTRFS_I(inode), &other_ino, 5544 &other_parent); 5545 if (ret < 0) 5546 break; 5547 if (ret > 0) { 5548 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5549 if (!ino_elem) { 5550 ret = -ENOMEM; 5551 break; 5552 } 5553 ino_elem->ino = other_ino; 5554 ino_elem->parent = other_parent; 5555 list_add_tail(&ino_elem->list, &inode_list); 5556 ret = 0; 5557 } 5558 path->slots[0]++; 5559 } 5560 btrfs_add_delayed_iput(inode); 5561 } 5562 5563 return ret; 5564 } 5565 5566 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5567 struct btrfs_inode *inode, 5568 struct btrfs_key *min_key, 5569 const struct btrfs_key *max_key, 5570 struct btrfs_path *path, 5571 struct btrfs_path *dst_path, 5572 const u64 logged_isize, 5573 const bool recursive_logging, 5574 const int inode_only, 5575 struct btrfs_log_ctx *ctx, 5576 bool *need_log_inode_item) 5577 { 5578 const u64 i_size = i_size_read(&inode->vfs_inode); 5579 struct btrfs_root *root = inode->root; 5580 int ins_start_slot = 0; 5581 int ins_nr = 0; 5582 int ret; 5583 5584 while (1) { 5585 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5586 if (ret < 0) 5587 return ret; 5588 if (ret > 0) { 5589 ret = 0; 5590 break; 5591 } 5592 again: 5593 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5594 if (min_key->objectid != max_key->objectid) 5595 break; 5596 if (min_key->type > max_key->type) 5597 break; 5598 5599 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5600 *need_log_inode_item = false; 5601 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5602 min_key->offset >= i_size) { 5603 /* 5604 * Extents at and beyond eof are logged with 5605 * btrfs_log_prealloc_extents(). 5606 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5607 * and no keys greater than that, so bail out. 
5608 */ 5609 break; 5610 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5611 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5612 inode->generation == trans->transid && 5613 !recursive_logging) { 5614 u64 other_ino = 0; 5615 u64 other_parent = 0; 5616 5617 ret = btrfs_check_ref_name_override(path->nodes[0], 5618 path->slots[0], min_key, inode, 5619 &other_ino, &other_parent); 5620 if (ret < 0) { 5621 return ret; 5622 } else if (ret > 0 && 5623 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5624 if (ins_nr > 0) { 5625 ins_nr++; 5626 } else { 5627 ins_nr = 1; 5628 ins_start_slot = path->slots[0]; 5629 } 5630 ret = copy_items(trans, inode, dst_path, path, 5631 ins_start_slot, ins_nr, 5632 inode_only, logged_isize); 5633 if (ret < 0) 5634 return ret; 5635 ins_nr = 0; 5636 5637 ret = log_conflicting_inodes(trans, root, path, 5638 ctx, other_ino, other_parent); 5639 if (ret) 5640 return ret; 5641 btrfs_release_path(path); 5642 goto next_key; 5643 } 5644 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5645 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5646 if (ins_nr == 0) 5647 goto next_slot; 5648 ret = copy_items(trans, inode, dst_path, path, 5649 ins_start_slot, 5650 ins_nr, inode_only, logged_isize); 5651 if (ret < 0) 5652 return ret; 5653 ins_nr = 0; 5654 goto next_slot; 5655 } 5656 5657 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5658 ins_nr++; 5659 goto next_slot; 5660 } else if (!ins_nr) { 5661 ins_start_slot = path->slots[0]; 5662 ins_nr = 1; 5663 goto next_slot; 5664 } 5665 5666 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5667 ins_nr, inode_only, logged_isize); 5668 if (ret < 0) 5669 return ret; 5670 ins_nr = 1; 5671 ins_start_slot = path->slots[0]; 5672 next_slot: 5673 path->slots[0]++; 5674 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5675 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5676 path->slots[0]); 5677 goto again; 5678 } 5679 if (ins_nr) { 5680 ret = copy_items(trans, inode, dst_path, path, 5681 ins_start_slot, ins_nr, inode_only, 5682 logged_isize); 5683 if (ret < 0) 5684 return ret; 5685 ins_nr = 0; 5686 } 5687 btrfs_release_path(path); 5688 next_key: 5689 if (min_key->offset < (u64)-1) { 5690 min_key->offset++; 5691 } else if (min_key->type < max_key->type) { 5692 min_key->type++; 5693 min_key->offset = 0; 5694 } else { 5695 break; 5696 } 5697 5698 /* 5699 * We may process many leaves full of items for our inode, so 5700 * avoid monopolizing a cpu for too long by rescheduling while 5701 * not holding locks on any tree. 5702 */ 5703 cond_resched(); 5704 } 5705 if (ins_nr) { 5706 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5707 ins_nr, inode_only, logged_isize); 5708 if (ret) 5709 return ret; 5710 } 5711 5712 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5713 /* 5714 * Release the path because otherwise we might attempt to double 5715 * lock the same leaf with btrfs_log_prealloc_extents() below. 5716 */ 5717 btrfs_release_path(path); 5718 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5719 } 5720 5721 return ret; 5722 } 5723 5724 /* log a single inode in the tree log. 5725 * At least one parent directory for this inode must exist in the tree 5726 * or be logged already. 5727 * 5728 * Any items from this inode changed by the current transaction are copied 5729 * to the log tree. 
An extra reference is taken on any extents in this 5730 * file, allowing us to avoid a whole pile of corner cases around logging 5731 * blocks that have been removed from the tree. 5732 * 5733 * See LOG_INODE_ALL and related defines for a description of what inode_only 5734 * does. 5735 * 5736 * This handles both files and directories. 5737 */ 5738 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5739 struct btrfs_inode *inode, 5740 int inode_only, 5741 struct btrfs_log_ctx *ctx) 5742 { 5743 struct btrfs_path *path; 5744 struct btrfs_path *dst_path; 5745 struct btrfs_key min_key; 5746 struct btrfs_key max_key; 5747 struct btrfs_root *log = inode->root->log_root; 5748 int ret; 5749 bool fast_search = false; 5750 u64 ino = btrfs_ino(inode); 5751 struct extent_map_tree *em_tree = &inode->extent_tree; 5752 u64 logged_isize = 0; 5753 bool need_log_inode_item = true; 5754 bool xattrs_logged = false; 5755 bool recursive_logging = false; 5756 bool inode_item_dropped = true; 5757 const bool orig_logged_before = ctx->logged_before; 5758 5759 path = btrfs_alloc_path(); 5760 if (!path) 5761 return -ENOMEM; 5762 dst_path = btrfs_alloc_path(); 5763 if (!dst_path) { 5764 btrfs_free_path(path); 5765 return -ENOMEM; 5766 } 5767 5768 min_key.objectid = ino; 5769 min_key.type = BTRFS_INODE_ITEM_KEY; 5770 min_key.offset = 0; 5771 5772 max_key.objectid = ino; 5773 5774 5775 /* today the code can only do partial logging of directories */ 5776 if (S_ISDIR(inode->vfs_inode.i_mode) || 5777 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5778 &inode->runtime_flags) && 5779 inode_only >= LOG_INODE_EXISTS)) 5780 max_key.type = BTRFS_XATTR_ITEM_KEY; 5781 else 5782 max_key.type = (u8)-1; 5783 max_key.offset = (u64)-1; 5784 5785 /* 5786 * Only run delayed items if we are a directory. We want to make sure 5787 * all directory indexes hit the fs/subvolume tree so we can find them 5788 * and figure out which index ranges have to be logged. 5789 */ 5790 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5791 ret = btrfs_commit_inode_delayed_items(trans, inode); 5792 if (ret) 5793 goto out; 5794 } 5795 5796 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5797 recursive_logging = true; 5798 if (inode_only == LOG_OTHER_INODE) 5799 inode_only = LOG_INODE_EXISTS; 5800 else 5801 inode_only = LOG_INODE_ALL; 5802 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5803 } else { 5804 mutex_lock(&inode->log_mutex); 5805 } 5806 5807 /* 5808 * Before logging the inode item, cache the value returned by 5809 * inode_logged(), because after that we need to figure out if 5810 * the inode was previously logged in this transaction. 5811 */ 5812 ret = inode_logged(trans, inode, path); 5813 if (ret < 0) 5814 goto out_unlock; 5815 ctx->logged_before = (ret == 1); 5816 ret = 0; 5817 5818 /* 5819 * This is for cases where logging a directory could result in losing 5820 * a file after replaying the log. For example, if we move a file from a 5821 * directory A to a directory B, then fsync directory A, we have no way 5822 * to know the file was moved from A to B, so logging just A would 5823 * result in losing the file after a log replay. 5824 */ 5825 if (S_ISDIR(inode->vfs_inode.i_mode) && 5826 inode_only == LOG_INODE_ALL && 5827 inode->last_unlink_trans >= trans->transid) { 5828 btrfs_set_log_full_commit(trans); 5829 ret = 1; 5830 goto out_unlock; 5831 } 5832 5833 /* 5834 * a brute force approach to making sure we get the most up to date 5835 * copies of everything.
5836 */ 5837 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5838 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5839 5840 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 5841 if (inode_only == LOG_INODE_EXISTS) 5842 max_key_type = BTRFS_XATTR_ITEM_KEY; 5843 if (ctx->logged_before) 5844 ret = drop_inode_items(trans, log, path, inode, 5845 max_key_type); 5846 } else { 5847 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 5848 /* 5849 * Make sure the new inode item we write to the log has 5850 * the same isize as the current one (if it exists). 5851 * This is necessary to prevent data loss after log 5852 * replay, and also to prevent doing a wrong expanding 5853 * truncate - for e.g. create file, write 4K into offset 5854 * 0, fsync, write 4K into offset 4096, add hard link, 5855 * fsync some other file (to sync log), power fail - if 5856 * we use the inode's current i_size, after log replay 5857 * we get a 8Kb file, with the last 4Kb extent as a hole 5858 * (zeroes), as if an expanding truncate happened, 5859 * instead of getting a file of 4Kb only. 5860 */ 5861 ret = logged_inode_size(log, inode, path, &logged_isize); 5862 if (ret) 5863 goto out_unlock; 5864 } 5865 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5866 &inode->runtime_flags)) { 5867 if (inode_only == LOG_INODE_EXISTS) { 5868 max_key.type = BTRFS_XATTR_ITEM_KEY; 5869 if (ctx->logged_before) 5870 ret = drop_inode_items(trans, log, path, 5871 inode, max_key.type); 5872 } else { 5873 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5874 &inode->runtime_flags); 5875 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5876 &inode->runtime_flags); 5877 if (ctx->logged_before) 5878 ret = truncate_inode_items(trans, log, 5879 inode, 0, 0); 5880 } 5881 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5882 &inode->runtime_flags) || 5883 inode_only == LOG_INODE_EXISTS) { 5884 if (inode_only == LOG_INODE_ALL) 5885 fast_search = true; 5886 max_key.type = BTRFS_XATTR_ITEM_KEY; 5887 if (ctx->logged_before) 5888 ret = drop_inode_items(trans, log, path, inode, 5889 max_key.type); 5890 } else { 5891 if (inode_only == LOG_INODE_ALL) 5892 fast_search = true; 5893 inode_item_dropped = false; 5894 goto log_extents; 5895 } 5896 5897 } 5898 if (ret) 5899 goto out_unlock; 5900 5901 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5902 path, dst_path, logged_isize, 5903 recursive_logging, inode_only, ctx, 5904 &need_log_inode_item); 5905 if (ret) 5906 goto out_unlock; 5907 5908 btrfs_release_path(path); 5909 btrfs_release_path(dst_path); 5910 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5911 if (ret) 5912 goto out_unlock; 5913 xattrs_logged = true; 5914 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5915 btrfs_release_path(path); 5916 btrfs_release_path(dst_path); 5917 ret = btrfs_log_holes(trans, inode, path); 5918 if (ret) 5919 goto out_unlock; 5920 } 5921 log_extents: 5922 btrfs_release_path(path); 5923 btrfs_release_path(dst_path); 5924 if (need_log_inode_item) { 5925 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5926 if (ret) 5927 goto out_unlock; 5928 /* 5929 * If we are doing a fast fsync and the inode was logged before 5930 * in this transaction, we don't need to log the xattrs because 5931 * they were logged before. If xattrs were added, changed or 5932 * deleted since the last time we logged the inode, then we have 5933 * already logged them because the inode had the runtime flag 5934 * BTRFS_INODE_COPY_EVERYTHING set. 
5935 */ 5936 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5937 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5938 if (ret) 5939 goto out_unlock; 5940 btrfs_release_path(path); 5941 } 5942 } 5943 if (fast_search) { 5944 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 5945 if (ret) 5946 goto out_unlock; 5947 } else if (inode_only == LOG_INODE_ALL) { 5948 struct extent_map *em, *n; 5949 5950 write_lock(&em_tree->lock); 5951 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5952 list_del_init(&em->list); 5953 write_unlock(&em_tree->lock); 5954 } 5955 5956 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5957 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 5958 if (ret) 5959 goto out_unlock; 5960 } 5961 5962 spin_lock(&inode->lock); 5963 inode->logged_trans = trans->transid; 5964 /* 5965 * Don't update last_log_commit if we logged that an inode exists. 5966 * We do this for three reasons: 5967 * 5968 * 1) We might have had buffered writes to this inode that were 5969 * flushed and had their ordered extents completed in this 5970 * transaction, but we did not previously log the inode with 5971 * LOG_INODE_ALL. Later the inode was evicted and after that 5972 * it was loaded again and this LOG_INODE_EXISTS log operation 5973 * happened. We must make sure that if an explicit fsync against 5974 * the inode is performed later, it logs the new extents, an 5975 * updated inode item, etc, and syncs the log. The same logic 5976 * applies to direct IO writes instead of buffered writes. 5977 * 5978 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 5979 * is logged with an i_size of 0 or whatever value was logged 5980 * before. If later the i_size of the inode is increased by a 5981 * truncate operation, the log is synced through an fsync of 5982 * some other inode and then finally an explicit fsync against 5983 * this inode is made, we must make sure this fsync logs the 5984 * inode with the new i_size, the hole between old i_size and 5985 * the new i_size, and syncs the log. 5986 * 5987 * 3) If we are logging that an ancestor inode exists as part of 5988 * logging a new name from a link or rename operation, don't update 5989 * its last_log_commit - otherwise if an explicit fsync is made 5990 * against an ancestor, the fsync considers the inode in the log 5991 * and doesn't sync the log, resulting in the ancestor missing after 5992 * a power failure unless the log was synced as part of an fsync 5993 * against any other unrelated inode. 5994 */ 5995 if (inode_only != LOG_INODE_EXISTS) 5996 inode->last_log_commit = inode->last_sub_trans; 5997 spin_unlock(&inode->lock); 5998 5999 /* 6000 * Reset the last_reflink_trans so that the next fsync does not need to 6001 * go through the slower path when logging extents and their checksums. 6002 */ 6003 if (inode_only == LOG_INODE_ALL) 6004 inode->last_reflink_trans = 0; 6005 6006 out_unlock: 6007 mutex_unlock(&inode->log_mutex); 6008 out: 6009 btrfs_free_path(path); 6010 btrfs_free_path(dst_path); 6011 6012 if (recursive_logging) 6013 ctx->logged_before = orig_logged_before; 6014 6015 return ret; 6016 } 6017 6018 /* 6019 * Check if we need to log an inode. This is used in contexts where while 6020 * logging an inode we need to log another inode (either that it exists or in 6021 * full mode). 
This is used instead of btrfs_inode_in_log() because the latter 6022 * requires the inode to be in the log and have the log transaction committed, 6023 * while here we do not care if the log transaction was already committed - our 6024 * caller will commit the log later - and we want to avoid logging an inode 6025 * multiple times when multiple tasks have joined the same log transaction. 6026 */ 6027 static bool need_log_inode(struct btrfs_trans_handle *trans, 6028 struct btrfs_inode *inode) 6029 { 6030 /* 6031 * If a directory was not modified, no dentries added or removed, we can 6032 * and should avoid logging it. 6033 */ 6034 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 6035 return false; 6036 6037 /* 6038 * If this inode does not have new/updated/deleted xattrs since the last 6039 * time it was logged and is flagged as logged in the current transaction, 6040 * we can skip logging it. As for new/deleted names, those are updated in 6041 * the log by link/unlink/rename operations. 6042 * In case the inode was logged and then evicted and reloaded, its 6043 * logged_trans will be 0, in which case we have to fully log it since 6044 * logged_trans is a transient field, not persisted. 6045 */ 6046 if (inode->logged_trans == trans->transid && 6047 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 6048 return false; 6049 6050 return true; 6051 } 6052 6053 struct btrfs_dir_list { 6054 u64 ino; 6055 struct list_head list; 6056 }; 6057 6058 /* 6059 * Log the inodes of the new dentries of a directory. See log_dir_items() for 6060 * details about why it is needed. 6061 * This is a recursive operation - if an existing dentry corresponds to a 6062 * directory, that directory's new entries are logged too (same behaviour as 6063 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 6064 * the dentries point to we do not lock their i_mutex, otherwise lockdep 6065 * complains about the following circular lock dependency / possible deadlock: 6066 * 6067 * CPU0 CPU1 6068 * ---- ---- 6069 * lock(&type->i_mutex_dir_key#3/2); 6070 * lock(sb_internal#2); 6071 * lock(&type->i_mutex_dir_key#3/2); 6072 * lock(&sb->s_type->i_mutex_key#14); 6073 * 6074 * Where sb_internal is the lock (a counter that works as a lock) acquired by 6075 * sb_start_intwrite() in btrfs_start_transaction(). 6076 * Not locking i_mutex of the inodes is still safe because: 6077 * 6078 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 6079 * that while logging the inode new references (names) are added or removed 6080 * from the inode, leaving the logged inode item with a link count that does 6081 * not match the number of logged inode reference items. This is fine because 6082 * at log replay time we compute the real number of links and correct the 6083 * link count in the inode item (see replay_one_buffer() and 6084 * link_to_fixup_dir()); 6085 * 6086 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that 6087 * while logging the inode's items new index items (key type 6088 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 6089 * has a size that doesn't match the sum of the lengths of all the logged 6090 * names - this is ok, not a problem, because at log replay time we set the 6091 * directory's i_size to the correct value (see replay_one_name() and 6092 * do_overwrite_item()).
6093 */ 6094 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 6095 struct btrfs_root *root, 6096 struct btrfs_inode *start_inode, 6097 struct btrfs_log_ctx *ctx) 6098 { 6099 struct btrfs_fs_info *fs_info = root->fs_info; 6100 struct btrfs_path *path; 6101 LIST_HEAD(dir_list); 6102 struct btrfs_dir_list *dir_elem; 6103 int ret = 0; 6104 6105 /* 6106 * If we are logging a new name, as part of a link or rename operation, 6107 * don't bother logging new dentries, as we just want to log the names 6108 * of an inode and that any new parents exist. 6109 */ 6110 if (ctx->logging_new_name) 6111 return 0; 6112 6113 path = btrfs_alloc_path(); 6114 if (!path) 6115 return -ENOMEM; 6116 6117 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 6118 if (!dir_elem) { 6119 btrfs_free_path(path); 6120 return -ENOMEM; 6121 } 6122 dir_elem->ino = btrfs_ino(start_inode); 6123 list_add_tail(&dir_elem->list, &dir_list); 6124 6125 while (!list_empty(&dir_list)) { 6126 struct extent_buffer *leaf; 6127 struct btrfs_key min_key; 6128 int nritems; 6129 int i; 6130 6131 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 6132 list); 6133 if (ret) 6134 goto next_dir_inode; 6135 6136 min_key.objectid = dir_elem->ino; 6137 min_key.type = BTRFS_DIR_INDEX_KEY; 6138 min_key.offset = 0; 6139 again: 6140 btrfs_release_path(path); 6141 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 6142 if (ret < 0) { 6143 goto next_dir_inode; 6144 } else if (ret > 0) { 6145 ret = 0; 6146 goto next_dir_inode; 6147 } 6148 6149 leaf = path->nodes[0]; 6150 nritems = btrfs_header_nritems(leaf); 6151 for (i = path->slots[0]; i < nritems; i++) { 6152 struct btrfs_dir_item *di; 6153 struct btrfs_key di_key; 6154 struct inode *di_inode; 6155 struct btrfs_dir_list *new_dir_elem; 6156 int log_mode = LOG_INODE_EXISTS; 6157 int type; 6158 6159 btrfs_item_key_to_cpu(leaf, &min_key, i); 6160 if (min_key.objectid != dir_elem->ino || 6161 min_key.type != BTRFS_DIR_INDEX_KEY) 6162 goto next_dir_inode; 6163 6164 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 6165 type = btrfs_dir_type(leaf, di); 6166 if (btrfs_dir_transid(leaf, di) < trans->transid) 6167 continue; 6168 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 6169 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 6170 continue; 6171 6172 btrfs_release_path(path); 6173 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 6174 if (IS_ERR(di_inode)) { 6175 ret = PTR_ERR(di_inode); 6176 goto next_dir_inode; 6177 } 6178 6179 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6180 btrfs_add_delayed_iput(di_inode); 6181 break; 6182 } 6183 6184 ctx->log_new_dentries = false; 6185 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) 6186 log_mode = LOG_INODE_ALL; 6187 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 6188 log_mode, ctx); 6189 btrfs_add_delayed_iput(di_inode); 6190 if (ret) 6191 goto next_dir_inode; 6192 if (ctx->log_new_dentries) { 6193 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 6194 GFP_NOFS); 6195 if (!new_dir_elem) { 6196 ret = -ENOMEM; 6197 goto next_dir_inode; 6198 } 6199 new_dir_elem->ino = di_key.objectid; 6200 list_add_tail(&new_dir_elem->list, &dir_list); 6201 } 6202 break; 6203 } 6204 if (min_key.offset < (u64)-1) { 6205 min_key.offset++; 6206 goto again; 6207 } 6208 next_dir_inode: 6209 list_del(&dir_elem->list); 6210 kfree(dir_elem); 6211 } 6212 6213 btrfs_free_path(path); 6214 return ret; 6215 } 6216 6217 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6218 struct btrfs_inode *inode, 6219 struct btrfs_log_ctx *ctx) 6220 { 
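/* Log every parent directory of the inode: walk its BTRFS_INODE_REF_KEY and BTRFS_INODE_EXTREF_KEY items in the commit root and log each parent directory found through them with LOG_INODE_ALL, so that stale dir index entries for an unlinked name do not survive log replay. */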
6221 struct btrfs_fs_info *fs_info = trans->fs_info; 6222 int ret; 6223 struct btrfs_path *path; 6224 struct btrfs_key key; 6225 struct btrfs_root *root = inode->root; 6226 const u64 ino = btrfs_ino(inode); 6227 6228 path = btrfs_alloc_path(); 6229 if (!path) 6230 return -ENOMEM; 6231 path->skip_locking = 1; 6232 path->search_commit_root = 1; 6233 6234 key.objectid = ino; 6235 key.type = BTRFS_INODE_REF_KEY; 6236 key.offset = 0; 6237 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6238 if (ret < 0) 6239 goto out; 6240 6241 while (true) { 6242 struct extent_buffer *leaf = path->nodes[0]; 6243 int slot = path->slots[0]; 6244 u32 cur_offset = 0; 6245 u32 item_size; 6246 unsigned long ptr; 6247 6248 if (slot >= btrfs_header_nritems(leaf)) { 6249 ret = btrfs_next_leaf(root, path); 6250 if (ret < 0) 6251 goto out; 6252 else if (ret > 0) 6253 break; 6254 continue; 6255 } 6256 6257 btrfs_item_key_to_cpu(leaf, &key, slot); 6258 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6259 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6260 break; 6261 6262 item_size = btrfs_item_size(leaf, slot); 6263 ptr = btrfs_item_ptr_offset(leaf, slot); 6264 while (cur_offset < item_size) { 6265 struct btrfs_key inode_key; 6266 struct inode *dir_inode; 6267 6268 inode_key.type = BTRFS_INODE_ITEM_KEY; 6269 inode_key.offset = 0; 6270 6271 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6272 struct btrfs_inode_extref *extref; 6273 6274 extref = (struct btrfs_inode_extref *) 6275 (ptr + cur_offset); 6276 inode_key.objectid = btrfs_inode_extref_parent( 6277 leaf, extref); 6278 cur_offset += sizeof(*extref); 6279 cur_offset += btrfs_inode_extref_name_len(leaf, 6280 extref); 6281 } else { 6282 inode_key.objectid = key.offset; 6283 cur_offset = item_size; 6284 } 6285 6286 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6287 root); 6288 /* 6289 * If the parent inode was deleted, return an error to 6290 * fallback to a transaction commit. This is to prevent 6291 * getting an inode that was moved from one parent A to 6292 * a parent B, got its former parent A deleted and then 6293 * it got fsync'ed, from existing at both parents after 6294 * a log replay (and the old parent still existing). 6295 * Example: 6296 * 6297 * mkdir /mnt/A 6298 * mkdir /mnt/B 6299 * touch /mnt/B/bar 6300 * sync 6301 * mv /mnt/B/bar /mnt/A/bar 6302 * mv -T /mnt/A /mnt/B 6303 * fsync /mnt/B/bar 6304 * <power fail> 6305 * 6306 * If we ignore the old parent B which got deleted, 6307 * after a log replay we would have file bar linked 6308 * at both parents and the old parent B would still 6309 * exist. 
6310 */ 6311 if (IS_ERR(dir_inode)) { 6312 ret = PTR_ERR(dir_inode); 6313 goto out; 6314 } 6315 6316 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6317 btrfs_add_delayed_iput(dir_inode); 6318 continue; 6319 } 6320 6321 ctx->log_new_dentries = false; 6322 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6323 LOG_INODE_ALL, ctx); 6324 if (!ret && ctx->log_new_dentries) 6325 ret = log_new_dir_dentries(trans, root, 6326 BTRFS_I(dir_inode), ctx); 6327 btrfs_add_delayed_iput(dir_inode); 6328 if (ret) 6329 goto out; 6330 } 6331 path->slots[0]++; 6332 } 6333 ret = 0; 6334 out: 6335 btrfs_free_path(path); 6336 return ret; 6337 } 6338 6339 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6340 struct btrfs_root *root, 6341 struct btrfs_path *path, 6342 struct btrfs_log_ctx *ctx) 6343 { 6344 struct btrfs_key found_key; 6345 6346 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6347 6348 while (true) { 6349 struct btrfs_fs_info *fs_info = root->fs_info; 6350 struct extent_buffer *leaf = path->nodes[0]; 6351 int slot = path->slots[0]; 6352 struct btrfs_key search_key; 6353 struct inode *inode; 6354 u64 ino; 6355 int ret = 0; 6356 6357 btrfs_release_path(path); 6358 6359 ino = found_key.offset; 6360 6361 search_key.objectid = found_key.offset; 6362 search_key.type = BTRFS_INODE_ITEM_KEY; 6363 search_key.offset = 0; 6364 inode = btrfs_iget(fs_info->sb, ino, root); 6365 if (IS_ERR(inode)) 6366 return PTR_ERR(inode); 6367 6368 if (BTRFS_I(inode)->generation >= trans->transid && 6369 need_log_inode(trans, BTRFS_I(inode))) 6370 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6371 LOG_INODE_EXISTS, ctx); 6372 btrfs_add_delayed_iput(inode); 6373 if (ret) 6374 return ret; 6375 6376 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6377 break; 6378 6379 search_key.type = BTRFS_INODE_REF_KEY; 6380 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6381 if (ret < 0) 6382 return ret; 6383 6384 leaf = path->nodes[0]; 6385 slot = path->slots[0]; 6386 if (slot >= btrfs_header_nritems(leaf)) { 6387 ret = btrfs_next_leaf(root, path); 6388 if (ret < 0) 6389 return ret; 6390 else if (ret > 0) 6391 return -ENOENT; 6392 leaf = path->nodes[0]; 6393 slot = path->slots[0]; 6394 } 6395 6396 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6397 if (found_key.objectid != search_key.objectid || 6398 found_key.type != BTRFS_INODE_REF_KEY) 6399 return -ENOENT; 6400 } 6401 return 0; 6402 } 6403 6404 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6405 struct btrfs_inode *inode, 6406 struct dentry *parent, 6407 struct btrfs_log_ctx *ctx) 6408 { 6409 struct btrfs_root *root = inode->root; 6410 struct dentry *old_parent = NULL; 6411 struct super_block *sb = inode->vfs_inode.i_sb; 6412 int ret = 0; 6413 6414 while (true) { 6415 if (!parent || d_really_is_negative(parent) || 6416 sb != parent->d_sb) 6417 break; 6418 6419 inode = BTRFS_I(d_inode(parent)); 6420 if (root != inode->root) 6421 break; 6422 6423 if (inode->generation >= trans->transid && 6424 need_log_inode(trans, inode)) { 6425 ret = btrfs_log_inode(trans, inode, 6426 LOG_INODE_EXISTS, ctx); 6427 if (ret) 6428 break; 6429 } 6430 if (IS_ROOT(parent)) 6431 break; 6432 6433 parent = dget_parent(parent); 6434 dput(old_parent); 6435 old_parent = parent; 6436 } 6437 dput(old_parent); 6438 6439 return ret; 6440 } 6441 6442 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6443 struct btrfs_inode *inode, 6444 struct dentry *parent, 6445 struct btrfs_log_ctx *ctx) 6446 { 6447 struct btrfs_root *root = 
inode->root; 6448 const u64 ino = btrfs_ino(inode); 6449 struct btrfs_path *path; 6450 struct btrfs_key search_key; 6451 int ret; 6452 6453 /* 6454 * For a single hard link case, go through a fast path that does not 6455 * need to iterate the fs/subvolume tree. 6456 */ 6457 if (inode->vfs_inode.i_nlink < 2) 6458 return log_new_ancestors_fast(trans, inode, parent, ctx); 6459 6460 path = btrfs_alloc_path(); 6461 if (!path) 6462 return -ENOMEM; 6463 6464 search_key.objectid = ino; 6465 search_key.type = BTRFS_INODE_REF_KEY; 6466 search_key.offset = 0; 6467 again: 6468 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6469 if (ret < 0) 6470 goto out; 6471 if (ret == 0) 6472 path->slots[0]++; 6473 6474 while (true) { 6475 struct extent_buffer *leaf = path->nodes[0]; 6476 int slot = path->slots[0]; 6477 struct btrfs_key found_key; 6478 6479 if (slot >= btrfs_header_nritems(leaf)) { 6480 ret = btrfs_next_leaf(root, path); 6481 if (ret < 0) 6482 goto out; 6483 else if (ret > 0) 6484 break; 6485 continue; 6486 } 6487 6488 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6489 if (found_key.objectid != ino || 6490 found_key.type > BTRFS_INODE_EXTREF_KEY) 6491 break; 6492 6493 /* 6494 * Don't deal with extended references because they are rare 6495 * cases and too complex to deal with (we would need to keep 6496 * track of which subitem we are processing for each item in 6497 * this loop, etc). So just return some error to fallback to 6498 * a transaction commit. 6499 */ 6500 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6501 ret = -EMLINK; 6502 goto out; 6503 } 6504 6505 /* 6506 * Logging ancestors needs to do more searches on the fs/subvol 6507 * tree, so it releases the path as needed to avoid deadlocks. 6508 * Keep track of the last inode ref key and resume from that key 6509 * after logging all new ancestors for the current hard link. 6510 */ 6511 memcpy(&search_key, &found_key, sizeof(search_key)); 6512 6513 ret = log_new_ancestors(trans, root, path, ctx); 6514 if (ret) 6515 goto out; 6516 btrfs_release_path(path); 6517 goto again; 6518 } 6519 ret = 0; 6520 out: 6521 btrfs_free_path(path); 6522 return ret; 6523 } 6524 6525 /* 6526 * helper function around btrfs_log_inode to make sure newly created 6527 * parent directories also end up in the log. A minimal inode and backref 6528 * only logging is done of any parent directories that are older than 6529 * the last committed transaction 6530 */ 6531 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6532 struct btrfs_inode *inode, 6533 struct dentry *parent, 6534 int inode_only, 6535 struct btrfs_log_ctx *ctx) 6536 { 6537 struct btrfs_root *root = inode->root; 6538 struct btrfs_fs_info *fs_info = root->fs_info; 6539 int ret = 0; 6540 bool log_dentries = false; 6541 6542 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6543 ret = 1; 6544 goto end_no_trans; 6545 } 6546 6547 if (btrfs_root_refs(&root->root_item) == 0) { 6548 ret = 1; 6549 goto end_no_trans; 6550 } 6551 6552 /* 6553 * Skip already logged inodes or inodes corresponding to tmpfiles 6554 * (since logging them is pointless, a link count of 0 means they 6555 * will never be accessible). 
6556 */ 6557 if ((btrfs_inode_in_log(inode, trans->transid) && 6558 list_empty(&ctx->ordered_extents)) || 6559 inode->vfs_inode.i_nlink == 0) { 6560 ret = BTRFS_NO_LOG_SYNC; 6561 goto end_no_trans; 6562 } 6563 6564 ret = start_log_trans(trans, root, ctx); 6565 if (ret) 6566 goto end_no_trans; 6567 6568 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 6569 if (ret) 6570 goto end_trans; 6571 6572 /* 6573 * for regular files, if its inode is already on disk, we don't 6574 * have to worry about the parents at all. This is because 6575 * we can use the last_unlink_trans field to record renames 6576 * and other fun in this file. 6577 */ 6578 if (S_ISREG(inode->vfs_inode.i_mode) && 6579 inode->generation < trans->transid && 6580 inode->last_unlink_trans < trans->transid) { 6581 ret = 0; 6582 goto end_trans; 6583 } 6584 6585 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 6586 log_dentries = true; 6587 6588 /* 6589 * On unlink we must make sure all our current and old parent directory 6590 * inodes are fully logged. This is to prevent leaving dangling 6591 * directory index entries in directories that were our parents but are 6592 * not anymore. Not doing this results in old parent directory being 6593 * impossible to delete after log replay (rmdir will always fail with 6594 * error -ENOTEMPTY). 6595 * 6596 * Example 1: 6597 * 6598 * mkdir testdir 6599 * touch testdir/foo 6600 * ln testdir/foo testdir/bar 6601 * sync 6602 * unlink testdir/bar 6603 * xfs_io -c fsync testdir/foo 6604 * <power failure> 6605 * mount fs, triggers log replay 6606 * 6607 * If we don't log the parent directory (testdir), after log replay the 6608 * directory still has an entry pointing to the file inode using the bar 6609 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6610 * the file inode has a link count of 1. 6611 * 6612 * Example 2: 6613 * 6614 * mkdir testdir 6615 * touch foo 6616 * ln foo testdir/foo2 6617 * ln foo testdir/foo3 6618 * sync 6619 * unlink testdir/foo3 6620 * xfs_io -c fsync foo 6621 * <power failure> 6622 * mount fs, triggers log replay 6623 * 6624 * Similar as the first example, after log replay the parent directory 6625 * testdir still has an entry pointing to the inode file with name foo3 6626 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6627 * and has a link count of 2. 6628 */ 6629 if (inode->last_unlink_trans >= trans->transid) { 6630 ret = btrfs_log_all_parents(trans, inode, ctx); 6631 if (ret) 6632 goto end_trans; 6633 } 6634 6635 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6636 if (ret) 6637 goto end_trans; 6638 6639 if (log_dentries) 6640 ret = log_new_dir_dentries(trans, root, inode, ctx); 6641 else 6642 ret = 0; 6643 end_trans: 6644 if (ret < 0) { 6645 btrfs_set_log_full_commit(trans); 6646 ret = 1; 6647 } 6648 6649 if (ret) 6650 btrfs_remove_log_ctx(root, ctx); 6651 btrfs_end_log_trans(root); 6652 end_no_trans: 6653 return ret; 6654 } 6655 6656 /* 6657 * it is not safe to log dentry if the chunk root has added new 6658 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6659 * If this returns 1, you must commit the transaction to safely get your 6660 * data on disk. 
6661 */ 6662 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6663 struct dentry *dentry, 6664 struct btrfs_log_ctx *ctx) 6665 { 6666 struct dentry *parent = dget_parent(dentry); 6667 int ret; 6668 6669 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6670 LOG_INODE_ALL, ctx); 6671 dput(parent); 6672 6673 return ret; 6674 } 6675 6676 /* 6677 * should be called during mount to recover and replay any log trees 6678 * from the FS 6679 */ 6680 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6681 { 6682 int ret; 6683 struct btrfs_path *path; 6684 struct btrfs_trans_handle *trans; 6685 struct btrfs_key key; 6686 struct btrfs_key found_key; 6687 struct btrfs_root *log; 6688 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6689 struct walk_control wc = { 6690 .process_func = process_one_buffer, 6691 .stage = LOG_WALK_PIN_ONLY, 6692 }; 6693 6694 path = btrfs_alloc_path(); 6695 if (!path) 6696 return -ENOMEM; 6697 6698 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6699 6700 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6701 if (IS_ERR(trans)) { 6702 ret = PTR_ERR(trans); 6703 goto error; 6704 } 6705 6706 wc.trans = trans; 6707 wc.pin = 1; 6708 6709 ret = walk_log_tree(trans, log_root_tree, &wc); 6710 if (ret) { 6711 btrfs_abort_transaction(trans, ret); 6712 goto error; 6713 } 6714 6715 again: 6716 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6717 key.offset = (u64)-1; 6718 key.type = BTRFS_ROOT_ITEM_KEY; 6719 6720 while (1) { 6721 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6722 6723 if (ret < 0) { 6724 btrfs_abort_transaction(trans, ret); 6725 goto error; 6726 } 6727 if (ret > 0) { 6728 if (path->slots[0] == 0) 6729 break; 6730 path->slots[0]--; 6731 } 6732 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6733 path->slots[0]); 6734 btrfs_release_path(path); 6735 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6736 break; 6737 6738 log = btrfs_read_tree_root(log_root_tree, &found_key); 6739 if (IS_ERR(log)) { 6740 ret = PTR_ERR(log); 6741 btrfs_abort_transaction(trans, ret); 6742 goto error; 6743 } 6744 6745 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6746 true); 6747 if (IS_ERR(wc.replay_dest)) { 6748 ret = PTR_ERR(wc.replay_dest); 6749 6750 /* 6751 * We didn't find the subvol, likely because it was 6752 * deleted. This is ok, simply skip this log and go to 6753 * the next one. 6754 * 6755 * We need to exclude the root because we can't have 6756 * other log replays overwriting this log as we'll read 6757 * it back in a few more times. This will keep our 6758 * block from being modified, and we'll just bail for 6759 * each subsequent pass.
6760 */ 6761 if (ret == -ENOENT) 6762 ret = btrfs_pin_extent_for_log_replay(trans, 6763 log->node->start, 6764 log->node->len); 6765 btrfs_put_root(log); 6766 6767 if (!ret) 6768 goto next; 6769 btrfs_abort_transaction(trans, ret); 6770 goto error; 6771 } 6772 6773 wc.replay_dest->log_root = log; 6774 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 6775 if (ret) 6776 /* The loop needs to continue due to the root refs */ 6777 btrfs_abort_transaction(trans, ret); 6778 else 6779 ret = walk_log_tree(trans, log, &wc); 6780 6781 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6782 ret = fixup_inode_link_counts(trans, wc.replay_dest, 6783 path); 6784 if (ret) 6785 btrfs_abort_transaction(trans, ret); 6786 } 6787 6788 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6789 struct btrfs_root *root = wc.replay_dest; 6790 6791 btrfs_release_path(path); 6792 6793 /* 6794 * We have just replayed everything, and the highest 6795 * objectid of fs roots probably has changed in case 6796 * some inode_item's got replayed. 6797 * 6798 * root->objectid_mutex is not acquired as log replay 6799 * could only happen during mount. 6800 */ 6801 ret = btrfs_init_root_free_objectid(root); 6802 if (ret) 6803 btrfs_abort_transaction(trans, ret); 6804 } 6805 6806 wc.replay_dest->log_root = NULL; 6807 btrfs_put_root(wc.replay_dest); 6808 btrfs_put_root(log); 6809 6810 if (ret) 6811 goto error; 6812 next: 6813 if (found_key.offset == 0) 6814 break; 6815 key.offset = found_key.offset - 1; 6816 } 6817 btrfs_release_path(path); 6818 6819 /* step one is to pin it all, step two is to replay just inodes */ 6820 if (wc.pin) { 6821 wc.pin = 0; 6822 wc.process_func = replay_one_buffer; 6823 wc.stage = LOG_WALK_REPLAY_INODES; 6824 goto again; 6825 } 6826 /* step three is to replay everything */ 6827 if (wc.stage < LOG_WALK_REPLAY_ALL) { 6828 wc.stage++; 6829 goto again; 6830 } 6831 6832 btrfs_free_path(path); 6833 6834 /* step 4: commit the transaction, which also unpins the blocks */ 6835 ret = btrfs_commit_transaction(trans); 6836 if (ret) 6837 return ret; 6838 6839 log_root_tree->log_root = NULL; 6840 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6841 btrfs_put_root(log_root_tree); 6842 6843 return 0; 6844 error: 6845 if (wc.trans) 6846 btrfs_end_transaction(wc.trans); 6847 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6848 btrfs_free_path(path); 6849 return ret; 6850 } 6851 6852 /* 6853 * there are some corner cases where we want to force a full 6854 * commit instead of allowing a directory to be logged. 6855 * 6856 * They revolve around files that were unlinked from the directory, and 6857 * this function updates the parent directory so that a full commit is 6858 * properly done if it is fsync'd later after the unlinks are done. 6859 * 6860 * Must be called before the unlink operations (updates to the subvolume tree, 6861 * inodes, etc) are done. 6862 */ 6863 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 6864 struct btrfs_inode *dir, struct btrfs_inode *inode, 6865 int for_rename) 6866 { 6867 /* 6868 * when we're logging a file, if it hasn't been renamed 6869 * or unlinked, and its inode is fully committed on disk, 6870 * we don't have to worry about walking up the directory chain 6871 * to log its parents. 6872 * 6873 * So, we use the last_unlink_trans field to put this transid 6874 * into the file. When the file is logged we check it and 6875 * don't log the parents if the file is fully on disk.
6876 */ 6877 mutex_lock(&inode->log_mutex); 6878 inode->last_unlink_trans = trans->transid; 6879 mutex_unlock(&inode->log_mutex); 6880 6881 /* 6882 * if this directory was already logged, any new 6883 * names for this file/dir will get recorded 6884 */ 6885 if (dir->logged_trans == trans->transid) 6886 return; 6887 6888 /* 6889 * if the inode we're about to unlink was logged, 6890 * the log will be properly updated for any new names 6891 */ 6892 if (inode->logged_trans == trans->transid) 6893 return; 6894 6895 /* 6896 * when renaming files across directories, if the directory 6897 * we're unlinking from gets fsync'd later on, there's 6898 * no way to find the destination directory later and fsync it 6899 * properly. So, we have to be conservative and force commits 6900 * so the new name gets discovered. 6901 */ 6902 if (for_rename) 6903 goto record; 6904 6905 /* we can safely do the unlink without any special recording */ 6906 return; 6907 6908 record: 6909 mutex_lock(&dir->log_mutex); 6910 dir->last_unlink_trans = trans->transid; 6911 mutex_unlock(&dir->log_mutex); 6912 } 6913 6914 /* 6915 * Make sure that if someone attempts to fsync the parent directory of a deleted 6916 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6917 * that after replaying the log tree of the parent directory's root we will not 6918 * see the snapshot anymore and at log replay time we will not see any log tree 6919 * corresponding to the deleted snapshot's root, which could lead to replaying 6920 * it after replaying the log tree of the parent directory (which would replay 6921 * the snapshot delete operation). 6922 * 6923 * Must be called before the actual snapshot destroy operation (updates to the 6924 * parent root and tree of tree roots trees, etc) are done. 6925 */ 6926 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6927 struct btrfs_inode *dir) 6928 { 6929 mutex_lock(&dir->log_mutex); 6930 dir->last_unlink_trans = trans->transid; 6931 mutex_unlock(&dir->log_mutex); 6932 } 6933 6934 /** 6935 * Update the log after adding a new name for an inode. 6936 * 6937 * @trans: Transaction handle. 6938 * @old_dentry: The dentry associated with the old name and the old 6939 * parent directory. 6940 * @old_dir: The inode of the previous parent directory for the case 6941 * of a rename. For a link operation, it must be NULL. 6942 * @old_dir_index: The index number associated with the old name, meaningful 6943 * only for rename operations (when @old_dir is not NULL). 6944 * Ignored for link operations. 6945 * @parent: The dentry associated with the directory under which the 6946 * new name is located. 6947 * 6948 * Call this after adding a new name for an inode, as a result of a link or 6949 * rename operation, and it will properly update the log to reflect the new name.
6950 */ 6951 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 6952 struct dentry *old_dentry, struct btrfs_inode *old_dir, 6953 u64 old_dir_index, struct dentry *parent) 6954 { 6955 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 6956 struct btrfs_root *root = inode->root; 6957 struct btrfs_log_ctx ctx; 6958 bool log_pinned = false; 6959 int ret; 6960 6961 /* 6962 * this will force the logging code to walk the dentry chain 6963 * up for the file 6964 */ 6965 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6966 inode->last_unlink_trans = trans->transid; 6967 6968 /* 6969 * if this inode hasn't been logged and directory we're renaming it 6970 * from hasn't been logged, we don't need to log it 6971 */ 6972 ret = inode_logged(trans, inode, NULL); 6973 if (ret < 0) { 6974 goto out; 6975 } else if (ret == 0) { 6976 if (!old_dir) 6977 return; 6978 /* 6979 * If the inode was not logged and we are doing a rename (old_dir is not 6980 * NULL), check if old_dir was logged - if it was not we can return and 6981 * do nothing. 6982 */ 6983 ret = inode_logged(trans, old_dir, NULL); 6984 if (ret < 0) 6985 goto out; 6986 else if (ret == 0) 6987 return; 6988 } 6989 ret = 0; 6990 6991 /* 6992 * If we are doing a rename (old_dir is not NULL) from a directory that 6993 * was previously logged, make sure that on log replay we get the old 6994 * dir entry deleted. This is needed because we will also log the new 6995 * name of the renamed inode, so we need to make sure that after log 6996 * replay we don't end up with both the new and old dir entries existing. 6997 */ 6998 if (old_dir && old_dir->logged_trans == trans->transid) { 6999 struct btrfs_root *log = old_dir->root->log_root; 7000 struct btrfs_path *path; 7001 7002 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7003 7004 /* 7005 * We have two inodes to update in the log, the old directory and 7006 * the inode that got renamed, so we must pin the log to prevent 7007 * anyone from syncing the log until we have updated both inodes 7008 * in the log. 7009 */ 7010 log_pinned = true; 7011 btrfs_pin_log_trans(root); 7012 7013 path = btrfs_alloc_path(); 7014 if (!path) { 7015 ret = -ENOMEM; 7016 goto out; 7017 } 7018 7019 /* 7020 * Other concurrent task might be logging the old directory, 7021 * as it can be triggered when logging other inode that had or 7022 * still has a dentry in the old directory. So take the old 7023 * directory's log_mutex to prevent getting an -EEXIST when 7024 * logging a key to record the deletion, or having that other 7025 * task logging the old directory get an -EEXIST if it attempts 7026 * to log the same key after we just did it. In both cases that 7027 * would result in falling back to a transaction commit. 7028 */ 7029 mutex_lock(&old_dir->log_mutex); 7030 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7031 old_dentry->d_name.name, 7032 old_dentry->d_name.len, old_dir_index); 7033 if (ret > 0) { 7034 /* 7035 * The dentry does not exist in the log, so record its 7036 * deletion. 7037 */ 7038 btrfs_release_path(path); 7039 ret = insert_dir_log_key(trans, log, path, 7040 btrfs_ino(old_dir), 7041 old_dir_index, old_dir_index); 7042 } 7043 mutex_unlock(&old_dir->log_mutex); 7044 7045 btrfs_free_path(path); 7046 if (ret < 0) 7047 goto out; 7048 } 7049 7050 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7051 ctx.logging_new_name = true; 7052 /* 7053 * We don't care about the return value. 
If we fail to log the new name 7054 * then we know the next attempt to sync the log will fallback to a full 7055 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7056 * we don't need to worry about getting a log committed that has an 7057 * inconsistent state after a rename operation. 7058 */ 7059 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7060 out: 7061 /* 7062 * If an error happened mark the log for a full commit because it's not 7063 * consistent and up to date or we couldn't find out if one of the 7064 * inodes was logged before in this transaction. Do it before unpinning 7065 * the log, to avoid any races with someone else trying to commit it. 7066 */ 7067 if (ret < 0) 7068 btrfs_set_log_full_commit(trans); 7069 if (log_pinned) 7070 btrfs_end_log_trans(root); 7071 } 7072 7073
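/* Typical usage (a sketch based on the kernel-doc above, not an exhaustive list of call sites): after a link or rename has inserted the new name into the subvolume tree, the caller invokes btrfs_log_new_name(trans, old_dentry, old_dir, old_dir_index, parent), with old_dir set to NULL for link operations, so that the log records the removal of the old dentry (for renames) and picks up the new name. */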