// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "inode-item.h"

/* magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 * 2a is actually the more important variant.  Without the extra logging
 * a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory foo was fully removed from the FS, but fsync was never
 * called on foo, only on its parent dir.  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first stage (0) is to only pin down
 * the blocks we find, the second stage (1) is to make sure that all the
 * inodes we find in the log are created in the subvolume.
 *
 * The last stage is to deal with directories and links and extents
 * and all the other fun semantics
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree and once
 * to do all the other items.
 */

/*
 * start a sub transaction and setup the log tree
 * this increments the log tree writer count to make people
 * syncing the tree wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;
	const bool zoned = btrfs_is_zoned(fs_info);
	int ret = 0;
	bool created = false;

	/*
	 * First check if the log root tree was already created. If not, create
	 * it before locking the root's log_mutex, just to keep lockdep happy.
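	 *
	 * (The reason isn't spelled out here; presumably it is that
	 * tree_root->log_mutex and root->log_mutex belong to the same lock
	 * class, so nesting one inside the other would trigger a lockdep
	 * report even though they protect different roots.)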
152 */ 153 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) { 154 mutex_lock(&tree_root->log_mutex); 155 if (!fs_info->log_root_tree) { 156 ret = btrfs_init_log_root_tree(trans, fs_info); 157 if (!ret) { 158 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); 159 created = true; 160 } 161 } 162 mutex_unlock(&tree_root->log_mutex); 163 if (ret) 164 return ret; 165 } 166 167 mutex_lock(&root->log_mutex); 168 169 again: 170 if (root->log_root) { 171 int index = (root->log_transid + 1) % 2; 172 173 if (btrfs_need_log_full_commit(trans)) { 174 ret = BTRFS_LOG_FORCE_COMMIT; 175 goto out; 176 } 177 178 if (zoned && atomic_read(&root->log_commit[index])) { 179 wait_log_commit(root, root->log_transid - 1); 180 goto again; 181 } 182 183 if (!root->log_start_pid) { 184 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 185 root->log_start_pid = current->pid; 186 } else if (root->log_start_pid != current->pid) { 187 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 188 } 189 } else { 190 /* 191 * This means fs_info->log_root_tree was already created 192 * for some other FS trees. Do the full commit not to mix 193 * nodes from multiple log transactions to do sequential 194 * writing. 195 */ 196 if (zoned && !created) { 197 ret = BTRFS_LOG_FORCE_COMMIT; 198 goto out; 199 } 200 201 ret = btrfs_add_log_tree(trans, root); 202 if (ret) 203 goto out; 204 205 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 206 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); 207 root->log_start_pid = current->pid; 208 } 209 210 atomic_inc(&root->log_writers); 211 if (!ctx->logging_new_name) { 212 int index = root->log_transid % 2; 213 list_add_tail(&ctx->list, &root->log_ctxs[index]); 214 ctx->log_transid = root->log_transid; 215 } 216 217 out: 218 mutex_unlock(&root->log_mutex); 219 return ret; 220 } 221 222 /* 223 * returns 0 if there was a log transaction running and we were able 224 * to join, or returns -ENOENT if there were not transactions 225 * in progress 226 */ 227 static int join_running_log_trans(struct btrfs_root *root) 228 { 229 const bool zoned = btrfs_is_zoned(root->fs_info); 230 int ret = -ENOENT; 231 232 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) 233 return ret; 234 235 mutex_lock(&root->log_mutex); 236 again: 237 if (root->log_root) { 238 int index = (root->log_transid + 1) % 2; 239 240 ret = 0; 241 if (zoned && atomic_read(&root->log_commit[index])) { 242 wait_log_commit(root, root->log_transid - 1); 243 goto again; 244 } 245 atomic_inc(&root->log_writers); 246 } 247 mutex_unlock(&root->log_mutex); 248 return ret; 249 } 250 251 /* 252 * This either makes the current running log transaction wait 253 * until you call btrfs_end_log_trans() or it makes any future 254 * log transactions wait until you call btrfs_end_log_trans() 255 */ 256 void btrfs_pin_log_trans(struct btrfs_root *root) 257 { 258 atomic_inc(&root->log_writers); 259 } 260 261 /* 262 * indicate we're done making changes to the log tree 263 * and wake up anyone waiting to do a sync 264 */ 265 void btrfs_end_log_trans(struct btrfs_root *root) 266 { 267 if (atomic_dec_and_test(&root->log_writers)) { 268 /* atomic_dec_and_test implies a barrier */ 269 cond_wake_up_nomb(&root->log_writer_wait); 270 } 271 } 272 273 static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) 274 { 275 filemap_fdatawait_range(buf->pages[0]->mapping, 276 buf->start, buf->start + buf->len - 1); 277 } 278 279 /* 280 * the walk control struct is used to pass state down the chain when 281 * processing the log tree. 
The stage field tells us which part 282 * of the log tree processing we are currently doing. The others 283 * are state fields used for that specific part 284 */ 285 struct walk_control { 286 /* should we free the extent on disk when done? This is used 287 * at transaction commit time while freeing a log tree 288 */ 289 int free; 290 291 /* pin only walk, we record which extents on disk belong to the 292 * log trees 293 */ 294 int pin; 295 296 /* what stage of the replay code we're currently in */ 297 int stage; 298 299 /* 300 * Ignore any items from the inode currently being processed. Needs 301 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in 302 * the LOG_WALK_REPLAY_INODES stage. 303 */ 304 bool ignore_cur_inode; 305 306 /* the root we are currently replaying */ 307 struct btrfs_root *replay_dest; 308 309 /* the trans handle for the current replay */ 310 struct btrfs_trans_handle *trans; 311 312 /* the function that gets used to process blocks we find in the 313 * tree. Note the extent_buffer might not be up to date when it is 314 * passed in, and it must be checked or read if you need the data 315 * inside it 316 */ 317 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, 318 struct walk_control *wc, u64 gen, int level); 319 }; 320 321 /* 322 * process_func used to pin down extents, write them or wait on them 323 */ 324 static int process_one_buffer(struct btrfs_root *log, 325 struct extent_buffer *eb, 326 struct walk_control *wc, u64 gen, int level) 327 { 328 struct btrfs_fs_info *fs_info = log->fs_info; 329 int ret = 0; 330 331 /* 332 * If this fs is mixed then we need to be able to process the leaves to 333 * pin down any logged extents, so we have to read the block. 334 */ 335 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 336 ret = btrfs_read_extent_buffer(eb, gen, level, NULL); 337 if (ret) 338 return ret; 339 } 340 341 if (wc->pin) { 342 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, 343 eb->len); 344 if (ret) 345 return ret; 346 347 if (btrfs_buffer_uptodate(eb, gen, 0) && 348 btrfs_header_level(eb) == 0) 349 ret = btrfs_exclude_logged_extents(eb); 350 } 351 return ret; 352 } 353 354 static int do_overwrite_item(struct btrfs_trans_handle *trans, 355 struct btrfs_root *root, 356 struct btrfs_path *path, 357 struct extent_buffer *eb, int slot, 358 struct btrfs_key *key) 359 { 360 int ret; 361 u32 item_size; 362 u64 saved_i_size = 0; 363 int save_old_i_size = 0; 364 unsigned long src_ptr; 365 unsigned long dst_ptr; 366 int overwrite_root = 0; 367 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; 368 369 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 370 overwrite_root = 1; 371 372 item_size = btrfs_item_size(eb, slot); 373 src_ptr = btrfs_item_ptr_offset(eb, slot); 374 375 /* Our caller must have done a search for the key for us. 
*/ 376 ASSERT(path->nodes[0] != NULL); 377 378 /* 379 * And the slot must point to the exact key or the slot where the key 380 * should be at (the first item with a key greater than 'key') 381 */ 382 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 383 struct btrfs_key found_key; 384 385 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 386 ret = btrfs_comp_cpu_keys(&found_key, key); 387 ASSERT(ret >= 0); 388 } else { 389 ret = 1; 390 } 391 392 if (ret == 0) { 393 char *src_copy; 394 char *dst_copy; 395 u32 dst_size = btrfs_item_size(path->nodes[0], 396 path->slots[0]); 397 if (dst_size != item_size) 398 goto insert; 399 400 if (item_size == 0) { 401 btrfs_release_path(path); 402 return 0; 403 } 404 dst_copy = kmalloc(item_size, GFP_NOFS); 405 src_copy = kmalloc(item_size, GFP_NOFS); 406 if (!dst_copy || !src_copy) { 407 btrfs_release_path(path); 408 kfree(dst_copy); 409 kfree(src_copy); 410 return -ENOMEM; 411 } 412 413 read_extent_buffer(eb, src_copy, src_ptr, item_size); 414 415 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 416 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, 417 item_size); 418 ret = memcmp(dst_copy, src_copy, item_size); 419 420 kfree(dst_copy); 421 kfree(src_copy); 422 /* 423 * they have the same contents, just return, this saves 424 * us from cowing blocks in the destination tree and doing 425 * extra writes that may not have been done by a previous 426 * sync 427 */ 428 if (ret == 0) { 429 btrfs_release_path(path); 430 return 0; 431 } 432 433 /* 434 * We need to load the old nbytes into the inode so when we 435 * replay the extents we've logged we get the right nbytes. 436 */ 437 if (inode_item) { 438 struct btrfs_inode_item *item; 439 u64 nbytes; 440 u32 mode; 441 442 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 443 struct btrfs_inode_item); 444 nbytes = btrfs_inode_nbytes(path->nodes[0], item); 445 item = btrfs_item_ptr(eb, slot, 446 struct btrfs_inode_item); 447 btrfs_set_inode_nbytes(eb, item, nbytes); 448 449 /* 450 * If this is a directory we need to reset the i_size to 451 * 0 so that we can set it up properly when replaying 452 * the rest of the items in this log. 453 */ 454 mode = btrfs_inode_mode(eb, item); 455 if (S_ISDIR(mode)) 456 btrfs_set_inode_size(eb, item, 0); 457 } 458 } else if (inode_item) { 459 struct btrfs_inode_item *item; 460 u32 mode; 461 462 /* 463 * New inode, set nbytes to 0 so that the nbytes comes out 464 * properly when we replay the extents. 465 */ 466 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); 467 btrfs_set_inode_nbytes(eb, item, 0); 468 469 /* 470 * If this is a directory we need to reset the i_size to 0 so 471 * that we can set it up properly when replaying the rest of 472 * the items in this log. 
473 */ 474 mode = btrfs_inode_mode(eb, item); 475 if (S_ISDIR(mode)) 476 btrfs_set_inode_size(eb, item, 0); 477 } 478 insert: 479 btrfs_release_path(path); 480 /* try to insert the key into the destination tree */ 481 path->skip_release_on_error = 1; 482 ret = btrfs_insert_empty_item(trans, root, path, 483 key, item_size); 484 path->skip_release_on_error = 0; 485 486 /* make sure any existing item is the correct size */ 487 if (ret == -EEXIST || ret == -EOVERFLOW) { 488 u32 found_size; 489 found_size = btrfs_item_size(path->nodes[0], 490 path->slots[0]); 491 if (found_size > item_size) 492 btrfs_truncate_item(path, item_size, 1); 493 else if (found_size < item_size) 494 btrfs_extend_item(path, item_size - found_size); 495 } else if (ret) { 496 return ret; 497 } 498 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 499 path->slots[0]); 500 501 /* don't overwrite an existing inode if the generation number 502 * was logged as zero. This is done when the tree logging code 503 * is just logging an inode to make sure it exists after recovery. 504 * 505 * Also, don't overwrite i_size on directories during replay. 506 * log replay inserts and removes directory items based on the 507 * state of the tree found in the subvolume, and i_size is modified 508 * as it goes 509 */ 510 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { 511 struct btrfs_inode_item *src_item; 512 struct btrfs_inode_item *dst_item; 513 514 src_item = (struct btrfs_inode_item *)src_ptr; 515 dst_item = (struct btrfs_inode_item *)dst_ptr; 516 517 if (btrfs_inode_generation(eb, src_item) == 0) { 518 struct extent_buffer *dst_eb = path->nodes[0]; 519 const u64 ino_size = btrfs_inode_size(eb, src_item); 520 521 /* 522 * For regular files an ino_size == 0 is used only when 523 * logging that an inode exists, as part of a directory 524 * fsync, and the inode wasn't fsynced before. In this 525 * case don't set the size of the inode in the fs/subvol 526 * tree, otherwise we would be throwing valid data away. 527 */ 528 if (S_ISREG(btrfs_inode_mode(eb, src_item)) && 529 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && 530 ino_size != 0) 531 btrfs_set_inode_size(dst_eb, dst_item, ino_size); 532 goto no_copy; 533 } 534 535 if (overwrite_root && 536 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 537 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { 538 save_old_i_size = 1; 539 saved_i_size = btrfs_inode_size(path->nodes[0], 540 dst_item); 541 } 542 } 543 544 copy_extent_buffer(path->nodes[0], eb, dst_ptr, 545 src_ptr, item_size); 546 547 if (save_old_i_size) { 548 struct btrfs_inode_item *dst_item; 549 dst_item = (struct btrfs_inode_item *)dst_ptr; 550 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); 551 } 552 553 /* make sure the generation is filled in */ 554 if (key->type == BTRFS_INODE_ITEM_KEY) { 555 struct btrfs_inode_item *dst_item; 556 dst_item = (struct btrfs_inode_item *)dst_ptr; 557 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { 558 btrfs_set_inode_generation(path->nodes[0], dst_item, 559 trans->transid); 560 } 561 } 562 no_copy: 563 btrfs_mark_buffer_dirty(path->nodes[0]); 564 btrfs_release_path(path); 565 return 0; 566 } 567 568 /* 569 * Item overwrite used by replay and tree logging. eb, slot and key all refer 570 * to the src data we are copying out. 571 * 572 * root is the tree we are copying into, and path is a scratch 573 * path for use in this function (it should be released on entry and 574 * will be released on exit). 
575 * 576 * If the key is already in the destination tree the existing item is 577 * overwritten. If the existing item isn't big enough, it is extended. 578 * If it is too large, it is truncated. 579 * 580 * If the key isn't in the destination yet, a new item is inserted. 581 */ 582 static int overwrite_item(struct btrfs_trans_handle *trans, 583 struct btrfs_root *root, 584 struct btrfs_path *path, 585 struct extent_buffer *eb, int slot, 586 struct btrfs_key *key) 587 { 588 int ret; 589 590 /* Look for the key in the destination tree. */ 591 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 592 if (ret < 0) 593 return ret; 594 595 return do_overwrite_item(trans, root, path, eb, slot, key); 596 } 597 598 /* 599 * simple helper to read an inode off the disk from a given root 600 * This can only be called for subvolume roots and not for the log 601 */ 602 static noinline struct inode *read_one_inode(struct btrfs_root *root, 603 u64 objectid) 604 { 605 struct inode *inode; 606 607 inode = btrfs_iget(root->fs_info->sb, objectid, root); 608 if (IS_ERR(inode)) 609 inode = NULL; 610 return inode; 611 } 612 613 /* replays a single extent in 'eb' at 'slot' with 'key' into the 614 * subvolume 'root'. path is released on entry and should be released 615 * on exit. 616 * 617 * extents in the log tree have not been allocated out of the extent 618 * tree yet. So, this completes the allocation, taking a reference 619 * as required if the extent already exists or creating a new extent 620 * if it isn't in the extent allocation tree yet. 621 * 622 * The extent is inserted into the file, dropping any existing extents 623 * from the file that overlap the new one. 624 */ 625 static noinline int replay_one_extent(struct btrfs_trans_handle *trans, 626 struct btrfs_root *root, 627 struct btrfs_path *path, 628 struct extent_buffer *eb, int slot, 629 struct btrfs_key *key) 630 { 631 struct btrfs_drop_extents_args drop_args = { 0 }; 632 struct btrfs_fs_info *fs_info = root->fs_info; 633 int found_type; 634 u64 extent_end; 635 u64 start = key->offset; 636 u64 nbytes = 0; 637 struct btrfs_file_extent_item *item; 638 struct inode *inode = NULL; 639 unsigned long size; 640 int ret = 0; 641 642 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 643 found_type = btrfs_file_extent_type(eb, item); 644 645 if (found_type == BTRFS_FILE_EXTENT_REG || 646 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 647 nbytes = btrfs_file_extent_num_bytes(eb, item); 648 extent_end = start + nbytes; 649 650 /* 651 * We don't add to the inodes nbytes if we are prealloc or a 652 * hole. 653 */ 654 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 655 nbytes = 0; 656 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 657 size = btrfs_file_extent_ram_bytes(eb, item); 658 nbytes = btrfs_file_extent_ram_bytes(eb, item); 659 extent_end = ALIGN(start + size, 660 fs_info->sectorsize); 661 } else { 662 ret = 0; 663 goto out; 664 } 665 666 inode = read_one_inode(root, key->objectid); 667 if (!inode) { 668 ret = -EIO; 669 goto out; 670 } 671 672 /* 673 * first check to see if we already have this extent in the 674 * file. This must be done before the btrfs_drop_extents run 675 * so we don't try to drop this extent. 
676 */ 677 ret = btrfs_lookup_file_extent(trans, root, path, 678 btrfs_ino(BTRFS_I(inode)), start, 0); 679 680 if (ret == 0 && 681 (found_type == BTRFS_FILE_EXTENT_REG || 682 found_type == BTRFS_FILE_EXTENT_PREALLOC)) { 683 struct btrfs_file_extent_item cmp1; 684 struct btrfs_file_extent_item cmp2; 685 struct btrfs_file_extent_item *existing; 686 struct extent_buffer *leaf; 687 688 leaf = path->nodes[0]; 689 existing = btrfs_item_ptr(leaf, path->slots[0], 690 struct btrfs_file_extent_item); 691 692 read_extent_buffer(eb, &cmp1, (unsigned long)item, 693 sizeof(cmp1)); 694 read_extent_buffer(leaf, &cmp2, (unsigned long)existing, 695 sizeof(cmp2)); 696 697 /* 698 * we already have a pointer to this exact extent, 699 * we don't have to do anything 700 */ 701 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 702 btrfs_release_path(path); 703 goto out; 704 } 705 } 706 btrfs_release_path(path); 707 708 /* drop any overlapping extents */ 709 drop_args.start = start; 710 drop_args.end = extent_end; 711 drop_args.drop_cache = true; 712 ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); 713 if (ret) 714 goto out; 715 716 if (found_type == BTRFS_FILE_EXTENT_REG || 717 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 718 u64 offset; 719 unsigned long dest_offset; 720 struct btrfs_key ins; 721 722 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 && 723 btrfs_fs_incompat(fs_info, NO_HOLES)) 724 goto update_inode; 725 726 ret = btrfs_insert_empty_item(trans, root, path, key, 727 sizeof(*item)); 728 if (ret) 729 goto out; 730 dest_offset = btrfs_item_ptr_offset(path->nodes[0], 731 path->slots[0]); 732 copy_extent_buffer(path->nodes[0], eb, dest_offset, 733 (unsigned long)item, sizeof(*item)); 734 735 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); 736 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); 737 ins.type = BTRFS_EXTENT_ITEM_KEY; 738 offset = key->offset - btrfs_file_extent_offset(eb, item); 739 740 /* 741 * Manually record dirty extent, as here we did a shallow 742 * file extent item copy and skip normal backref update, 743 * but modifying extent tree all by ourselves. 744 * So need to manually record dirty extent for qgroup, 745 * as the owner of the file extent changed from log tree 746 * (doesn't affect qgroup) to fs/file tree(affects qgroup) 747 */ 748 ret = btrfs_qgroup_trace_extent(trans, 749 btrfs_file_extent_disk_bytenr(eb, item), 750 btrfs_file_extent_disk_num_bytes(eb, item), 751 GFP_NOFS); 752 if (ret < 0) 753 goto out; 754 755 if (ins.objectid > 0) { 756 struct btrfs_ref ref = { 0 }; 757 u64 csum_start; 758 u64 csum_end; 759 LIST_HEAD(ordered_sums); 760 761 /* 762 * is this extent already allocated in the extent 763 * allocation tree? 
If so, just add a reference 764 */ 765 ret = btrfs_lookup_data_extent(fs_info, ins.objectid, 766 ins.offset); 767 if (ret < 0) { 768 goto out; 769 } else if (ret == 0) { 770 btrfs_init_generic_ref(&ref, 771 BTRFS_ADD_DELAYED_REF, 772 ins.objectid, ins.offset, 0); 773 btrfs_init_data_ref(&ref, 774 root->root_key.objectid, 775 key->objectid, offset, 0, false); 776 ret = btrfs_inc_extent_ref(trans, &ref); 777 if (ret) 778 goto out; 779 } else { 780 /* 781 * insert the extent pointer in the extent 782 * allocation tree 783 */ 784 ret = btrfs_alloc_logged_file_extent(trans, 785 root->root_key.objectid, 786 key->objectid, offset, &ins); 787 if (ret) 788 goto out; 789 } 790 btrfs_release_path(path); 791 792 if (btrfs_file_extent_compression(eb, item)) { 793 csum_start = ins.objectid; 794 csum_end = csum_start + ins.offset; 795 } else { 796 csum_start = ins.objectid + 797 btrfs_file_extent_offset(eb, item); 798 csum_end = csum_start + 799 btrfs_file_extent_num_bytes(eb, item); 800 } 801 802 ret = btrfs_lookup_csums_range(root->log_root, 803 csum_start, csum_end - 1, 804 &ordered_sums, 0); 805 if (ret) 806 goto out; 807 /* 808 * Now delete all existing cums in the csum root that 809 * cover our range. We do this because we can have an 810 * extent that is completely referenced by one file 811 * extent item and partially referenced by another 812 * file extent item (like after using the clone or 813 * extent_same ioctls). In this case if we end up doing 814 * the replay of the one that partially references the 815 * extent first, and we do not do the csum deletion 816 * below, we can get 2 csum items in the csum tree that 817 * overlap each other. For example, imagine our log has 818 * the two following file extent items: 819 * 820 * key (257 EXTENT_DATA 409600) 821 * extent data disk byte 12845056 nr 102400 822 * extent data offset 20480 nr 20480 ram 102400 823 * 824 * key (257 EXTENT_DATA 819200) 825 * extent data disk byte 12845056 nr 102400 826 * extent data offset 0 nr 102400 ram 102400 827 * 828 * Where the second one fully references the 100K extent 829 * that starts at disk byte 12845056, and the log tree 830 * has a single csum item that covers the entire range 831 * of the extent: 832 * 833 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 834 * 835 * After the first file extent item is replayed, the 836 * csum tree gets the following csum item: 837 * 838 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 839 * 840 * Which covers the 20K sub-range starting at offset 20K 841 * of our extent. Now when we replay the second file 842 * extent item, if we do not delete existing csum items 843 * that cover any of its blocks, we end up getting two 844 * csum items in our csum tree that overlap each other: 845 * 846 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100 847 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20 848 * 849 * Which is a problem, because after this anyone trying 850 * to lookup up for the checksum of any block of our 851 * extent starting at an offset of 40K or higher, will 852 * end up looking at the second csum item only, which 853 * does not contain the checksum for any block starting 854 * at offset 40K or higher of our extent. 
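			 *
			 * For illustration only, and as an assumption about
			 * userspace behaviour rather than anything this
			 * function does: roughly the layout above can be
			 * produced by writing out one extent and then
			 * reflinking part of the file onto another offset of
			 * the same file, e.g. (error checking omitted, the
			 * path is just an example):
			 *
			 *	#include <fcntl.h>
			 *	#include <string.h>
			 *	#include <unistd.h>
			 *	#include <sys/ioctl.h>
			 *	#include <linux/fs.h>
			 *
			 *	int main(void)
			 *	{
			 *		struct file_clone_range fcr = { 0 };
			 *		char buf[4096];
			 *		int fd = open("/mnt/foo", O_RDWR | O_CREAT, 0644);
			 *		int i;
			 *
			 *		memset(buf, 0xab, sizeof(buf));
			 *		for (i = 0; i < 25; i++)
			 *			write(fd, buf, sizeof(buf));
			 *		fsync(fd);
			 *
			 *		fcr.src_fd = fd;
			 *		fcr.src_offset = 20 * 1024;
			 *		fcr.src_length = 20 * 1024;
			 *		fcr.dest_offset = 400 * 1024;
			 *		ioctl(fd, FICLONERANGE, &fcr);
			 *		fsync(fd);
			 *		return 0;
			 *	}
			 *
			 * After this the file has one extent item that fully
			 * references the 100K extent and a second one that
			 * only references a 20K sub-range of it.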
855 */ 856 while (!list_empty(&ordered_sums)) { 857 struct btrfs_ordered_sum *sums; 858 struct btrfs_root *csum_root; 859 860 sums = list_entry(ordered_sums.next, 861 struct btrfs_ordered_sum, 862 list); 863 csum_root = btrfs_csum_root(fs_info, 864 sums->bytenr); 865 if (!ret) 866 ret = btrfs_del_csums(trans, csum_root, 867 sums->bytenr, 868 sums->len); 869 if (!ret) 870 ret = btrfs_csum_file_blocks(trans, 871 csum_root, 872 sums); 873 list_del(&sums->list); 874 kfree(sums); 875 } 876 if (ret) 877 goto out; 878 } else { 879 btrfs_release_path(path); 880 } 881 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 882 /* inline extents are easy, we just overwrite them */ 883 ret = overwrite_item(trans, root, path, eb, slot, key); 884 if (ret) 885 goto out; 886 } 887 888 ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, 889 extent_end - start); 890 if (ret) 891 goto out; 892 893 update_inode: 894 btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); 895 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 896 out: 897 iput(inode); 898 return ret; 899 } 900 901 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, 902 struct btrfs_inode *dir, 903 struct btrfs_inode *inode, 904 const char *name, 905 int name_len) 906 { 907 int ret; 908 909 ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); 910 if (ret) 911 return ret; 912 /* 913 * Whenever we need to check if a name exists or not, we check the 914 * fs/subvolume tree. So after an unlink we must run delayed items, so 915 * that future checks for a name during log replay see that the name 916 * does not exists anymore. 917 */ 918 return btrfs_run_delayed_items(trans); 919 } 920 921 /* 922 * when cleaning up conflicts between the directory names in the 923 * subvolume, directory names in the log and directory names in the 924 * inode back references, we may have to unlink inodes from directories. 925 * 926 * This is a helper function to do the unlink of a specific directory 927 * item 928 */ 929 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, 930 struct btrfs_path *path, 931 struct btrfs_inode *dir, 932 struct btrfs_dir_item *di) 933 { 934 struct btrfs_root *root = dir->root; 935 struct inode *inode; 936 char *name; 937 int name_len; 938 struct extent_buffer *leaf; 939 struct btrfs_key location; 940 int ret; 941 942 leaf = path->nodes[0]; 943 944 btrfs_dir_item_key_to_cpu(leaf, di, &location); 945 name_len = btrfs_dir_name_len(leaf, di); 946 name = kmalloc(name_len, GFP_NOFS); 947 if (!name) 948 return -ENOMEM; 949 950 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 951 btrfs_release_path(path); 952 953 inode = read_one_inode(root, location.objectid); 954 if (!inode) { 955 ret = -EIO; 956 goto out; 957 } 958 959 ret = link_to_fixup_dir(trans, root, path, location.objectid); 960 if (ret) 961 goto out; 962 963 ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, 964 name_len); 965 out: 966 kfree(name); 967 iput(inode); 968 return ret; 969 } 970 971 /* 972 * See if a given name and sequence number found in an inode back reference are 973 * already in a directory and correctly point to this inode. 974 * 975 * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it 976 * exists. 
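 *
 * Both the index keyed entry (dirid, BTRFS_DIR_INDEX_KEY, index) and the name
 * keyed entry (dirid, BTRFS_DIR_ITEM_KEY, hash of name) are checked, and both
 * must point to 'objectid' for the entry to be considered present.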
977 */ 978 static noinline int inode_in_dir(struct btrfs_root *root, 979 struct btrfs_path *path, 980 u64 dirid, u64 objectid, u64 index, 981 const char *name, int name_len) 982 { 983 struct btrfs_dir_item *di; 984 struct btrfs_key location; 985 int ret = 0; 986 987 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, 988 index, name, name_len, 0); 989 if (IS_ERR(di)) { 990 ret = PTR_ERR(di); 991 goto out; 992 } else if (di) { 993 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 994 if (location.objectid != objectid) 995 goto out; 996 } else { 997 goto out; 998 } 999 1000 btrfs_release_path(path); 1001 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 1002 if (IS_ERR(di)) { 1003 ret = PTR_ERR(di); 1004 goto out; 1005 } else if (di) { 1006 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); 1007 if (location.objectid == objectid) 1008 ret = 1; 1009 } 1010 out: 1011 btrfs_release_path(path); 1012 return ret; 1013 } 1014 1015 /* 1016 * helper function to check a log tree for a named back reference in 1017 * an inode. This is used to decide if a back reference that is 1018 * found in the subvolume conflicts with what we find in the log. 1019 * 1020 * inode backreferences may have multiple refs in a single item, 1021 * during replay we process one reference at a time, and we don't 1022 * want to delete valid links to a file from the subvolume if that 1023 * link is also in the log. 1024 */ 1025 static noinline int backref_in_log(struct btrfs_root *log, 1026 struct btrfs_key *key, 1027 u64 ref_objectid, 1028 const char *name, int namelen) 1029 { 1030 struct btrfs_path *path; 1031 int ret; 1032 1033 path = btrfs_alloc_path(); 1034 if (!path) 1035 return -ENOMEM; 1036 1037 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 1038 if (ret < 0) { 1039 goto out; 1040 } else if (ret == 1) { 1041 ret = 0; 1042 goto out; 1043 } 1044 1045 if (key->type == BTRFS_INODE_EXTREF_KEY) 1046 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1047 path->slots[0], 1048 ref_objectid, 1049 name, namelen); 1050 else 1051 ret = !!btrfs_find_name_in_backref(path->nodes[0], 1052 path->slots[0], 1053 name, namelen); 1054 out: 1055 btrfs_free_path(path); 1056 return ret; 1057 } 1058 1059 static inline int __add_inode_ref(struct btrfs_trans_handle *trans, 1060 struct btrfs_root *root, 1061 struct btrfs_path *path, 1062 struct btrfs_root *log_root, 1063 struct btrfs_inode *dir, 1064 struct btrfs_inode *inode, 1065 u64 inode_objectid, u64 parent_objectid, 1066 u64 ref_index, char *name, int namelen, 1067 int *search_done) 1068 { 1069 int ret; 1070 char *victim_name; 1071 int victim_name_len; 1072 struct extent_buffer *leaf; 1073 struct btrfs_dir_item *di; 1074 struct btrfs_key search_key; 1075 struct btrfs_inode_extref *extref; 1076 1077 again: 1078 /* Search old style refs */ 1079 search_key.objectid = inode_objectid; 1080 search_key.type = BTRFS_INODE_REF_KEY; 1081 search_key.offset = parent_objectid; 1082 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1083 if (ret == 0) { 1084 struct btrfs_inode_ref *victim_ref; 1085 unsigned long ptr; 1086 unsigned long ptr_end; 1087 1088 leaf = path->nodes[0]; 1089 1090 /* are we trying to overwrite a back ref for the root directory 1091 * if so, just jump out, we're done 1092 */ 1093 if (search_key.objectid == search_key.offset) 1094 return 1; 1095 1096 /* check all the names in this back reference to see 1097 * if they are in the log. 
if so, we allow them to stay 1098 * otherwise they must be unlinked as a conflict 1099 */ 1100 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1101 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); 1102 while (ptr < ptr_end) { 1103 victim_ref = (struct btrfs_inode_ref *)ptr; 1104 victim_name_len = btrfs_inode_ref_name_len(leaf, 1105 victim_ref); 1106 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1107 if (!victim_name) 1108 return -ENOMEM; 1109 1110 read_extent_buffer(leaf, victim_name, 1111 (unsigned long)(victim_ref + 1), 1112 victim_name_len); 1113 1114 ret = backref_in_log(log_root, &search_key, 1115 parent_objectid, victim_name, 1116 victim_name_len); 1117 if (ret < 0) { 1118 kfree(victim_name); 1119 return ret; 1120 } else if (!ret) { 1121 inc_nlink(&inode->vfs_inode); 1122 btrfs_release_path(path); 1123 1124 ret = unlink_inode_for_log_replay(trans, dir, inode, 1125 victim_name, victim_name_len); 1126 kfree(victim_name); 1127 if (ret) 1128 return ret; 1129 *search_done = 1; 1130 goto again; 1131 } 1132 kfree(victim_name); 1133 1134 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 1135 } 1136 1137 /* 1138 * NOTE: we have searched root tree and checked the 1139 * corresponding ref, it does not need to check again. 1140 */ 1141 *search_done = 1; 1142 } 1143 btrfs_release_path(path); 1144 1145 /* Same search but for extended refs */ 1146 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, 1147 inode_objectid, parent_objectid, 0, 1148 0); 1149 if (!IS_ERR_OR_NULL(extref)) { 1150 u32 item_size; 1151 u32 cur_offset = 0; 1152 unsigned long base; 1153 struct inode *victim_parent; 1154 1155 leaf = path->nodes[0]; 1156 1157 item_size = btrfs_item_size(leaf, path->slots[0]); 1158 base = btrfs_item_ptr_offset(leaf, path->slots[0]); 1159 1160 while (cur_offset < item_size) { 1161 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1162 1163 victim_name_len = btrfs_inode_extref_name_len(leaf, extref); 1164 1165 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1166 goto next; 1167 1168 victim_name = kmalloc(victim_name_len, GFP_NOFS); 1169 if (!victim_name) 1170 return -ENOMEM; 1171 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, 1172 victim_name_len); 1173 1174 search_key.objectid = inode_objectid; 1175 search_key.type = BTRFS_INODE_EXTREF_KEY; 1176 search_key.offset = btrfs_extref_hash(parent_objectid, 1177 victim_name, 1178 victim_name_len); 1179 ret = backref_in_log(log_root, &search_key, 1180 parent_objectid, victim_name, 1181 victim_name_len); 1182 if (ret < 0) { 1183 kfree(victim_name); 1184 return ret; 1185 } else if (!ret) { 1186 ret = -ENOENT; 1187 victim_parent = read_one_inode(root, 1188 parent_objectid); 1189 if (victim_parent) { 1190 inc_nlink(&inode->vfs_inode); 1191 btrfs_release_path(path); 1192 1193 ret = unlink_inode_for_log_replay(trans, 1194 BTRFS_I(victim_parent), 1195 inode, 1196 victim_name, 1197 victim_name_len); 1198 } 1199 iput(victim_parent); 1200 kfree(victim_name); 1201 if (ret) 1202 return ret; 1203 *search_done = 1; 1204 goto again; 1205 } 1206 kfree(victim_name); 1207 next: 1208 cur_offset += victim_name_len + sizeof(*extref); 1209 } 1210 *search_done = 1; 1211 } 1212 btrfs_release_path(path); 1213 1214 /* look for a conflicting sequence number */ 1215 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), 1216 ref_index, name, namelen, 0); 1217 if (IS_ERR(di)) { 1218 return PTR_ERR(di); 1219 } else if (di) { 1220 ret = drop_one_dir_item(trans, path, dir, di); 1221 if (ret) 1222 
return ret; 1223 } 1224 btrfs_release_path(path); 1225 1226 /* look for a conflicting name */ 1227 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), 1228 name, namelen, 0); 1229 if (IS_ERR(di)) { 1230 return PTR_ERR(di); 1231 } else if (di) { 1232 ret = drop_one_dir_item(trans, path, dir, di); 1233 if (ret) 1234 return ret; 1235 } 1236 btrfs_release_path(path); 1237 1238 return 0; 1239 } 1240 1241 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1242 u32 *namelen, char **name, u64 *index, 1243 u64 *parent_objectid) 1244 { 1245 struct btrfs_inode_extref *extref; 1246 1247 extref = (struct btrfs_inode_extref *)ref_ptr; 1248 1249 *namelen = btrfs_inode_extref_name_len(eb, extref); 1250 *name = kmalloc(*namelen, GFP_NOFS); 1251 if (*name == NULL) 1252 return -ENOMEM; 1253 1254 read_extent_buffer(eb, *name, (unsigned long)&extref->name, 1255 *namelen); 1256 1257 if (index) 1258 *index = btrfs_inode_extref_index(eb, extref); 1259 if (parent_objectid) 1260 *parent_objectid = btrfs_inode_extref_parent(eb, extref); 1261 1262 return 0; 1263 } 1264 1265 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, 1266 u32 *namelen, char **name, u64 *index) 1267 { 1268 struct btrfs_inode_ref *ref; 1269 1270 ref = (struct btrfs_inode_ref *)ref_ptr; 1271 1272 *namelen = btrfs_inode_ref_name_len(eb, ref); 1273 *name = kmalloc(*namelen, GFP_NOFS); 1274 if (*name == NULL) 1275 return -ENOMEM; 1276 1277 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); 1278 1279 if (index) 1280 *index = btrfs_inode_ref_index(eb, ref); 1281 1282 return 0; 1283 } 1284 1285 /* 1286 * Take an inode reference item from the log tree and iterate all names from the 1287 * inode reference item in the subvolume tree with the same key (if it exists). 1288 * For any name that is not in the inode reference item from the log tree, do a 1289 * proper unlink of that name (that is, remove its entry from the inode 1290 * reference item and both dir index keys). 
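 *
 * Illustrative example: if the subvolume's reference item for the inode lists
 * the names "a" and "b" under the same parent directory but the item in the
 * log only lists "a", then "b" is unlinked here (its dir item, dir index item
 * and inode reference are removed) before the log item overwrites the
 * subvolume item.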
1291 */ 1292 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, 1293 struct btrfs_root *root, 1294 struct btrfs_path *path, 1295 struct btrfs_inode *inode, 1296 struct extent_buffer *log_eb, 1297 int log_slot, 1298 struct btrfs_key *key) 1299 { 1300 int ret; 1301 unsigned long ref_ptr; 1302 unsigned long ref_end; 1303 struct extent_buffer *eb; 1304 1305 again: 1306 btrfs_release_path(path); 1307 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 1308 if (ret > 0) { 1309 ret = 0; 1310 goto out; 1311 } 1312 if (ret < 0) 1313 goto out; 1314 1315 eb = path->nodes[0]; 1316 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 1317 ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); 1318 while (ref_ptr < ref_end) { 1319 char *name = NULL; 1320 int namelen; 1321 u64 parent_id; 1322 1323 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1324 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1325 NULL, &parent_id); 1326 } else { 1327 parent_id = key->offset; 1328 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1329 NULL); 1330 } 1331 if (ret) 1332 goto out; 1333 1334 if (key->type == BTRFS_INODE_EXTREF_KEY) 1335 ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, 1336 parent_id, name, 1337 namelen); 1338 else 1339 ret = !!btrfs_find_name_in_backref(log_eb, log_slot, 1340 name, namelen); 1341 1342 if (!ret) { 1343 struct inode *dir; 1344 1345 btrfs_release_path(path); 1346 dir = read_one_inode(root, parent_id); 1347 if (!dir) { 1348 ret = -ENOENT; 1349 kfree(name); 1350 goto out; 1351 } 1352 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), 1353 inode, name, namelen); 1354 kfree(name); 1355 iput(dir); 1356 if (ret) 1357 goto out; 1358 goto again; 1359 } 1360 1361 kfree(name); 1362 ref_ptr += namelen; 1363 if (key->type == BTRFS_INODE_EXTREF_KEY) 1364 ref_ptr += sizeof(struct btrfs_inode_extref); 1365 else 1366 ref_ptr += sizeof(struct btrfs_inode_ref); 1367 } 1368 ret = 0; 1369 out: 1370 btrfs_release_path(path); 1371 return ret; 1372 } 1373 1374 static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir, 1375 const u8 ref_type, const char *name, 1376 const int namelen) 1377 { 1378 struct btrfs_key key; 1379 struct btrfs_path *path; 1380 const u64 parent_id = btrfs_ino(BTRFS_I(dir)); 1381 int ret; 1382 1383 path = btrfs_alloc_path(); 1384 if (!path) 1385 return -ENOMEM; 1386 1387 key.objectid = btrfs_ino(BTRFS_I(inode)); 1388 key.type = ref_type; 1389 if (key.type == BTRFS_INODE_REF_KEY) 1390 key.offset = parent_id; 1391 else 1392 key.offset = btrfs_extref_hash(parent_id, name, namelen); 1393 1394 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0); 1395 if (ret < 0) 1396 goto out; 1397 if (ret > 0) { 1398 ret = 0; 1399 goto out; 1400 } 1401 if (key.type == BTRFS_INODE_EXTREF_KEY) 1402 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], 1403 path->slots[0], parent_id, name, namelen); 1404 else 1405 ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], 1406 name, namelen); 1407 1408 out: 1409 btrfs_free_path(path); 1410 return ret; 1411 } 1412 1413 static int add_link(struct btrfs_trans_handle *trans, 1414 struct inode *dir, struct inode *inode, const char *name, 1415 int namelen, u64 ref_index) 1416 { 1417 struct btrfs_root *root = BTRFS_I(dir)->root; 1418 struct btrfs_dir_item *dir_item; 1419 struct btrfs_key key; 1420 struct btrfs_path *path; 1421 struct inode *other_inode = NULL; 1422 int ret; 1423 1424 path = btrfs_alloc_path(); 1425 if (!path) 1426 return -ENOMEM; 1427 1428 dir_item = btrfs_lookup_dir_item(NULL, 
root, path, 1429 btrfs_ino(BTRFS_I(dir)), 1430 name, namelen, 0); 1431 if (!dir_item) { 1432 btrfs_release_path(path); 1433 goto add_link; 1434 } else if (IS_ERR(dir_item)) { 1435 ret = PTR_ERR(dir_item); 1436 goto out; 1437 } 1438 1439 /* 1440 * Our inode's dentry collides with the dentry of another inode which is 1441 * in the log but not yet processed since it has a higher inode number. 1442 * So delete that other dentry. 1443 */ 1444 btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key); 1445 btrfs_release_path(path); 1446 other_inode = read_one_inode(root, key.objectid); 1447 if (!other_inode) { 1448 ret = -ENOENT; 1449 goto out; 1450 } 1451 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode), 1452 name, namelen); 1453 if (ret) 1454 goto out; 1455 /* 1456 * If we dropped the link count to 0, bump it so that later the iput() 1457 * on the inode will not free it. We will fixup the link count later. 1458 */ 1459 if (other_inode->i_nlink == 0) 1460 inc_nlink(other_inode); 1461 add_link: 1462 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 1463 name, namelen, 0, ref_index); 1464 out: 1465 iput(other_inode); 1466 btrfs_free_path(path); 1467 1468 return ret; 1469 } 1470 1471 /* 1472 * replay one inode back reference item found in the log tree. 1473 * eb, slot and key refer to the buffer and key found in the log tree. 1474 * root is the destination we are replaying into, and path is for temp 1475 * use by this function. (it should be released on return). 1476 */ 1477 static noinline int add_inode_ref(struct btrfs_trans_handle *trans, 1478 struct btrfs_root *root, 1479 struct btrfs_root *log, 1480 struct btrfs_path *path, 1481 struct extent_buffer *eb, int slot, 1482 struct btrfs_key *key) 1483 { 1484 struct inode *dir = NULL; 1485 struct inode *inode = NULL; 1486 unsigned long ref_ptr; 1487 unsigned long ref_end; 1488 char *name = NULL; 1489 int namelen; 1490 int ret; 1491 int search_done = 0; 1492 int log_ref_ver = 0; 1493 u64 parent_objectid; 1494 u64 inode_objectid; 1495 u64 ref_index = 0; 1496 int ref_struct_size; 1497 1498 ref_ptr = btrfs_item_ptr_offset(eb, slot); 1499 ref_end = ref_ptr + btrfs_item_size(eb, slot); 1500 1501 if (key->type == BTRFS_INODE_EXTREF_KEY) { 1502 struct btrfs_inode_extref *r; 1503 1504 ref_struct_size = sizeof(struct btrfs_inode_extref); 1505 log_ref_ver = 1; 1506 r = (struct btrfs_inode_extref *)ref_ptr; 1507 parent_objectid = btrfs_inode_extref_parent(eb, r); 1508 } else { 1509 ref_struct_size = sizeof(struct btrfs_inode_ref); 1510 parent_objectid = key->offset; 1511 } 1512 inode_objectid = key->objectid; 1513 1514 /* 1515 * it is possible that we didn't log all the parent directories 1516 * for a given inode. If we don't find the dir, just don't 1517 * copy the back ref in. The link count fixup code will take 1518 * care of the rest 1519 */ 1520 dir = read_one_inode(root, parent_objectid); 1521 if (!dir) { 1522 ret = -ENOENT; 1523 goto out; 1524 } 1525 1526 inode = read_one_inode(root, inode_objectid); 1527 if (!inode) { 1528 ret = -EIO; 1529 goto out; 1530 } 1531 1532 while (ref_ptr < ref_end) { 1533 if (log_ref_ver) { 1534 ret = extref_get_fields(eb, ref_ptr, &namelen, &name, 1535 &ref_index, &parent_objectid); 1536 /* 1537 * parent object can change from one array 1538 * item to another. 
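			 * Each extref element records its own parent
			 * objectid, so consecutive elements may name
			 * different parent directories and the dir inode
			 * has to be looked up again.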
1539 */ 1540 if (!dir) 1541 dir = read_one_inode(root, parent_objectid); 1542 if (!dir) { 1543 ret = -ENOENT; 1544 goto out; 1545 } 1546 } else { 1547 ret = ref_get_fields(eb, ref_ptr, &namelen, &name, 1548 &ref_index); 1549 } 1550 if (ret) 1551 goto out; 1552 1553 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), 1554 btrfs_ino(BTRFS_I(inode)), ref_index, 1555 name, namelen); 1556 if (ret < 0) { 1557 goto out; 1558 } else if (ret == 0) { 1559 /* 1560 * look for a conflicting back reference in the 1561 * metadata. if we find one we have to unlink that name 1562 * of the file before we add our new link. Later on, we 1563 * overwrite any existing back reference, and we don't 1564 * want to create dangling pointers in the directory. 1565 */ 1566 1567 if (!search_done) { 1568 ret = __add_inode_ref(trans, root, path, log, 1569 BTRFS_I(dir), 1570 BTRFS_I(inode), 1571 inode_objectid, 1572 parent_objectid, 1573 ref_index, name, namelen, 1574 &search_done); 1575 if (ret) { 1576 if (ret == 1) 1577 ret = 0; 1578 goto out; 1579 } 1580 } 1581 1582 /* 1583 * If a reference item already exists for this inode 1584 * with the same parent and name, but different index, 1585 * drop it and the corresponding directory index entries 1586 * from the parent before adding the new reference item 1587 * and dir index entries, otherwise we would fail with 1588 * -EEXIST returned from btrfs_add_link() below. 1589 */ 1590 ret = btrfs_inode_ref_exists(inode, dir, key->type, 1591 name, namelen); 1592 if (ret > 0) { 1593 ret = unlink_inode_for_log_replay(trans, 1594 BTRFS_I(dir), 1595 BTRFS_I(inode), 1596 name, namelen); 1597 /* 1598 * If we dropped the link count to 0, bump it so 1599 * that later the iput() on the inode will not 1600 * free it. We will fixup the link count later. 1601 */ 1602 if (!ret && inode->i_nlink == 0) 1603 inc_nlink(inode); 1604 } 1605 if (ret < 0) 1606 goto out; 1607 1608 /* insert our name */ 1609 ret = add_link(trans, dir, inode, name, namelen, 1610 ref_index); 1611 if (ret) 1612 goto out; 1613 1614 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1615 if (ret) 1616 goto out; 1617 } 1618 /* Else, ret == 1, we already have a perfect match, we're done. */ 1619 1620 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; 1621 kfree(name); 1622 name = NULL; 1623 if (log_ref_ver) { 1624 iput(dir); 1625 dir = NULL; 1626 } 1627 } 1628 1629 /* 1630 * Before we overwrite the inode reference item in the subvolume tree 1631 * with the item from the log tree, we must unlink all names from the 1632 * parent directory that are in the subvolume's tree inode reference 1633 * item, otherwise we end up with an inconsistent subvolume tree where 1634 * dir index entries exist for a name but there is no inode reference 1635 * item with the same name. 
1636 */ 1637 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, 1638 key); 1639 if (ret) 1640 goto out; 1641 1642 /* finally write the back reference in the inode */ 1643 ret = overwrite_item(trans, root, path, eb, slot, key); 1644 out: 1645 btrfs_release_path(path); 1646 kfree(name); 1647 iput(dir); 1648 iput(inode); 1649 return ret; 1650 } 1651 1652 static int count_inode_extrefs(struct btrfs_root *root, 1653 struct btrfs_inode *inode, struct btrfs_path *path) 1654 { 1655 int ret = 0; 1656 int name_len; 1657 unsigned int nlink = 0; 1658 u32 item_size; 1659 u32 cur_offset = 0; 1660 u64 inode_objectid = btrfs_ino(inode); 1661 u64 offset = 0; 1662 unsigned long ptr; 1663 struct btrfs_inode_extref *extref; 1664 struct extent_buffer *leaf; 1665 1666 while (1) { 1667 ret = btrfs_find_one_extref(root, inode_objectid, offset, path, 1668 &extref, &offset); 1669 if (ret) 1670 break; 1671 1672 leaf = path->nodes[0]; 1673 item_size = btrfs_item_size(leaf, path->slots[0]); 1674 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1675 cur_offset = 0; 1676 1677 while (cur_offset < item_size) { 1678 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1679 name_len = btrfs_inode_extref_name_len(leaf, extref); 1680 1681 nlink++; 1682 1683 cur_offset += name_len + sizeof(*extref); 1684 } 1685 1686 offset++; 1687 btrfs_release_path(path); 1688 } 1689 btrfs_release_path(path); 1690 1691 if (ret < 0 && ret != -ENOENT) 1692 return ret; 1693 return nlink; 1694 } 1695 1696 static int count_inode_refs(struct btrfs_root *root, 1697 struct btrfs_inode *inode, struct btrfs_path *path) 1698 { 1699 int ret; 1700 struct btrfs_key key; 1701 unsigned int nlink = 0; 1702 unsigned long ptr; 1703 unsigned long ptr_end; 1704 int name_len; 1705 u64 ino = btrfs_ino(inode); 1706 1707 key.objectid = ino; 1708 key.type = BTRFS_INODE_REF_KEY; 1709 key.offset = (u64)-1; 1710 1711 while (1) { 1712 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1713 if (ret < 0) 1714 break; 1715 if (ret > 0) { 1716 if (path->slots[0] == 0) 1717 break; 1718 path->slots[0]--; 1719 } 1720 process_slot: 1721 btrfs_item_key_to_cpu(path->nodes[0], &key, 1722 path->slots[0]); 1723 if (key.objectid != ino || 1724 key.type != BTRFS_INODE_REF_KEY) 1725 break; 1726 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 1727 ptr_end = ptr + btrfs_item_size(path->nodes[0], 1728 path->slots[0]); 1729 while (ptr < ptr_end) { 1730 struct btrfs_inode_ref *ref; 1731 1732 ref = (struct btrfs_inode_ref *)ptr; 1733 name_len = btrfs_inode_ref_name_len(path->nodes[0], 1734 ref); 1735 ptr = (unsigned long)(ref + 1) + name_len; 1736 nlink++; 1737 } 1738 1739 if (key.offset == 0) 1740 break; 1741 if (path->slots[0] > 0) { 1742 path->slots[0]--; 1743 goto process_slot; 1744 } 1745 key.offset--; 1746 btrfs_release_path(path); 1747 } 1748 btrfs_release_path(path); 1749 1750 return nlink; 1751 } 1752 1753 /* 1754 * There are a few corners where the link count of the file can't 1755 * be properly maintained during replay. So, instead of adding 1756 * lots of complexity to the log code, we just scan the backrefs 1757 * for any file that has been through replay. 1758 * 1759 * The scan will update the link count on the inode to reflect the 1760 * number of back refs found. If it goes down to zero, the iput 1761 * will free the inode. 
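 *
 * The new count is the number of names found in the inode's INODE_REF items
 * plus the number found in its INODE_EXTREF items. If the result is zero, an
 * orphan item is inserted for the inode and, for directories, any remaining
 * entries are first removed via replay_dir_deletes().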
1762 */ 1763 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, 1764 struct btrfs_root *root, 1765 struct inode *inode) 1766 { 1767 struct btrfs_path *path; 1768 int ret; 1769 u64 nlink = 0; 1770 u64 ino = btrfs_ino(BTRFS_I(inode)); 1771 1772 path = btrfs_alloc_path(); 1773 if (!path) 1774 return -ENOMEM; 1775 1776 ret = count_inode_refs(root, BTRFS_I(inode), path); 1777 if (ret < 0) 1778 goto out; 1779 1780 nlink = ret; 1781 1782 ret = count_inode_extrefs(root, BTRFS_I(inode), path); 1783 if (ret < 0) 1784 goto out; 1785 1786 nlink += ret; 1787 1788 ret = 0; 1789 1790 if (nlink != inode->i_nlink) { 1791 set_nlink(inode, nlink); 1792 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1793 if (ret) 1794 goto out; 1795 } 1796 BTRFS_I(inode)->index_cnt = (u64)-1; 1797 1798 if (inode->i_nlink == 0) { 1799 if (S_ISDIR(inode->i_mode)) { 1800 ret = replay_dir_deletes(trans, root, NULL, path, 1801 ino, 1); 1802 if (ret) 1803 goto out; 1804 } 1805 ret = btrfs_insert_orphan_item(trans, root, ino); 1806 if (ret == -EEXIST) 1807 ret = 0; 1808 } 1809 1810 out: 1811 btrfs_free_path(path); 1812 return ret; 1813 } 1814 1815 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, 1816 struct btrfs_root *root, 1817 struct btrfs_path *path) 1818 { 1819 int ret; 1820 struct btrfs_key key; 1821 struct inode *inode; 1822 1823 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1824 key.type = BTRFS_ORPHAN_ITEM_KEY; 1825 key.offset = (u64)-1; 1826 while (1) { 1827 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1828 if (ret < 0) 1829 break; 1830 1831 if (ret == 1) { 1832 ret = 0; 1833 if (path->slots[0] == 0) 1834 break; 1835 path->slots[0]--; 1836 } 1837 1838 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1839 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || 1840 key.type != BTRFS_ORPHAN_ITEM_KEY) 1841 break; 1842 1843 ret = btrfs_del_item(trans, root, path); 1844 if (ret) 1845 break; 1846 1847 btrfs_release_path(path); 1848 inode = read_one_inode(root, key.offset); 1849 if (!inode) { 1850 ret = -EIO; 1851 break; 1852 } 1853 1854 ret = fixup_inode_link_count(trans, root, inode); 1855 iput(inode); 1856 if (ret) 1857 break; 1858 1859 /* 1860 * fixup on a directory may create new entries, 1861 * make sure we always look for the highset possible 1862 * offset 1863 */ 1864 key.offset = (u64)-1; 1865 } 1866 btrfs_release_path(path); 1867 return ret; 1868 } 1869 1870 1871 /* 1872 * record a given inode in the fixup dir so we can check its link 1873 * count when replay is done. 
The link count is incremented here 1874 * so the inode won't go away until we check it 1875 */ 1876 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, 1877 struct btrfs_root *root, 1878 struct btrfs_path *path, 1879 u64 objectid) 1880 { 1881 struct btrfs_key key; 1882 int ret = 0; 1883 struct inode *inode; 1884 1885 inode = read_one_inode(root, objectid); 1886 if (!inode) 1887 return -EIO; 1888 1889 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1890 key.type = BTRFS_ORPHAN_ITEM_KEY; 1891 key.offset = objectid; 1892 1893 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1894 1895 btrfs_release_path(path); 1896 if (ret == 0) { 1897 if (!inode->i_nlink) 1898 set_nlink(inode, 1); 1899 else 1900 inc_nlink(inode); 1901 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 1902 } else if (ret == -EEXIST) { 1903 ret = 0; 1904 } 1905 iput(inode); 1906 1907 return ret; 1908 } 1909 1910 /* 1911 * when replaying the log for a directory, we only insert names 1912 * for inodes that actually exist. This means an fsync on a directory 1913 * does not implicitly fsync all the new files in it 1914 */ 1915 static noinline int insert_one_name(struct btrfs_trans_handle *trans, 1916 struct btrfs_root *root, 1917 u64 dirid, u64 index, 1918 char *name, int name_len, 1919 struct btrfs_key *location) 1920 { 1921 struct inode *inode; 1922 struct inode *dir; 1923 int ret; 1924 1925 inode = read_one_inode(root, location->objectid); 1926 if (!inode) 1927 return -ENOENT; 1928 1929 dir = read_one_inode(root, dirid); 1930 if (!dir) { 1931 iput(inode); 1932 return -EIO; 1933 } 1934 1935 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 1936 name_len, 1, index); 1937 1938 /* FIXME, put inode into FIXUP list */ 1939 1940 iput(inode); 1941 iput(dir); 1942 return ret; 1943 } 1944 1945 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, 1946 struct btrfs_inode *dir, 1947 struct btrfs_path *path, 1948 struct btrfs_dir_item *dst_di, 1949 const struct btrfs_key *log_key, 1950 u8 log_type, 1951 bool exists) 1952 { 1953 struct btrfs_key found_key; 1954 1955 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); 1956 /* The existing dentry points to the same inode, don't delete it. */ 1957 if (found_key.objectid == log_key->objectid && 1958 found_key.type == log_key->type && 1959 found_key.offset == log_key->offset && 1960 btrfs_dir_type(path->nodes[0], dst_di) == log_type) 1961 return 1; 1962 1963 /* 1964 * Don't drop the conflicting directory entry if the inode for the new 1965 * entry doesn't exist. 1966 */ 1967 if (!exists) 1968 return 0; 1969 1970 return drop_one_dir_item(trans, path, dir, dst_di); 1971 } 1972 1973 /* 1974 * take a single entry in a log directory item and replay it into 1975 * the subvolume. 1976 * 1977 * if a conflicting item exists in the subdirectory already, 1978 * the inode it points to is unlinked and put into the link count 1979 * fix up tree. 1980 * 1981 * If a name from the log points to a file or directory that does 1982 * not exist in the FS, it is skipped. fsyncs on directories 1983 * do not force down inodes inside that directory, just changes to the 1984 * names or unlinks in a directory. 1985 * 1986 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a 1987 * non-existing inode) and 1 if the name was replayed. 
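 *
 * For example, if the log has an index entry "foo" pointing to an inode that
 * was never created in the subvolume, the entry is skipped and 0 is returned;
 * when a new dentry is actually inserted, 1 is returned so that the caller
 * can queue a link count fixup for the target inode.
 */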
1988 */ 1989 static noinline int replay_one_name(struct btrfs_trans_handle *trans, 1990 struct btrfs_root *root, 1991 struct btrfs_path *path, 1992 struct extent_buffer *eb, 1993 struct btrfs_dir_item *di, 1994 struct btrfs_key *key) 1995 { 1996 char *name; 1997 int name_len; 1998 struct btrfs_dir_item *dir_dst_di; 1999 struct btrfs_dir_item *index_dst_di; 2000 bool dir_dst_matches = false; 2001 bool index_dst_matches = false; 2002 struct btrfs_key log_key; 2003 struct btrfs_key search_key; 2004 struct inode *dir; 2005 u8 log_type; 2006 bool exists; 2007 int ret; 2008 bool update_size = true; 2009 bool name_added = false; 2010 2011 dir = read_one_inode(root, key->objectid); 2012 if (!dir) 2013 return -EIO; 2014 2015 name_len = btrfs_dir_name_len(eb, di); 2016 name = kmalloc(name_len, GFP_NOFS); 2017 if (!name) { 2018 ret = -ENOMEM; 2019 goto out; 2020 } 2021 2022 log_type = btrfs_dir_type(eb, di); 2023 read_extent_buffer(eb, name, (unsigned long)(di + 1), 2024 name_len); 2025 2026 btrfs_dir_item_key_to_cpu(eb, di, &log_key); 2027 ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); 2028 btrfs_release_path(path); 2029 if (ret < 0) 2030 goto out; 2031 exists = (ret == 0); 2032 ret = 0; 2033 2034 dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 2035 name, name_len, 1); 2036 if (IS_ERR(dir_dst_di)) { 2037 ret = PTR_ERR(dir_dst_di); 2038 goto out; 2039 } else if (dir_dst_di) { 2040 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2041 dir_dst_di, &log_key, log_type, 2042 exists); 2043 if (ret < 0) 2044 goto out; 2045 dir_dst_matches = (ret == 1); 2046 } 2047 2048 btrfs_release_path(path); 2049 2050 index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, 2051 key->objectid, key->offset, 2052 name, name_len, 1); 2053 if (IS_ERR(index_dst_di)) { 2054 ret = PTR_ERR(index_dst_di); 2055 goto out; 2056 } else if (index_dst_di) { 2057 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, 2058 index_dst_di, &log_key, 2059 log_type, exists); 2060 if (ret < 0) 2061 goto out; 2062 index_dst_matches = (ret == 1); 2063 } 2064 2065 btrfs_release_path(path); 2066 2067 if (dir_dst_matches && index_dst_matches) { 2068 ret = 0; 2069 update_size = false; 2070 goto out; 2071 } 2072 2073 /* 2074 * Check if the inode reference exists in the log for the given name, 2075 * inode and parent inode 2076 */ 2077 search_key.objectid = log_key.objectid; 2078 search_key.type = BTRFS_INODE_REF_KEY; 2079 search_key.offset = key->objectid; 2080 ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); 2081 if (ret < 0) { 2082 goto out; 2083 } else if (ret) { 2084 /* The dentry will be added later. */ 2085 ret = 0; 2086 update_size = false; 2087 goto out; 2088 } 2089 2090 search_key.objectid = log_key.objectid; 2091 search_key.type = BTRFS_INODE_EXTREF_KEY; 2092 search_key.offset = key->objectid; 2093 ret = backref_in_log(root->log_root, &search_key, key->objectid, name, 2094 name_len); 2095 if (ret < 0) { 2096 goto out; 2097 } else if (ret) { 2098 /* The dentry will be added later. 
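 * The inode ref (or extref) found in the log for this name recreates the
 * entry once it is replayed by add_inode_ref(), during the
 * LOG_WALK_REPLAY_ALL stage.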
*/ 2099 ret = 0; 2100 update_size = false; 2101 goto out; 2102 } 2103 btrfs_release_path(path); 2104 ret = insert_one_name(trans, root, key->objectid, key->offset, 2105 name, name_len, &log_key); 2106 if (ret && ret != -ENOENT && ret != -EEXIST) 2107 goto out; 2108 if (!ret) 2109 name_added = true; 2110 update_size = false; 2111 ret = 0; 2112 2113 out: 2114 if (!ret && update_size) { 2115 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); 2116 ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); 2117 } 2118 kfree(name); 2119 iput(dir); 2120 if (!ret && name_added) 2121 ret = 1; 2122 return ret; 2123 } 2124 2125 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */ 2126 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, 2127 struct btrfs_root *root, 2128 struct btrfs_path *path, 2129 struct extent_buffer *eb, int slot, 2130 struct btrfs_key *key) 2131 { 2132 int ret; 2133 struct btrfs_dir_item *di; 2134 2135 /* We only log dir index keys, which only contain a single dir item. */ 2136 ASSERT(key->type == BTRFS_DIR_INDEX_KEY); 2137 2138 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2139 ret = replay_one_name(trans, root, path, eb, di, key); 2140 if (ret < 0) 2141 return ret; 2142 2143 /* 2144 * If this entry refers to a non-directory (directories cannot have a 2145 * link count > 1) and it was added in the transaction that was not 2146 * committed, make sure we fixup the link count of the inode the entry 2147 * points to. Otherwise something like the following would result in a 2148 * directory pointing to an inode with a wrong link count that does not account 2149 * for this dir entry: 2150 * 2151 * mkdir testdir 2152 * touch testdir/foo 2153 * touch testdir/bar 2154 * sync 2155 * 2156 * ln testdir/bar testdir/bar_link 2157 * ln testdir/foo testdir/foo_link 2158 * xfs_io -c "fsync" testdir/bar 2159 * 2160 * <power failure> 2161 * 2162 * mount fs, log replay happens 2163 * 2164 * File foo would remain with a link count of 1 when it has two entries 2165 * pointing to it in the directory testdir. This would make it impossible 2166 * to ever delete the parent directory as it would result in stale 2167 * dentries that can never be deleted. 2168 */ 2169 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { 2170 struct btrfs_path *fixup_path; 2171 struct btrfs_key di_key; 2172 2173 fixup_path = btrfs_alloc_path(); 2174 if (!fixup_path) 2175 return -ENOMEM; 2176 2177 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 2178 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid); 2179 btrfs_free_path(fixup_path); 2180 } 2181 2182 return ret; 2183 } 2184 2185 /* 2186 * directory replay has two parts. There are the standard directory 2187 * items in the log copied from the subvolume, and range items 2188 * created in the log while the subvolume was logged. 2189 * 2190 * The range items tell us which parts of the key space the log 2191 * is authoritative for. During replay, if a key in the subvolume 2192 * directory is in a logged range item, but not actually in the log 2193 * that means it was deleted from the directory before the fsync 2194 * and should be removed.
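 *
 * As a concrete (made up) example: if the log contains a
 * BTRFS_DIR_LOG_INDEX_KEY item for a directory with key offset 10 and a
 * btrfs_dir_log_end() value of 20, then the log is authoritative for dir
 * index values 10 through 20. Any BTRFS_DIR_INDEX_KEY found in the
 * subvolume tree inside that range with no matching entry in the log
 * (checked by check_item_in_log()) was deleted before the fsync, and
 * replay_dir_deletes() unlinks it during replay.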
2195 */ 2196 static noinline int find_dir_range(struct btrfs_root *root, 2197 struct btrfs_path *path, 2198 u64 dirid, 2199 u64 *start_ret, u64 *end_ret) 2200 { 2201 struct btrfs_key key; 2202 u64 found_end; 2203 struct btrfs_dir_log_item *item; 2204 int ret; 2205 int nritems; 2206 2207 if (*start_ret == (u64)-1) 2208 return 1; 2209 2210 key.objectid = dirid; 2211 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2212 key.offset = *start_ret; 2213 2214 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2215 if (ret < 0) 2216 goto out; 2217 if (ret > 0) { 2218 if (path->slots[0] == 0) 2219 goto out; 2220 path->slots[0]--; 2221 } 2222 if (ret != 0) 2223 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2224 2225 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2226 ret = 1; 2227 goto next; 2228 } 2229 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2230 struct btrfs_dir_log_item); 2231 found_end = btrfs_dir_log_end(path->nodes[0], item); 2232 2233 if (*start_ret >= key.offset && *start_ret <= found_end) { 2234 ret = 0; 2235 *start_ret = key.offset; 2236 *end_ret = found_end; 2237 goto out; 2238 } 2239 ret = 1; 2240 next: 2241 /* check the next slot in the tree to see if it is a valid item */ 2242 nritems = btrfs_header_nritems(path->nodes[0]); 2243 path->slots[0]++; 2244 if (path->slots[0] >= nritems) { 2245 ret = btrfs_next_leaf(root, path); 2246 if (ret) 2247 goto out; 2248 } 2249 2250 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2251 2252 if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) { 2253 ret = 1; 2254 goto out; 2255 } 2256 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2257 struct btrfs_dir_log_item); 2258 found_end = btrfs_dir_log_end(path->nodes[0], item); 2259 *start_ret = key.offset; 2260 *end_ret = found_end; 2261 ret = 0; 2262 out: 2263 btrfs_release_path(path); 2264 return ret; 2265 } 2266 2267 /* 2268 * this looks for a given directory item in the log. If the directory 2269 * item is not in the log, the item is removed and the inode it points 2270 * to is unlinked 2271 */ 2272 static noinline int check_item_in_log(struct btrfs_trans_handle *trans, 2273 struct btrfs_root *log, 2274 struct btrfs_path *path, 2275 struct btrfs_path *log_path, 2276 struct inode *dir, 2277 struct btrfs_key *dir_key) 2278 { 2279 struct btrfs_root *root = BTRFS_I(dir)->root; 2280 int ret; 2281 struct extent_buffer *eb; 2282 int slot; 2283 struct btrfs_dir_item *di; 2284 int name_len; 2285 char *name; 2286 struct inode *inode = NULL; 2287 struct btrfs_key location; 2288 2289 /* 2290 * Currently we only log dir index keys. Even if we replay a log created 2291 * by an older kernel that logged both dir index and dir item keys, all 2292 * we need to do is process the dir index keys, we (and our caller) can 2293 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
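 *
 * Also note that @log may be NULL here: replay_dir_deletes() forwards a
 * NULL log tree in its del_all mode (used by fixup_inode_link_count() when
 * a replayed directory ends up with a link count of zero), in which case
 * the log lookup below is skipped and every entry found is unlinked.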
2294 */ 2295 ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); 2296 2297 eb = path->nodes[0]; 2298 slot = path->slots[0]; 2299 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); 2300 name_len = btrfs_dir_name_len(eb, di); 2301 name = kmalloc(name_len, GFP_NOFS); 2302 if (!name) { 2303 ret = -ENOMEM; 2304 goto out; 2305 } 2306 2307 read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); 2308 2309 if (log) { 2310 struct btrfs_dir_item *log_di; 2311 2312 log_di = btrfs_lookup_dir_index_item(trans, log, log_path, 2313 dir_key->objectid, 2314 dir_key->offset, 2315 name, name_len, 0); 2316 if (IS_ERR(log_di)) { 2317 ret = PTR_ERR(log_di); 2318 goto out; 2319 } else if (log_di) { 2320 /* The dentry exists in the log, we have nothing to do. */ 2321 ret = 0; 2322 goto out; 2323 } 2324 } 2325 2326 btrfs_dir_item_key_to_cpu(eb, di, &location); 2327 btrfs_release_path(path); 2328 btrfs_release_path(log_path); 2329 inode = read_one_inode(root, location.objectid); 2330 if (!inode) { 2331 ret = -EIO; 2332 goto out; 2333 } 2334 2335 ret = link_to_fixup_dir(trans, root, path, location.objectid); 2336 if (ret) 2337 goto out; 2338 2339 inc_nlink(inode); 2340 ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), 2341 name, name_len); 2342 /* 2343 * Unlike dir item keys, dir index keys can only have one name (entry) in 2344 * them, as there are no key collisions since each key has a unique offset 2345 * (an index number), so we're done. 2346 */ 2347 out: 2348 btrfs_release_path(path); 2349 btrfs_release_path(log_path); 2350 kfree(name); 2351 iput(inode); 2352 return ret; 2353 } 2354 2355 static int replay_xattr_deletes(struct btrfs_trans_handle *trans, 2356 struct btrfs_root *root, 2357 struct btrfs_root *log, 2358 struct btrfs_path *path, 2359 const u64 ino) 2360 { 2361 struct btrfs_key search_key; 2362 struct btrfs_path *log_path; 2363 int i; 2364 int nritems; 2365 int ret; 2366 2367 log_path = btrfs_alloc_path(); 2368 if (!log_path) 2369 return -ENOMEM; 2370 2371 search_key.objectid = ino; 2372 search_key.type = BTRFS_XATTR_ITEM_KEY; 2373 search_key.offset = 0; 2374 again: 2375 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 2376 if (ret < 0) 2377 goto out; 2378 process_leaf: 2379 nritems = btrfs_header_nritems(path->nodes[0]); 2380 for (i = path->slots[0]; i < nritems; i++) { 2381 struct btrfs_key key; 2382 struct btrfs_dir_item *di; 2383 struct btrfs_dir_item *log_di; 2384 u32 total_size; 2385 u32 cur; 2386 2387 btrfs_item_key_to_cpu(path->nodes[0], &key, i); 2388 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { 2389 ret = 0; 2390 goto out; 2391 } 2392 2393 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); 2394 total_size = btrfs_item_size(path->nodes[0], i); 2395 cur = 0; 2396 while (cur < total_size) { 2397 u16 name_len = btrfs_dir_name_len(path->nodes[0], di); 2398 u16 data_len = btrfs_dir_data_len(path->nodes[0], di); 2399 u32 this_len = sizeof(*di) + name_len + data_len; 2400 char *name; 2401 2402 name = kmalloc(name_len, GFP_NOFS); 2403 if (!name) { 2404 ret = -ENOMEM; 2405 goto out; 2406 } 2407 read_extent_buffer(path->nodes[0], name, 2408 (unsigned long)(di + 1), name_len); 2409 2410 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, 2411 name, name_len, 0); 2412 btrfs_release_path(log_path); 2413 if (!log_di) { 2414 /* Doesn't exist in log tree, so delete it. 
*/ 2415 btrfs_release_path(path); 2416 di = btrfs_lookup_xattr(trans, root, path, ino, 2417 name, name_len, -1); 2418 kfree(name); 2419 if (IS_ERR(di)) { 2420 ret = PTR_ERR(di); 2421 goto out; 2422 } 2423 ASSERT(di); 2424 ret = btrfs_delete_one_dir_name(trans, root, 2425 path, di); 2426 if (ret) 2427 goto out; 2428 btrfs_release_path(path); 2429 search_key = key; 2430 goto again; 2431 } 2432 kfree(name); 2433 if (IS_ERR(log_di)) { 2434 ret = PTR_ERR(log_di); 2435 goto out; 2436 } 2437 cur += this_len; 2438 di = (struct btrfs_dir_item *)((char *)di + this_len); 2439 } 2440 } 2441 ret = btrfs_next_leaf(root, path); 2442 if (ret > 0) 2443 ret = 0; 2444 else if (ret == 0) 2445 goto process_leaf; 2446 out: 2447 btrfs_free_path(log_path); 2448 btrfs_release_path(path); 2449 return ret; 2450 } 2451 2452 2453 /* 2454 * deletion replay happens before we copy any new directory items 2455 * out of the log or out of backreferences from inodes. It 2456 * scans the log to find ranges of keys that log is authoritative for, 2457 * and then scans the directory to find items in those ranges that are 2458 * not present in the log. 2459 * 2460 * Anything we don't find in the log is unlinked and removed from the 2461 * directory. 2462 */ 2463 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, 2464 struct btrfs_root *root, 2465 struct btrfs_root *log, 2466 struct btrfs_path *path, 2467 u64 dirid, int del_all) 2468 { 2469 u64 range_start; 2470 u64 range_end; 2471 int ret = 0; 2472 struct btrfs_key dir_key; 2473 struct btrfs_key found_key; 2474 struct btrfs_path *log_path; 2475 struct inode *dir; 2476 2477 dir_key.objectid = dirid; 2478 dir_key.type = BTRFS_DIR_INDEX_KEY; 2479 log_path = btrfs_alloc_path(); 2480 if (!log_path) 2481 return -ENOMEM; 2482 2483 dir = read_one_inode(root, dirid); 2484 /* it isn't an error if the inode isn't there, that can happen 2485 * because we replay the deletes before we copy in the inode item 2486 * from the log 2487 */ 2488 if (!dir) { 2489 btrfs_free_path(log_path); 2490 return 0; 2491 } 2492 2493 range_start = 0; 2494 range_end = 0; 2495 while (1) { 2496 if (del_all) 2497 range_end = (u64)-1; 2498 else { 2499 ret = find_dir_range(log, path, dirid, 2500 &range_start, &range_end); 2501 if (ret < 0) 2502 goto out; 2503 else if (ret > 0) 2504 break; 2505 } 2506 2507 dir_key.offset = range_start; 2508 while (1) { 2509 int nritems; 2510 ret = btrfs_search_slot(NULL, root, &dir_key, path, 2511 0, 0); 2512 if (ret < 0) 2513 goto out; 2514 2515 nritems = btrfs_header_nritems(path->nodes[0]); 2516 if (path->slots[0] >= nritems) { 2517 ret = btrfs_next_leaf(root, path); 2518 if (ret == 1) 2519 break; 2520 else if (ret < 0) 2521 goto out; 2522 } 2523 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2524 path->slots[0]); 2525 if (found_key.objectid != dirid || 2526 found_key.type != dir_key.type) { 2527 ret = 0; 2528 goto out; 2529 } 2530 2531 if (found_key.offset > range_end) 2532 break; 2533 2534 ret = check_item_in_log(trans, log, path, 2535 log_path, dir, 2536 &found_key); 2537 if (ret) 2538 goto out; 2539 if (found_key.offset == (u64)-1) 2540 break; 2541 dir_key.offset = found_key.offset + 1; 2542 } 2543 btrfs_release_path(path); 2544 if (range_end == (u64)-1) 2545 break; 2546 range_start = range_end + 1; 2547 } 2548 ret = 0; 2549 out: 2550 btrfs_release_path(path); 2551 btrfs_free_path(log_path); 2552 iput(dir); 2553 return ret; 2554 } 2555 2556 /* 2557 * the process_func used to replay items from the log tree. This 2558 * gets called in two different stages. 
The first stage just looks 2559 * for inodes and makes sure they are all copied into the subvolume. 2560 * 2561 * The second stage copies all the other item types from the log into 2562 * the subvolume. The two stage approach is slower, but gets rid of 2563 * lots of complexity around inodes referencing other inodes that exist 2564 * only in the log (references come from either directory items or inode 2565 * back refs). 2566 */ 2567 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, 2568 struct walk_control *wc, u64 gen, int level) 2569 { 2570 int nritems; 2571 struct btrfs_path *path; 2572 struct btrfs_root *root = wc->replay_dest; 2573 struct btrfs_key key; 2574 int i; 2575 int ret; 2576 2577 ret = btrfs_read_extent_buffer(eb, gen, level, NULL); 2578 if (ret) 2579 return ret; 2580 2581 level = btrfs_header_level(eb); 2582 2583 if (level != 0) 2584 return 0; 2585 2586 path = btrfs_alloc_path(); 2587 if (!path) 2588 return -ENOMEM; 2589 2590 nritems = btrfs_header_nritems(eb); 2591 for (i = 0; i < nritems; i++) { 2592 btrfs_item_key_to_cpu(eb, &key, i); 2593 2594 /* inode keys are done during the first stage */ 2595 if (key.type == BTRFS_INODE_ITEM_KEY && 2596 wc->stage == LOG_WALK_REPLAY_INODES) { 2597 struct btrfs_inode_item *inode_item; 2598 u32 mode; 2599 2600 inode_item = btrfs_item_ptr(eb, i, 2601 struct btrfs_inode_item); 2602 /* 2603 * If we have a tmpfile (O_TMPFILE) that got fsync'ed 2604 * and never got linked before the fsync, skip it, as 2605 * replaying it is pointless since it would be deleted 2606 * later. We skip logging tmpfiles, but it's always 2607 * possible we are replaying a log created with a kernel 2608 * that used to log tmpfiles. 2609 */ 2610 if (btrfs_inode_nlink(eb, inode_item) == 0) { 2611 wc->ignore_cur_inode = true; 2612 continue; 2613 } else { 2614 wc->ignore_cur_inode = false; 2615 } 2616 ret = replay_xattr_deletes(wc->trans, root, log, 2617 path, key.objectid); 2618 if (ret) 2619 break; 2620 mode = btrfs_inode_mode(eb, inode_item); 2621 if (S_ISDIR(mode)) { 2622 ret = replay_dir_deletes(wc->trans, 2623 root, log, path, key.objectid, 0); 2624 if (ret) 2625 break; 2626 } 2627 ret = overwrite_item(wc->trans, root, path, 2628 eb, i, &key); 2629 if (ret) 2630 break; 2631 2632 /* 2633 * Before replaying extents, truncate the inode to its 2634 * size. We need to do it now and not after log replay 2635 * because before an fsync we can have prealloc extents 2636 * added beyond the inode's i_size. If we did it after, 2637 * through orphan cleanup for example, we would drop 2638 * those prealloc extents just after replaying them. 2639 */ 2640 if (S_ISREG(mode)) { 2641 struct btrfs_drop_extents_args drop_args = { 0 }; 2642 struct inode *inode; 2643 u64 from; 2644 2645 inode = read_one_inode(root, key.objectid); 2646 if (!inode) { 2647 ret = -EIO; 2648 break; 2649 } 2650 from = ALIGN(i_size_read(inode), 2651 root->fs_info->sectorsize); 2652 drop_args.start = from; 2653 drop_args.end = (u64)-1; 2654 drop_args.drop_cache = true; 2655 ret = btrfs_drop_extents(wc->trans, root, 2656 BTRFS_I(inode), 2657 &drop_args); 2658 if (!ret) { 2659 inode_sub_bytes(inode, 2660 drop_args.bytes_found); 2661 /* Update the inode's nbytes. 
*/ 2662 ret = btrfs_update_inode(wc->trans, 2663 root, BTRFS_I(inode)); 2664 } 2665 iput(inode); 2666 if (ret) 2667 break; 2668 } 2669 2670 ret = link_to_fixup_dir(wc->trans, root, 2671 path, key.objectid); 2672 if (ret) 2673 break; 2674 } 2675 2676 if (wc->ignore_cur_inode) 2677 continue; 2678 2679 if (key.type == BTRFS_DIR_INDEX_KEY && 2680 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { 2681 ret = replay_one_dir_item(wc->trans, root, path, 2682 eb, i, &key); 2683 if (ret) 2684 break; 2685 } 2686 2687 if (wc->stage < LOG_WALK_REPLAY_ALL) 2688 continue; 2689 2690 /* these keys are simply copied */ 2691 if (key.type == BTRFS_XATTR_ITEM_KEY) { 2692 ret = overwrite_item(wc->trans, root, path, 2693 eb, i, &key); 2694 if (ret) 2695 break; 2696 } else if (key.type == BTRFS_INODE_REF_KEY || 2697 key.type == BTRFS_INODE_EXTREF_KEY) { 2698 ret = add_inode_ref(wc->trans, root, log, path, 2699 eb, i, &key); 2700 if (ret && ret != -ENOENT) 2701 break; 2702 ret = 0; 2703 } else if (key.type == BTRFS_EXTENT_DATA_KEY) { 2704 ret = replay_one_extent(wc->trans, root, path, 2705 eb, i, &key); 2706 if (ret) 2707 break; 2708 } 2709 /* 2710 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the 2711 * BTRFS_DIR_INDEX_KEY items which we use to derive the 2712 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an 2713 * older kernel with such keys, ignore them. 2714 */ 2715 } 2716 btrfs_free_path(path); 2717 return ret; 2718 } 2719 2720 /* 2721 * Correctly adjust the reserved bytes occupied by a log tree extent buffer 2722 */ 2723 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) 2724 { 2725 struct btrfs_block_group *cache; 2726 2727 cache = btrfs_lookup_block_group(fs_info, start); 2728 if (!cache) { 2729 btrfs_err(fs_info, "unable to find block group for %llu", start); 2730 return; 2731 } 2732 2733 spin_lock(&cache->space_info->lock); 2734 spin_lock(&cache->lock); 2735 cache->reserved -= fs_info->nodesize; 2736 cache->space_info->bytes_reserved -= fs_info->nodesize; 2737 spin_unlock(&cache->lock); 2738 spin_unlock(&cache->space_info->lock); 2739 2740 btrfs_put_block_group(cache); 2741 } 2742 2743 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, 2744 struct btrfs_root *root, 2745 struct btrfs_path *path, int *level, 2746 struct walk_control *wc) 2747 { 2748 struct btrfs_fs_info *fs_info = root->fs_info; 2749 u64 bytenr; 2750 u64 ptr_gen; 2751 struct extent_buffer *next; 2752 struct extent_buffer *cur; 2753 u32 blocksize; 2754 int ret = 0; 2755 2756 while (*level > 0) { 2757 struct btrfs_key first_key; 2758 2759 cur = path->nodes[*level]; 2760 2761 WARN_ON(btrfs_header_level(cur) != *level); 2762 2763 if (path->slots[*level] >= 2764 btrfs_header_nritems(cur)) 2765 break; 2766 2767 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2768 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2769 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); 2770 blocksize = fs_info->nodesize; 2771 2772 next = btrfs_find_create_tree_block(fs_info, bytenr, 2773 btrfs_header_owner(cur), 2774 *level - 1); 2775 if (IS_ERR(next)) 2776 return PTR_ERR(next); 2777 2778 if (*level == 1) { 2779 ret = wc->process_func(root, next, wc, ptr_gen, 2780 *level - 1); 2781 if (ret) { 2782 free_extent_buffer(next); 2783 return ret; 2784 } 2785 2786 path->slots[*level]++; 2787 if (wc->free) { 2788 ret = btrfs_read_extent_buffer(next, ptr_gen, 2789 *level - 1, &first_key); 2790 if (ret) { 2791 free_extent_buffer(next); 2792 return ret; 2793 } 2794 2795 if (trans) { 2796 
btrfs_tree_lock(next); 2797 btrfs_clean_tree_block(next); 2798 btrfs_wait_tree_block_writeback(next); 2799 btrfs_tree_unlock(next); 2800 ret = btrfs_pin_reserved_extent(trans, 2801 bytenr, blocksize); 2802 if (ret) { 2803 free_extent_buffer(next); 2804 return ret; 2805 } 2806 btrfs_redirty_list_add( 2807 trans->transaction, next); 2808 } else { 2809 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2810 clear_extent_buffer_dirty(next); 2811 unaccount_log_buffer(fs_info, bytenr); 2812 } 2813 } 2814 free_extent_buffer(next); 2815 continue; 2816 } 2817 ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); 2818 if (ret) { 2819 free_extent_buffer(next); 2820 return ret; 2821 } 2822 2823 if (path->nodes[*level-1]) 2824 free_extent_buffer(path->nodes[*level-1]); 2825 path->nodes[*level-1] = next; 2826 *level = btrfs_header_level(next); 2827 path->slots[*level] = 0; 2828 cond_resched(); 2829 } 2830 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); 2831 2832 cond_resched(); 2833 return 0; 2834 } 2835 2836 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, 2837 struct btrfs_root *root, 2838 struct btrfs_path *path, int *level, 2839 struct walk_control *wc) 2840 { 2841 struct btrfs_fs_info *fs_info = root->fs_info; 2842 int i; 2843 int slot; 2844 int ret; 2845 2846 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 2847 slot = path->slots[i]; 2848 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 2849 path->slots[i]++; 2850 *level = i; 2851 WARN_ON(*level == 0); 2852 return 0; 2853 } else { 2854 ret = wc->process_func(root, path->nodes[*level], wc, 2855 btrfs_header_generation(path->nodes[*level]), 2856 *level); 2857 if (ret) 2858 return ret; 2859 2860 if (wc->free) { 2861 struct extent_buffer *next; 2862 2863 next = path->nodes[*level]; 2864 2865 if (trans) { 2866 btrfs_tree_lock(next); 2867 btrfs_clean_tree_block(next); 2868 btrfs_wait_tree_block_writeback(next); 2869 btrfs_tree_unlock(next); 2870 ret = btrfs_pin_reserved_extent(trans, 2871 path->nodes[*level]->start, 2872 path->nodes[*level]->len); 2873 if (ret) 2874 return ret; 2875 btrfs_redirty_list_add(trans->transaction, 2876 next); 2877 } else { 2878 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2879 clear_extent_buffer_dirty(next); 2880 2881 unaccount_log_buffer(fs_info, 2882 path->nodes[*level]->start); 2883 } 2884 } 2885 free_extent_buffer(path->nodes[*level]); 2886 path->nodes[*level] = NULL; 2887 *level = i + 1; 2888 } 2889 } 2890 return 1; 2891 } 2892 2893 /* 2894 * drop the reference count on the tree rooted at 'snap'. This traverses 2895 * the tree freeing any blocks that have a ref count of zero after being 2896 * decremented. 
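 *
 * A minimal usage sketch, mirroring what free_log_tree() further below
 * does when disposing of a log tree:
 *
 *	struct walk_control wc = {
 *		.free = 1,
 *		.process_func = process_one_buffer
 *	};
 *
 *	ret = walk_log_tree(trans, log, &wc);
 *
 * The same walker also drives log replay, just with a process_func
 * (replay_one_buffer()) that copies items back into the subvolume instead
 * of freeing the log blocks.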
2897 */ 2898 static int walk_log_tree(struct btrfs_trans_handle *trans, 2899 struct btrfs_root *log, struct walk_control *wc) 2900 { 2901 struct btrfs_fs_info *fs_info = log->fs_info; 2902 int ret = 0; 2903 int wret; 2904 int level; 2905 struct btrfs_path *path; 2906 int orig_level; 2907 2908 path = btrfs_alloc_path(); 2909 if (!path) 2910 return -ENOMEM; 2911 2912 level = btrfs_header_level(log->node); 2913 orig_level = level; 2914 path->nodes[level] = log->node; 2915 atomic_inc(&log->node->refs); 2916 path->slots[level] = 0; 2917 2918 while (1) { 2919 wret = walk_down_log_tree(trans, log, path, &level, wc); 2920 if (wret > 0) 2921 break; 2922 if (wret < 0) { 2923 ret = wret; 2924 goto out; 2925 } 2926 2927 wret = walk_up_log_tree(trans, log, path, &level, wc); 2928 if (wret > 0) 2929 break; 2930 if (wret < 0) { 2931 ret = wret; 2932 goto out; 2933 } 2934 } 2935 2936 /* was the root node processed? if not, catch it here */ 2937 if (path->nodes[orig_level]) { 2938 ret = wc->process_func(log, path->nodes[orig_level], wc, 2939 btrfs_header_generation(path->nodes[orig_level]), 2940 orig_level); 2941 if (ret) 2942 goto out; 2943 if (wc->free) { 2944 struct extent_buffer *next; 2945 2946 next = path->nodes[orig_level]; 2947 2948 if (trans) { 2949 btrfs_tree_lock(next); 2950 btrfs_clean_tree_block(next); 2951 btrfs_wait_tree_block_writeback(next); 2952 btrfs_tree_unlock(next); 2953 ret = btrfs_pin_reserved_extent(trans, 2954 next->start, next->len); 2955 if (ret) 2956 goto out; 2957 btrfs_redirty_list_add(trans->transaction, next); 2958 } else { 2959 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) 2960 clear_extent_buffer_dirty(next); 2961 unaccount_log_buffer(fs_info, next->start); 2962 } 2963 } 2964 } 2965 2966 out: 2967 btrfs_free_path(path); 2968 return ret; 2969 } 2970 2971 /* 2972 * helper function to update the item for a given subvolumes log root 2973 * in the tree of log roots 2974 */ 2975 static int update_log_root(struct btrfs_trans_handle *trans, 2976 struct btrfs_root *log, 2977 struct btrfs_root_item *root_item) 2978 { 2979 struct btrfs_fs_info *fs_info = log->fs_info; 2980 int ret; 2981 2982 if (log->log_transid == 1) { 2983 /* insert root item on the first sync */ 2984 ret = btrfs_insert_root(trans, fs_info->log_root_tree, 2985 &log->root_key, root_item); 2986 } else { 2987 ret = btrfs_update_root(trans, fs_info->log_root_tree, 2988 &log->root_key, root_item); 2989 } 2990 return ret; 2991 } 2992 2993 static void wait_log_commit(struct btrfs_root *root, int transid) 2994 { 2995 DEFINE_WAIT(wait); 2996 int index = transid % 2; 2997 2998 /* 2999 * we only allow two pending log transactions at a time, 3000 * so we know that if ours is more than 2 older than the 3001 * current transaction, we're done 3002 */ 3003 for (;;) { 3004 prepare_to_wait(&root->log_commit_wait[index], 3005 &wait, TASK_UNINTERRUPTIBLE); 3006 3007 if (!(root->log_transid_committed < transid && 3008 atomic_read(&root->log_commit[index]))) 3009 break; 3010 3011 mutex_unlock(&root->log_mutex); 3012 schedule(); 3013 mutex_lock(&root->log_mutex); 3014 } 3015 finish_wait(&root->log_commit_wait[index], &wait); 3016 } 3017 3018 static void wait_for_writer(struct btrfs_root *root) 3019 { 3020 DEFINE_WAIT(wait); 3021 3022 for (;;) { 3023 prepare_to_wait(&root->log_writer_wait, &wait, 3024 TASK_UNINTERRUPTIBLE); 3025 if (!atomic_read(&root->log_writers)) 3026 break; 3027 3028 mutex_unlock(&root->log_mutex); 3029 schedule(); 3030 mutex_lock(&root->log_mutex); 3031 } 3032 finish_wait(&root->log_writer_wait, 
&wait); 3033 } 3034 3035 static inline void btrfs_remove_log_ctx(struct btrfs_root *root, 3036 struct btrfs_log_ctx *ctx) 3037 { 3038 mutex_lock(&root->log_mutex); 3039 list_del_init(&ctx->list); 3040 mutex_unlock(&root->log_mutex); 3041 } 3042 3043 /* 3044 * Invoked in log mutex context, or be sure there is no other task which 3045 * can access the list. 3046 */ 3047 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, 3048 int index, int error) 3049 { 3050 struct btrfs_log_ctx *ctx; 3051 struct btrfs_log_ctx *safe; 3052 3053 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) { 3054 list_del_init(&ctx->list); 3055 ctx->log_ret = error; 3056 } 3057 } 3058 3059 /* 3060 * btrfs_sync_log does sends a given tree log down to the disk and 3061 * updates the super blocks to record it. When this call is done, 3062 * you know that any inodes previously logged are safely on disk only 3063 * if it returns 0. 3064 * 3065 * Any other return value means you need to call btrfs_commit_transaction. 3066 * Some of the edge cases for fsyncing directories that have had unlinks 3067 * or renames done in the past mean that sometimes the only safe 3068 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN, 3069 * that has happened. 3070 */ 3071 int btrfs_sync_log(struct btrfs_trans_handle *trans, 3072 struct btrfs_root *root, struct btrfs_log_ctx *ctx) 3073 { 3074 int index1; 3075 int index2; 3076 int mark; 3077 int ret; 3078 struct btrfs_fs_info *fs_info = root->fs_info; 3079 struct btrfs_root *log = root->log_root; 3080 struct btrfs_root *log_root_tree = fs_info->log_root_tree; 3081 struct btrfs_root_item new_root_item; 3082 int log_transid = 0; 3083 struct btrfs_log_ctx root_log_ctx; 3084 struct blk_plug plug; 3085 u64 log_root_start; 3086 u64 log_root_level; 3087 3088 mutex_lock(&root->log_mutex); 3089 log_transid = ctx->log_transid; 3090 if (root->log_transid_committed >= log_transid) { 3091 mutex_unlock(&root->log_mutex); 3092 return ctx->log_ret; 3093 } 3094 3095 index1 = log_transid % 2; 3096 if (atomic_read(&root->log_commit[index1])) { 3097 wait_log_commit(root, log_transid); 3098 mutex_unlock(&root->log_mutex); 3099 return ctx->log_ret; 3100 } 3101 ASSERT(log_transid == root->log_transid); 3102 atomic_set(&root->log_commit[index1], 1); 3103 3104 /* wait for previous tree log sync to complete */ 3105 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 3106 wait_log_commit(root, log_transid - 1); 3107 3108 while (1) { 3109 int batch = atomic_read(&root->log_batch); 3110 /* when we're on an ssd, just kick the log commit out */ 3111 if (!btrfs_test_opt(fs_info, SSD) && 3112 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { 3113 mutex_unlock(&root->log_mutex); 3114 schedule_timeout_uninterruptible(1); 3115 mutex_lock(&root->log_mutex); 3116 } 3117 wait_for_writer(root); 3118 if (batch == atomic_read(&root->log_batch)) 3119 break; 3120 } 3121 3122 /* bail out if we need to do a full commit */ 3123 if (btrfs_need_log_full_commit(trans)) { 3124 ret = BTRFS_LOG_FORCE_COMMIT; 3125 mutex_unlock(&root->log_mutex); 3126 goto out; 3127 } 3128 3129 if (log_transid % 2 == 0) 3130 mark = EXTENT_DIRTY; 3131 else 3132 mark = EXTENT_NEW; 3133 3134 /* we start IO on all the marked extents here, but we don't actually 3135 * wait for them until later. 
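 * The @mark bit alternates between EXTENT_DIRTY and EXTENT_NEW based on
 * the parity of the log transid (set just above), so the two log commits
 * that may be in flight at any time do not step on each other's dirty
 * ranges; the actual wait happens further down through
 * btrfs_wait_tree_log_extents(log, mark).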
3136 */ 3137 blk_start_plug(&plug); 3138 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); 3139 /* 3140 * -EAGAIN happens when someone, e.g., a concurrent transaction 3141 * commit, writes a dirty extent in this tree-log commit. This 3142 * concurrent write will create a hole writing out the extents, 3143 * and we cannot proceed on a zoned filesystem, requiring 3144 * sequential writing. While we can bail out to a full commit 3145 * here, but we can continue hoping the concurrent writing fills 3146 * the hole. 3147 */ 3148 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) 3149 ret = 0; 3150 if (ret) { 3151 blk_finish_plug(&plug); 3152 btrfs_abort_transaction(trans, ret); 3153 btrfs_set_log_full_commit(trans); 3154 mutex_unlock(&root->log_mutex); 3155 goto out; 3156 } 3157 3158 /* 3159 * We _must_ update under the root->log_mutex in order to make sure we 3160 * have a consistent view of the log root we are trying to commit at 3161 * this moment. 3162 * 3163 * We _must_ copy this into a local copy, because we are not holding the 3164 * log_root_tree->log_mutex yet. This is important because when we 3165 * commit the log_root_tree we must have a consistent view of the 3166 * log_root_tree when we update the super block to point at the 3167 * log_root_tree bytenr. If we update the log_root_tree here we'll race 3168 * with the commit and possibly point at the new block which we may not 3169 * have written out. 3170 */ 3171 btrfs_set_root_node(&log->root_item, log->node); 3172 memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); 3173 3174 root->log_transid++; 3175 log->log_transid = root->log_transid; 3176 root->log_start_pid = 0; 3177 /* 3178 * IO has been started, blocks of the log tree have WRITTEN flag set 3179 * in their headers. new modifications of the log will be written to 3180 * new positions. so it's safe to allow log writers to go in. 3181 */ 3182 mutex_unlock(&root->log_mutex); 3183 3184 if (btrfs_is_zoned(fs_info)) { 3185 mutex_lock(&fs_info->tree_root->log_mutex); 3186 if (!log_root_tree->node) { 3187 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); 3188 if (ret) { 3189 mutex_unlock(&fs_info->tree_root->log_mutex); 3190 blk_finish_plug(&plug); 3191 goto out; 3192 } 3193 } 3194 mutex_unlock(&fs_info->tree_root->log_mutex); 3195 } 3196 3197 btrfs_init_log_ctx(&root_log_ctx, NULL); 3198 3199 mutex_lock(&log_root_tree->log_mutex); 3200 3201 index2 = log_root_tree->log_transid % 2; 3202 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3203 root_log_ctx.log_transid = log_root_tree->log_transid; 3204 3205 /* 3206 * Now we are safe to update the log_root_tree because we're under the 3207 * log_mutex, and we're a current writer so we're holding the commit 3208 * open until we drop the log_mutex. 
3209 */ 3210 ret = update_log_root(trans, log, &new_root_item); 3211 if (ret) { 3212 if (!list_empty(&root_log_ctx.list)) 3213 list_del_init(&root_log_ctx.list); 3214 3215 blk_finish_plug(&plug); 3216 btrfs_set_log_full_commit(trans); 3217 3218 if (ret != -ENOSPC) { 3219 btrfs_abort_transaction(trans, ret); 3220 mutex_unlock(&log_root_tree->log_mutex); 3221 goto out; 3222 } 3223 btrfs_wait_tree_log_extents(log, mark); 3224 mutex_unlock(&log_root_tree->log_mutex); 3225 ret = BTRFS_LOG_FORCE_COMMIT; 3226 goto out; 3227 } 3228 3229 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 3230 blk_finish_plug(&plug); 3231 list_del_init(&root_log_ctx.list); 3232 mutex_unlock(&log_root_tree->log_mutex); 3233 ret = root_log_ctx.log_ret; 3234 goto out; 3235 } 3236 3237 index2 = root_log_ctx.log_transid % 2; 3238 if (atomic_read(&log_root_tree->log_commit[index2])) { 3239 blk_finish_plug(&plug); 3240 ret = btrfs_wait_tree_log_extents(log, mark); 3241 wait_log_commit(log_root_tree, 3242 root_log_ctx.log_transid); 3243 mutex_unlock(&log_root_tree->log_mutex); 3244 if (!ret) 3245 ret = root_log_ctx.log_ret; 3246 goto out; 3247 } 3248 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); 3249 atomic_set(&log_root_tree->log_commit[index2], 1); 3250 3251 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { 3252 wait_log_commit(log_root_tree, 3253 root_log_ctx.log_transid - 1); 3254 } 3255 3256 /* 3257 * now that we've moved on to the tree of log tree roots, 3258 * check the full commit flag again 3259 */ 3260 if (btrfs_need_log_full_commit(trans)) { 3261 blk_finish_plug(&plug); 3262 btrfs_wait_tree_log_extents(log, mark); 3263 mutex_unlock(&log_root_tree->log_mutex); 3264 ret = BTRFS_LOG_FORCE_COMMIT; 3265 goto out_wake_log_root; 3266 } 3267 3268 ret = btrfs_write_marked_extents(fs_info, 3269 &log_root_tree->dirty_log_pages, 3270 EXTENT_DIRTY | EXTENT_NEW); 3271 blk_finish_plug(&plug); 3272 /* 3273 * As described above, -EAGAIN indicates a hole in the extents. We 3274 * cannot wait for these write outs since the waiting cause a 3275 * deadlock. Bail out to the full commit instead. 
3276 */ 3277 if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { 3278 btrfs_set_log_full_commit(trans); 3279 btrfs_wait_tree_log_extents(log, mark); 3280 mutex_unlock(&log_root_tree->log_mutex); 3281 goto out_wake_log_root; 3282 } else if (ret) { 3283 btrfs_set_log_full_commit(trans); 3284 btrfs_abort_transaction(trans, ret); 3285 mutex_unlock(&log_root_tree->log_mutex); 3286 goto out_wake_log_root; 3287 } 3288 ret = btrfs_wait_tree_log_extents(log, mark); 3289 if (!ret) 3290 ret = btrfs_wait_tree_log_extents(log_root_tree, 3291 EXTENT_NEW | EXTENT_DIRTY); 3292 if (ret) { 3293 btrfs_set_log_full_commit(trans); 3294 mutex_unlock(&log_root_tree->log_mutex); 3295 goto out_wake_log_root; 3296 } 3297 3298 log_root_start = log_root_tree->node->start; 3299 log_root_level = btrfs_header_level(log_root_tree->node); 3300 log_root_tree->log_transid++; 3301 mutex_unlock(&log_root_tree->log_mutex); 3302 3303 /* 3304 * Here we are guaranteed that nobody is going to write the superblock 3305 * for the current transaction before us and that neither we do write 3306 * our superblock before the previous transaction finishes its commit 3307 * and writes its superblock, because: 3308 * 3309 * 1) We are holding a handle on the current transaction, so no body 3310 * can commit it until we release the handle; 3311 * 3312 * 2) Before writing our superblock we acquire the tree_log_mutex, so 3313 * if the previous transaction is still committing, and hasn't yet 3314 * written its superblock, we wait for it to do it, because a 3315 * transaction commit acquires the tree_log_mutex when the commit 3316 * begins and releases it only after writing its superblock. 3317 */ 3318 mutex_lock(&fs_info->tree_log_mutex); 3319 3320 /* 3321 * The previous transaction writeout phase could have failed, and thus 3322 * marked the fs in an error state. We must not commit here, as we 3323 * could have updated our generation in the super_for_commit and 3324 * writing the super here would result in transid mismatches. If there 3325 * is an error here just bail. 3326 */ 3327 if (BTRFS_FS_ERROR(fs_info)) { 3328 ret = -EIO; 3329 btrfs_set_log_full_commit(trans); 3330 btrfs_abort_transaction(trans, ret); 3331 mutex_unlock(&fs_info->tree_log_mutex); 3332 goto out_wake_log_root; 3333 } 3334 3335 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3336 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3337 ret = write_all_supers(fs_info, 1); 3338 mutex_unlock(&fs_info->tree_log_mutex); 3339 if (ret) { 3340 btrfs_set_log_full_commit(trans); 3341 btrfs_abort_transaction(trans, ret); 3342 goto out_wake_log_root; 3343 } 3344 3345 /* 3346 * We know there can only be one task here, since we have not yet set 3347 * root->log_commit[index1] to 0 and any task attempting to sync the 3348 * log must wait for the previous log transaction to commit if it's 3349 * still in progress or wait for the current log transaction commit if 3350 * someone else already started it. We use <= and not < because the 3351 * first log transaction has an ID of 0. 
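 *
 * (For reference, the two-slot bookkeeping used throughout this function:
 * log transid N maps to slot N % 2, so at most two log commits, N and
 * N + 1, are tracked at any time, which is also what wait_log_commit()
 * relies on.)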
3352 */ 3353 ASSERT(root->last_log_commit <= log_transid); 3354 root->last_log_commit = log_transid; 3355 3356 out_wake_log_root: 3357 mutex_lock(&log_root_tree->log_mutex); 3358 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); 3359 3360 log_root_tree->log_transid_committed++; 3361 atomic_set(&log_root_tree->log_commit[index2], 0); 3362 mutex_unlock(&log_root_tree->log_mutex); 3363 3364 /* 3365 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3366 * all the updates above are seen by the woken threads. It might not be 3367 * necessary, but proving that seems to be hard. 3368 */ 3369 cond_wake_up(&log_root_tree->log_commit_wait[index2]); 3370 out: 3371 mutex_lock(&root->log_mutex); 3372 btrfs_remove_all_log_ctxs(root, index1, ret); 3373 root->log_transid_committed++; 3374 atomic_set(&root->log_commit[index1], 0); 3375 mutex_unlock(&root->log_mutex); 3376 3377 /* 3378 * The barrier before waitqueue_active (in cond_wake_up) is needed so 3379 * all the updates above are seen by the woken threads. It might not be 3380 * necessary, but proving that seems to be hard. 3381 */ 3382 cond_wake_up(&root->log_commit_wait[index1]); 3383 return ret; 3384 } 3385 3386 static void free_log_tree(struct btrfs_trans_handle *trans, 3387 struct btrfs_root *log) 3388 { 3389 int ret; 3390 struct walk_control wc = { 3391 .free = 1, 3392 .process_func = process_one_buffer 3393 }; 3394 3395 if (log->node) { 3396 ret = walk_log_tree(trans, log, &wc); 3397 if (ret) { 3398 /* 3399 * We weren't able to traverse the entire log tree, the 3400 * typical scenario is getting an -EIO when reading an 3401 * extent buffer of the tree, due to a previous writeback 3402 * failure of it. 3403 */ 3404 set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, 3405 &log->fs_info->fs_state); 3406 3407 /* 3408 * Some extent buffers of the log tree may still be dirty 3409 * and not yet written back to storage, because we may 3410 * have updates to a log tree without syncing a log tree, 3411 * such as during rename and link operations. So flush 3412 * them out and wait for their writeback to complete, so 3413 * that we properly cleanup their state and pages. 3414 */ 3415 btrfs_write_marked_extents(log->fs_info, 3416 &log->dirty_log_pages, 3417 EXTENT_DIRTY | EXTENT_NEW); 3418 btrfs_wait_tree_log_extents(log, 3419 EXTENT_DIRTY | EXTENT_NEW); 3420 3421 if (trans) 3422 btrfs_abort_transaction(trans, ret); 3423 else 3424 btrfs_handle_fs_error(log->fs_info, ret, NULL); 3425 } 3426 } 3427 3428 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, 3429 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); 3430 extent_io_tree_release(&log->log_csum_range); 3431 3432 btrfs_put_root(log); 3433 } 3434 3435 /* 3436 * free all the extents used by the tree log. This should be called 3437 * at commit time of the full transaction 3438 */ 3439 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3440 { 3441 if (root->log_root) { 3442 free_log_tree(trans, root->log_root); 3443 root->log_root = NULL; 3444 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3445 } 3446 return 0; 3447 } 3448 3449 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3450 struct btrfs_fs_info *fs_info) 3451 { 3452 if (fs_info->log_root_tree) { 3453 free_log_tree(trans, fs_info->log_root_tree); 3454 fs_info->log_root_tree = NULL; 3455 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3456 } 3457 return 0; 3458 } 3459 3460 /* 3461 * Check if an inode was logged in the current transaction. 
This correctly deals 3462 * with the case where the inode was logged but has a logged_trans of 0, which 3463 * happens if the inode is evicted and loaded again, as logged_trans is an in 3464 * memory only field (not persisted). 3465 * 3466 * Returns 1 if the inode was logged before in the current transaction, 0 if it was not, 3467 * and < 0 on error. 3468 */ 3469 static int inode_logged(struct btrfs_trans_handle *trans, 3470 struct btrfs_inode *inode, 3471 struct btrfs_path *path_in) 3472 { 3473 struct btrfs_path *path = path_in; 3474 struct btrfs_key key; 3475 int ret; 3476 3477 if (inode->logged_trans == trans->transid) 3478 return 1; 3479 3480 /* 3481 * If logged_trans is not 0, then we know the inode was not logged 3482 * in this transaction, so we can return false right away. 3483 */ 3484 if (inode->logged_trans > 0) 3485 return 0; 3486 3487 /* 3488 * If no log tree was created for this root in this transaction, then 3489 * the inode cannot have been logged in this transaction. In that case 3490 * set logged_trans to anything greater than 0 and less than the current 3491 * transaction's ID, to avoid the search below in a future call in case 3492 * a log tree gets created after this. 3493 */ 3494 if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) { 3495 inode->logged_trans = trans->transid - 1; 3496 return 0; 3497 } 3498 3499 /* 3500 * We have a log tree and the inode's logged_trans is 0. We can't tell 3501 * for sure if the inode was logged before in this transaction by looking 3502 * only at logged_trans. We could be pessimistic and assume it was, but 3503 * that can lead to unnecessarily logging an inode during rename and link 3504 * operations, and then further updating the log in followup rename and 3505 * link operations, especially if it's a directory, which adds latency 3506 * visible to applications doing a series of rename or link operations. 3507 * 3508 * A logged_trans of 0 here can mean several things: 3509 * 3510 * 1) The inode was never logged since the filesystem was mounted, and may 3511 * or may not have been evicted and loaded again; 3512 * 3513 * 2) The inode was logged in a previous transaction, then evicted and 3514 * then loaded again; 3515 * 3516 * 3) The inode was logged in the current transaction, then evicted and 3517 * then loaded again. 3518 * 3519 * For cases 1) and 2) we don't want to return true, but we need to detect 3520 * case 3) and return true. So we do a search in the log root for the inode 3521 * item. 3522 */ 3523 key.objectid = btrfs_ino(inode); 3524 key.type = BTRFS_INODE_ITEM_KEY; 3525 key.offset = 0; 3526 3527 if (!path) { 3528 path = btrfs_alloc_path(); 3529 if (!path) 3530 return -ENOMEM; 3531 } 3532 3533 ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); 3534 3535 if (path_in) 3536 btrfs_release_path(path); 3537 else 3538 btrfs_free_path(path); 3539 3540 /* 3541 * Logging an inode always results in logging its inode item. So if we 3542 * did not find the item we know the inode was not logged for sure. 3543 */ 3544 if (ret < 0) { 3545 return ret; 3546 } else if (ret > 0) { 3547 /* 3548 * Set logged_trans to a value greater than 0 and less than the 3549 * current transaction to avoid doing the search in future calls. 3550 */ 3551 inode->logged_trans = trans->transid - 1; 3552 return 0; 3553 } 3554 3555 /* 3556 * The inode was previously logged and then evicted, set logged_trans to 3557 * the current transaction's ID, to avoid future tree searches as long as 3558 * the inode is not evicted again.
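 *
 * (Summary of the search outcomes handled above and below:
 *	ret < 0   - propagate the error;
 *	ret > 0   - no inode item in the log root, so the inode was not
 *		    logged; cache that in logged_trans and return 0;
 *	ret == 0  - the inode item is in the log root, i.e. case 3) above,
 *		    so fall through, update logged_trans and return 1.)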
3559 */ 3560 inode->logged_trans = trans->transid; 3561 3562 /* 3563 * If it's a directory, then we must set last_dir_index_offset to the 3564 * maximum possible value, so that the next attempt to log the inode does 3565 * not skip checking if dir index keys found in modified subvolume tree 3566 * leaves have been logged before, otherwise it would result in attempts 3567 * to insert duplicate dir index keys in the log tree. This must be done 3568 * because last_dir_index_offset is an in-memory only field, not persisted 3569 * in the inode item or any other on-disk structure, so its value is lost 3570 * once the inode is evicted. 3571 */ 3572 if (S_ISDIR(inode->vfs_inode.i_mode)) 3573 inode->last_dir_index_offset = (u64)-1; 3574 3575 return 1; 3576 } 3577 3578 /* 3579 * Delete a directory entry from the log if it exists. 3580 * 3581 * Returns < 0 on error 3582 * 1 if the entry does not exist 3583 * 0 if the entry existed and was successfully deleted 3584 */ 3585 static int del_logged_dentry(struct btrfs_trans_handle *trans, 3586 struct btrfs_root *log, 3587 struct btrfs_path *path, 3588 u64 dir_ino, 3589 const char *name, int name_len, 3590 u64 index) 3591 { 3592 struct btrfs_dir_item *di; 3593 3594 /* 3595 * We only log dir index items of a directory, so we don't need to look 3596 * for dir item keys. 3597 */ 3598 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, 3599 index, name, name_len, -1); 3600 if (IS_ERR(di)) 3601 return PTR_ERR(di); 3602 else if (!di) 3603 return 1; 3604 3605 /* 3606 * We do not need to update the size field of the directory's 3607 * inode item because on log replay we update the field to reflect 3608 * all existing entries in the directory (see overwrite_item()). 3609 */ 3610 return btrfs_delete_one_dir_name(trans, log, path, di); 3611 } 3612 3613 /* 3614 * If both a file and directory are logged, and unlinks or renames are 3615 * mixed in, we have a few interesting corners: 3616 * 3617 * create file X in dir Y 3618 * link file X to X.link in dir Y 3619 * fsync file X 3620 * unlink file X but leave X.link 3621 * fsync dir Y 3622 * 3623 * After a crash we would expect only X.link to exist. But file X 3624 * didn't get fsync'd again so the log has back refs for X and X.link. 3625 * 3626 * We solve this by removing directory entries and inode backrefs from the 3627 * log when a file that was logged in the current transaction is 3628 * unlinked. Any later fsync will include the updated log entries, and 3629 * we'll be able to reconstruct the proper directory items from backrefs. 3630 * 3631 * This optimization allows us to avoid relogging the entire inode 3632 * or the entire directory.
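 *
 * A sketch of the expected use from the unlink and rename paths (the
 * actual callers live outside this file, so this is illustrative only):
 *
 *	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
 *	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dirid);
 *
 * Both helpers return quietly when inode_logged() reports that the
 * directory or inode was not logged in the current transaction, and both
 * force a full transaction commit if updating the log tree fails.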
3633 */ 3634 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 3635 struct btrfs_root *root, 3636 const char *name, int name_len, 3637 struct btrfs_inode *dir, u64 index) 3638 { 3639 struct btrfs_path *path; 3640 int ret; 3641 3642 ret = inode_logged(trans, dir, NULL); 3643 if (ret == 0) 3644 return; 3645 else if (ret < 0) { 3646 btrfs_set_log_full_commit(trans); 3647 return; 3648 } 3649 3650 ret = join_running_log_trans(root); 3651 if (ret) 3652 return; 3653 3654 mutex_lock(&dir->log_mutex); 3655 3656 path = btrfs_alloc_path(); 3657 if (!path) { 3658 ret = -ENOMEM; 3659 goto out_unlock; 3660 } 3661 3662 ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), 3663 name, name_len, index); 3664 btrfs_free_path(path); 3665 out_unlock: 3666 mutex_unlock(&dir->log_mutex); 3667 if (ret < 0) 3668 btrfs_set_log_full_commit(trans); 3669 btrfs_end_log_trans(root); 3670 } 3671 3672 /* see comments for btrfs_del_dir_entries_in_log */ 3673 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, 3674 struct btrfs_root *root, 3675 const char *name, int name_len, 3676 struct btrfs_inode *inode, u64 dirid) 3677 { 3678 struct btrfs_root *log; 3679 u64 index; 3680 int ret; 3681 3682 ret = inode_logged(trans, inode, NULL); 3683 if (ret == 0) 3684 return; 3685 else if (ret < 0) { 3686 btrfs_set_log_full_commit(trans); 3687 return; 3688 } 3689 3690 ret = join_running_log_trans(root); 3691 if (ret) 3692 return; 3693 log = root->log_root; 3694 mutex_lock(&inode->log_mutex); 3695 3696 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), 3697 dirid, &index); 3698 mutex_unlock(&inode->log_mutex); 3699 if (ret < 0 && ret != -ENOENT) 3700 btrfs_set_log_full_commit(trans); 3701 btrfs_end_log_trans(root); 3702 } 3703 3704 /* 3705 * creates a range item in the log for 'dirid'. first_offset and 3706 * last_offset tell us which parts of the key space the log should 3707 * be considered authoritative for. 3708 */ 3709 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, 3710 struct btrfs_root *log, 3711 struct btrfs_path *path, 3712 u64 dirid, 3713 u64 first_offset, u64 last_offset) 3714 { 3715 int ret; 3716 struct btrfs_key key; 3717 struct btrfs_dir_log_item *item; 3718 3719 key.objectid = dirid; 3720 key.offset = first_offset; 3721 key.type = BTRFS_DIR_LOG_INDEX_KEY; 3722 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 3723 /* 3724 * -EEXIST is fine and can happen sporadically when we are logging a 3725 * directory and have concurrent insertions in the subvolume's tree for 3726 * items from other inodes and that result in pushing off some dir items 3727 * from one leaf to another in order to accommodate for the new items. 3728 * This results in logging the same dir index range key. 3729 */ 3730 if (ret && ret != -EEXIST) 3731 return ret; 3732 3733 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3734 struct btrfs_dir_log_item); 3735 if (ret == -EEXIST) { 3736 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item); 3737 3738 /* 3739 * btrfs_del_dir_entries_in_log() might have been called during 3740 * an unlink between the initial insertion of this key and the 3741 * current update, or we might be logging a single entry deletion 3742 * during a rename, so set the new last_offset to the max value. 
3743 */ 3744 last_offset = max(last_offset, curr_end); 3745 } 3746 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 3747 btrfs_mark_buffer_dirty(path->nodes[0]); 3748 btrfs_release_path(path); 3749 return 0; 3750 } 3751 3752 static int flush_dir_items_batch(struct btrfs_trans_handle *trans, 3753 struct btrfs_root *log, 3754 struct extent_buffer *src, 3755 struct btrfs_path *dst_path, 3756 int start_slot, 3757 int count) 3758 { 3759 char *ins_data = NULL; 3760 struct btrfs_item_batch batch; 3761 struct extent_buffer *dst; 3762 unsigned long src_offset; 3763 unsigned long dst_offset; 3764 struct btrfs_key key; 3765 u32 item_size; 3766 int ret; 3767 int i; 3768 3769 ASSERT(count > 0); 3770 batch.nr = count; 3771 3772 if (count == 1) { 3773 btrfs_item_key_to_cpu(src, &key, start_slot); 3774 item_size = btrfs_item_size(src, start_slot); 3775 batch.keys = &key; 3776 batch.data_sizes = &item_size; 3777 batch.total_data_size = item_size; 3778 } else { 3779 struct btrfs_key *ins_keys; 3780 u32 *ins_sizes; 3781 3782 ins_data = kmalloc(count * sizeof(u32) + 3783 count * sizeof(struct btrfs_key), GFP_NOFS); 3784 if (!ins_data) 3785 return -ENOMEM; 3786 3787 ins_sizes = (u32 *)ins_data; 3788 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); 3789 batch.keys = ins_keys; 3790 batch.data_sizes = ins_sizes; 3791 batch.total_data_size = 0; 3792 3793 for (i = 0; i < count; i++) { 3794 const int slot = start_slot + i; 3795 3796 btrfs_item_key_to_cpu(src, &ins_keys[i], slot); 3797 ins_sizes[i] = btrfs_item_size(src, slot); 3798 batch.total_data_size += ins_sizes[i]; 3799 } 3800 } 3801 3802 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 3803 if (ret) 3804 goto out; 3805 3806 dst = dst_path->nodes[0]; 3807 /* 3808 * Copy all the items in bulk, in a single copy operation. Item data is 3809 * organized such that it's placed at the end of a leaf and from right 3810 * to left. For example, the data for the second item ends at an offset 3811 * that matches the offset where the data for the first item starts, the 3812 * data for the third item ends at an offset that matches the offset 3813 * where the data of the second items starts, and so on. 3814 * Therefore our source and destination start offsets for copy match the 3815 * offsets of the last items (highest slots). 
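 *
 * Roughly, for a leaf with three items the data area looks like this,
 * with offsets growing to the right and data filled in from the leaf end:
 *
 *	[ item 2 data ][ item 1 data ][ item 0 data ]<leaf end>
 *
 * which is why a single copy_extent_buffer() call, starting at the data
 * offset of the highest slot in the batch, can move all of the items'
 * data at once.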
3816 */ 3817 dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); 3818 src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); 3819 copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); 3820 btrfs_release_path(dst_path); 3821 out: 3822 kfree(ins_data); 3823 3824 return ret; 3825 } 3826 3827 static int process_dir_items_leaf(struct btrfs_trans_handle *trans, 3828 struct btrfs_inode *inode, 3829 struct btrfs_path *path, 3830 struct btrfs_path *dst_path, 3831 struct btrfs_log_ctx *ctx, 3832 u64 *last_old_dentry_offset) 3833 { 3834 struct btrfs_root *log = inode->root->log_root; 3835 struct extent_buffer *src = path->nodes[0]; 3836 const int nritems = btrfs_header_nritems(src); 3837 const u64 ino = btrfs_ino(inode); 3838 bool last_found = false; 3839 int batch_start = 0; 3840 int batch_size = 0; 3841 int i; 3842 3843 for (i = path->slots[0]; i < nritems; i++) { 3844 struct btrfs_dir_item *di; 3845 struct btrfs_key key; 3846 int ret; 3847 3848 btrfs_item_key_to_cpu(src, &key, i); 3849 3850 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) { 3851 last_found = true; 3852 break; 3853 } 3854 3855 di = btrfs_item_ptr(src, i, struct btrfs_dir_item); 3856 ctx->last_dir_item_offset = key.offset; 3857 3858 /* 3859 * Skip ranges of items that consist only of dir item keys created 3860 * in past transactions. However if we find a gap, we must log a 3861 * dir index range item for that gap, so that index keys in that 3862 * gap are deleted during log replay. 3863 */ 3864 if (btrfs_dir_transid(src, di) < trans->transid) { 3865 if (key.offset > *last_old_dentry_offset + 1) { 3866 ret = insert_dir_log_key(trans, log, dst_path, 3867 ino, *last_old_dentry_offset + 1, 3868 key.offset - 1); 3869 if (ret < 0) 3870 return ret; 3871 } 3872 3873 *last_old_dentry_offset = key.offset; 3874 continue; 3875 } 3876 /* 3877 * We must make sure that when we log a directory entry, the 3878 * corresponding inode, after log replay, has a matching link 3879 * count. For example: 3880 * 3881 * touch foo 3882 * mkdir mydir 3883 * sync 3884 * ln foo mydir/bar 3885 * xfs_io -c "fsync" mydir 3886 * <crash> 3887 * <mount fs and log replay> 3888 * 3889 * Would result in a fsync log that when replayed, our file inode 3890 * would have a link count of 1, but we get two directory entries 3891 * pointing to the same inode. After removing one of the names, 3892 * it would not be possible to remove the other name, which 3893 * resulted always in stale file handle errors, and would not be 3894 * possible to rmdir the parent directory, since its i_size could 3895 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, 3896 * resulting in -ENOTEMPTY errors. 3897 */ 3898 if (!ctx->log_new_dentries) { 3899 struct btrfs_key di_key; 3900 3901 btrfs_dir_item_key_to_cpu(src, di, &di_key); 3902 if (di_key.type != BTRFS_ROOT_ITEM_KEY) 3903 ctx->log_new_dentries = true; 3904 } 3905 3906 if (!ctx->logged_before) 3907 goto add_to_batch; 3908 3909 /* 3910 * If we were logged before and have logged dir items, we can skip 3911 * checking if any item with a key offset larger than the last one 3912 * we logged is in the log tree, saving time and avoiding adding 3913 * contention on the log tree. We can only rely on the value of 3914 * last_dir_index_offset when we know for sure that the inode was 3915 * previously logged in the current transaction. 3916 */ 3917 if (key.offset > inode->last_dir_index_offset) 3918 goto add_to_batch; 3919 /* 3920 * Check if the key was already logged before. 
If not we can add 3921 * it to a batch for bulk insertion. 3922 */ 3923 ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); 3924 if (ret < 0) { 3925 return ret; 3926 } else if (ret > 0) { 3927 btrfs_release_path(dst_path); 3928 goto add_to_batch; 3929 } 3930 3931 /* 3932 * Item exists in the log. Overwrite the item in the log if it 3933 * has different content or do nothing if it has exactly the same 3934 * content. And then flush the current batch if any - do it after 3935 * overwriting the current item, or we would deadlock otherwise, 3936 * since we are holding a path for the existing item. 3937 */ 3938 ret = do_overwrite_item(trans, log, dst_path, src, i, &key); 3939 if (ret < 0) 3940 return ret; 3941 3942 if (batch_size > 0) { 3943 ret = flush_dir_items_batch(trans, log, src, dst_path, 3944 batch_start, batch_size); 3945 if (ret < 0) 3946 return ret; 3947 batch_size = 0; 3948 } 3949 continue; 3950 add_to_batch: 3951 if (batch_size == 0) 3952 batch_start = i; 3953 batch_size++; 3954 } 3955 3956 if (batch_size > 0) { 3957 int ret; 3958 3959 ret = flush_dir_items_batch(trans, log, src, dst_path, 3960 batch_start, batch_size); 3961 if (ret < 0) 3962 return ret; 3963 } 3964 3965 return last_found ? 1 : 0; 3966 } 3967 3968 /* 3969 * log all the items included in the current transaction for a given 3970 * directory. This also creates the range items in the log tree required 3971 * to replay anything deleted before the fsync 3972 */ 3973 static noinline int log_dir_items(struct btrfs_trans_handle *trans, 3974 struct btrfs_inode *inode, 3975 struct btrfs_path *path, 3976 struct btrfs_path *dst_path, 3977 struct btrfs_log_ctx *ctx, 3978 u64 min_offset, u64 *last_offset_ret) 3979 { 3980 struct btrfs_key min_key; 3981 struct btrfs_root *root = inode->root; 3982 struct btrfs_root *log = root->log_root; 3983 int err = 0; 3984 int ret; 3985 u64 last_old_dentry_offset = min_offset - 1; 3986 u64 last_offset = (u64)-1; 3987 u64 ino = btrfs_ino(inode); 3988 3989 min_key.objectid = ino; 3990 min_key.type = BTRFS_DIR_INDEX_KEY; 3991 min_key.offset = min_offset; 3992 3993 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 3994 3995 /* 3996 * we didn't find anything from this transaction, see if there 3997 * is anything at all 3998 */ 3999 if (ret != 0 || min_key.objectid != ino || 4000 min_key.type != BTRFS_DIR_INDEX_KEY) { 4001 min_key.objectid = ino; 4002 min_key.type = BTRFS_DIR_INDEX_KEY; 4003 min_key.offset = (u64)-1; 4004 btrfs_release_path(path); 4005 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 4006 if (ret < 0) { 4007 btrfs_release_path(path); 4008 return ret; 4009 } 4010 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4011 4012 /* if ret == 0 there are items for this type, 4013 * create a range to tell us the last key of this type. 4014 * otherwise, there are no items in this directory after 4015 * *min_offset, and we create a range to indicate that. 
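 *
 * For example (offsets are illustrative): if *min_offset is 20 and the
 * directory's highest dir index key has offset 12, we end up inserting a
 * BTRFS_DIR_LOG_INDEX_KEY item covering the range [13, (u64)-1], so that
 * on log replay any dir index entries in that range which are not in the
 * log are deleted.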
4016 */ 4017 if (ret == 0) { 4018 struct btrfs_key tmp; 4019 4020 btrfs_item_key_to_cpu(path->nodes[0], &tmp, 4021 path->slots[0]); 4022 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4023 last_old_dentry_offset = tmp.offset; 4024 } 4025 goto done; 4026 } 4027 4028 /* go backward to find any previous key */ 4029 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY); 4030 if (ret == 0) { 4031 struct btrfs_key tmp; 4032 4033 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 4034 /* 4035 * The dir index key before the first one we found that needs to 4036 * be logged might be in a previous leaf, and there might be a 4037 * gap between these keys, meaning that we had deletions that 4038 * happened. So the key range item we log (key type 4039 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the 4040 * previous key's offset plus 1, so that those deletes are replayed. 4041 */ 4042 if (tmp.type == BTRFS_DIR_INDEX_KEY) 4043 last_old_dentry_offset = tmp.offset; 4044 } 4045 btrfs_release_path(path); 4046 4047 /* 4048 * Find the first key from this transaction again. See the note for 4049 * log_new_dir_dentries, if we're logging a directory recursively we 4050 * won't be holding its i_mutex, which means we can modify the directory 4051 * while we're logging it. If we remove an entry between our first 4052 * search and this search we'll not find the key again and can just 4053 * bail. 4054 */ 4055 search: 4056 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 4057 if (ret != 0) 4058 goto done; 4059 4060 /* 4061 * we have a block from this transaction, log every item in it 4062 * from our directory 4063 */ 4064 while (1) { 4065 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx, 4066 &last_old_dentry_offset); 4067 if (ret != 0) { 4068 if (ret < 0) 4069 err = ret; 4070 goto done; 4071 } 4072 path->slots[0] = btrfs_header_nritems(path->nodes[0]); 4073 4074 /* 4075 * look ahead to the next item and see if it is also 4076 * from this directory and from this transaction 4077 */ 4078 ret = btrfs_next_leaf(root, path); 4079 if (ret) { 4080 if (ret == 1) 4081 last_offset = (u64)-1; 4082 else 4083 err = ret; 4084 goto done; 4085 } 4086 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); 4087 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) { 4088 last_offset = (u64)-1; 4089 goto done; 4090 } 4091 if (btrfs_header_generation(path->nodes[0]) != trans->transid) { 4092 /* 4093 * The next leaf was not changed in the current transaction 4094 * and has at least one dir index key. 4095 * We check for the next key because there might have been 4096 * one or more deletions between the last key we logged and 4097 * that next key. So the key range item we log (key type 4098 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's 4099 * offset minus 1, so that those deletes are replayed. 
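 *
 * For example (offsets are illustrative): if the last dir index we copied
 * has offset 100 and the first index key on the next, unchanged, leaf has
 * offset 105, the range item ends at 104. Indexes 101-104 were deleted in
 * the current transaction, and capping the range at 104 lets log replay
 * delete them while leaving index 105, which still exists, untouched.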
4100 */ 4101 last_offset = min_key.offset - 1; 4102 goto done; 4103 } 4104 if (need_resched()) { 4105 btrfs_release_path(path); 4106 cond_resched(); 4107 goto search; 4108 } 4109 } 4110 done: 4111 btrfs_release_path(path); 4112 btrfs_release_path(dst_path); 4113 4114 if (err == 0) { 4115 *last_offset_ret = last_offset; 4116 /* 4117 * In case the leaf was changed in the current transaction but 4118 * all its dir items are from a past transaction, the last item 4119 * in the leaf is a dir item and there's no gap between that last 4120 * dir item and the first one on the next leaf (which did not 4121 * change in the current transaction), we don't need to log 4122 * a range, as last_old_dentry_offset is equal to last_offset. 4123 */ 4124 ASSERT(last_old_dentry_offset <= last_offset); 4125 if (last_old_dentry_offset < last_offset) { 4126 ret = insert_dir_log_key(trans, log, path, ino, 4127 last_old_dentry_offset + 1, 4128 last_offset); 4129 if (ret) 4130 err = ret; 4131 } 4132 } 4133 return err; 4134 } 4135 4136 /* 4137 * Logging directories is very similar to logging inodes. We find all the items 4138 * from the current transaction and write them to the log. 4139 * 4140 * The recovery code scans the directory in the subvolume, and if it finds a 4141 * key in the range logged that is not present in the log tree, then it means 4142 * that dir entry was unlinked during the transaction. 4143 * 4144 * In order for that scan to work, we must include one key smaller than 4145 * the smallest key logged by this transaction and one key larger than the largest 4146 * key logged by this transaction. 4147 */ 4148 static noinline int log_directory_changes(struct btrfs_trans_handle *trans, 4149 struct btrfs_inode *inode, 4150 struct btrfs_path *path, 4151 struct btrfs_path *dst_path, 4152 struct btrfs_log_ctx *ctx) 4153 { 4154 u64 min_key; 4155 u64 max_key; 4156 int ret; 4157 4158 min_key = BTRFS_DIR_START_INDEX; 4159 max_key = 0; 4160 ctx->last_dir_item_offset = inode->last_dir_index_offset; 4161 4162 while (1) { 4163 ret = log_dir_items(trans, inode, path, dst_path, 4164 ctx, min_key, &max_key); 4165 if (ret) 4166 return ret; 4167 if (max_key == (u64)-1) 4168 break; 4169 min_key = max_key + 1; 4170 } 4171 4172 inode->last_dir_index_offset = ctx->last_dir_item_offset; 4173 4174 return 0; 4175 } 4176 4177 /* 4178 * a helper function to drop items from the log before we relog an 4179 * inode. max_key_type indicates the highest item type to remove. 4180 * This cannot be run for file data extents because it does not 4181 * free the extents they point to.
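 *
 * Keys sort by (objectid, type, offset), so starting the search at
 * (ino, max_key_type, (u64)-1) and walking backwards visits every item
 * of this inode whose type is <= max_key_type; those items are removed
 * in per-leaf batches with btrfs_del_items().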
4182 */ 4183 static int drop_inode_items(struct btrfs_trans_handle *trans, 4184 struct btrfs_root *log, 4185 struct btrfs_path *path, 4186 struct btrfs_inode *inode, 4187 int max_key_type) 4188 { 4189 int ret; 4190 struct btrfs_key key; 4191 struct btrfs_key found_key; 4192 int start_slot; 4193 4194 key.objectid = btrfs_ino(inode); 4195 key.type = max_key_type; 4196 key.offset = (u64)-1; 4197 4198 while (1) { 4199 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 4200 BUG_ON(ret == 0); /* Logic error */ 4201 if (ret < 0) 4202 break; 4203 4204 if (path->slots[0] == 0) 4205 break; 4206 4207 path->slots[0]--; 4208 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 4209 path->slots[0]); 4210 4211 if (found_key.objectid != key.objectid) 4212 break; 4213 4214 found_key.offset = 0; 4215 found_key.type = 0; 4216 ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); 4217 if (ret < 0) 4218 break; 4219 4220 ret = btrfs_del_items(trans, log, path, start_slot, 4221 path->slots[0] - start_slot + 1); 4222 /* 4223 * If start slot isn't 0 then we don't need to re-search, we've 4224 * found the last guy with the objectid in this tree. 4225 */ 4226 if (ret || start_slot != 0) 4227 break; 4228 btrfs_release_path(path); 4229 } 4230 btrfs_release_path(path); 4231 if (ret > 0) 4232 ret = 0; 4233 return ret; 4234 } 4235 4236 static int truncate_inode_items(struct btrfs_trans_handle *trans, 4237 struct btrfs_root *log_root, 4238 struct btrfs_inode *inode, 4239 u64 new_size, u32 min_type) 4240 { 4241 struct btrfs_truncate_control control = { 4242 .new_size = new_size, 4243 .ino = btrfs_ino(inode), 4244 .min_type = min_type, 4245 .skip_ref_updates = true, 4246 }; 4247 4248 return btrfs_truncate_inode_items(trans, log_root, &control); 4249 } 4250 4251 static void fill_inode_item(struct btrfs_trans_handle *trans, 4252 struct extent_buffer *leaf, 4253 struct btrfs_inode_item *item, 4254 struct inode *inode, int log_inode_only, 4255 u64 logged_isize) 4256 { 4257 struct btrfs_map_token token; 4258 u64 flags; 4259 4260 btrfs_init_map_token(&token, leaf); 4261 4262 if (log_inode_only) { 4263 /* set the generation to zero so the recover code 4264 * can tell the difference between an logging 4265 * just to say 'this inode exists' and a logging 4266 * to say 'update this inode with these values' 4267 */ 4268 btrfs_set_token_inode_generation(&token, item, 0); 4269 btrfs_set_token_inode_size(&token, item, logged_isize); 4270 } else { 4271 btrfs_set_token_inode_generation(&token, item, 4272 BTRFS_I(inode)->generation); 4273 btrfs_set_token_inode_size(&token, item, inode->i_size); 4274 } 4275 4276 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); 4277 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); 4278 btrfs_set_token_inode_mode(&token, item, inode->i_mode); 4279 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); 4280 4281 btrfs_set_token_timespec_sec(&token, &item->atime, 4282 inode->i_atime.tv_sec); 4283 btrfs_set_token_timespec_nsec(&token, &item->atime, 4284 inode->i_atime.tv_nsec); 4285 4286 btrfs_set_token_timespec_sec(&token, &item->mtime, 4287 inode->i_mtime.tv_sec); 4288 btrfs_set_token_timespec_nsec(&token, &item->mtime, 4289 inode->i_mtime.tv_nsec); 4290 4291 btrfs_set_token_timespec_sec(&token, &item->ctime, 4292 inode->i_ctime.tv_sec); 4293 btrfs_set_token_timespec_nsec(&token, &item->ctime, 4294 inode->i_ctime.tv_nsec); 4295 4296 /* 4297 * We do not need to set the nbytes field, in fact during a fast fsync 4298 * its value may not even be correct, since a fast fsync 
does not wait 4299 * for ordered extent completion, which is where we update nbytes, it 4300 * only waits for writeback to complete. During log replay as we find 4301 * file extent items and replay them, we adjust the nbytes field of the 4302 * inode item in subvolume tree as needed (see overwrite_item()). 4303 */ 4304 4305 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); 4306 btrfs_set_token_inode_transid(&token, item, trans->transid); 4307 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); 4308 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4309 BTRFS_I(inode)->ro_flags); 4310 btrfs_set_token_inode_flags(&token, item, flags); 4311 btrfs_set_token_inode_block_group(&token, item, 0); 4312 } 4313 4314 static int log_inode_item(struct btrfs_trans_handle *trans, 4315 struct btrfs_root *log, struct btrfs_path *path, 4316 struct btrfs_inode *inode, bool inode_item_dropped) 4317 { 4318 struct btrfs_inode_item *inode_item; 4319 int ret; 4320 4321 /* 4322 * If we are doing a fast fsync and the inode was logged before in the 4323 * current transaction, then we know the inode was previously logged and 4324 * it exists in the log tree. For performance reasons, in this case use 4325 * btrfs_search_slot() directly with ins_len set to 0 so that we never 4326 * attempt a write lock on the leaf's parent, which adds unnecessary lock 4327 * contention in case there are concurrent fsyncs for other inodes of the 4328 * same subvolume. Using btrfs_insert_empty_item() when the inode item 4329 * already exists can also result in unnecessarily splitting a leaf. 4330 */ 4331 if (!inode_item_dropped && inode->logged_trans == trans->transid) { 4332 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); 4333 ASSERT(ret <= 0); 4334 if (ret > 0) 4335 ret = -ENOENT; 4336 } else { 4337 /* 4338 * This means it is the first fsync in the current transaction, 4339 * so the inode item is not in the log and we need to insert it. 4340 * We can never get -EEXIST because we are only called for a fast 4341 * fsync and in case an inode eviction happens after the inode was 4342 * logged before in the current transaction, when we load again 4343 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime 4344 * flags and set ->logged_trans to 0. 4345 */ 4346 ret = btrfs_insert_empty_item(trans, log, path, &inode->location, 4347 sizeof(*inode_item)); 4348 ASSERT(ret != -EEXIST); 4349 } 4350 if (ret) 4351 return ret; 4352 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4353 struct btrfs_inode_item); 4354 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4355 0, 0); 4356 btrfs_release_path(path); 4357 return 0; 4358 } 4359 4360 static int log_csums(struct btrfs_trans_handle *trans, 4361 struct btrfs_inode *inode, 4362 struct btrfs_root *log_root, 4363 struct btrfs_ordered_sum *sums) 4364 { 4365 const u64 lock_end = sums->bytenr + sums->len - 1; 4366 struct extent_state *cached_state = NULL; 4367 int ret; 4368 4369 /* 4370 * If this inode was not used for reflink operations in the current 4371 * transaction with new extents, then do the fast path, no need to 4372 * worry about logging checksum items with overlapping ranges. 4373 */ 4374 if (inode->last_reflink_trans < trans->transid) 4375 return btrfs_csum_file_blocks(trans, log_root, sums); 4376 4377 /* 4378 * Serialize logging for checksums. 
This is to avoid racing with the 4379 * same checksum being logged by another task that is logging another 4380 * file which happens to refer to the same extent as well. Such races 4381 * can leave checksum items in the log with overlapping ranges. 4382 */ 4383 ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, 4384 lock_end, &cached_state); 4385 if (ret) 4386 return ret; 4387 /* 4388 * Due to extent cloning, we might have logged a csum item that covers a 4389 * subrange of a cloned extent, and later we can end up logging a csum 4390 * item for a larger subrange of the same extent or the entire range. 4391 * This would leave csum items in the log tree that cover the same range 4392 * and break the searches for checksums in the log tree, resulting in 4393 * some checksums missing in the fs/subvolume tree. So just delete (or 4394 * trim and adjust) any existing csum items in the log for this range. 4395 */ 4396 ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); 4397 if (!ret) 4398 ret = btrfs_csum_file_blocks(trans, log_root, sums); 4399 4400 unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, 4401 &cached_state); 4402 4403 return ret; 4404 } 4405 4406 static noinline int copy_items(struct btrfs_trans_handle *trans, 4407 struct btrfs_inode *inode, 4408 struct btrfs_path *dst_path, 4409 struct btrfs_path *src_path, 4410 int start_slot, int nr, int inode_only, 4411 u64 logged_isize) 4412 { 4413 struct btrfs_root *log = inode->root->log_root; 4414 struct btrfs_file_extent_item *extent; 4415 struct extent_buffer *src = src_path->nodes[0]; 4416 int ret = 0; 4417 struct btrfs_key *ins_keys; 4418 u32 *ins_sizes; 4419 struct btrfs_item_batch batch; 4420 char *ins_data; 4421 int i; 4422 int dst_index; 4423 const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); 4424 const u64 i_size = i_size_read(&inode->vfs_inode); 4425 4426 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 4427 nr * sizeof(u32), GFP_NOFS); 4428 if (!ins_data) 4429 return -ENOMEM; 4430 4431 ins_sizes = (u32 *)ins_data; 4432 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 4433 batch.keys = ins_keys; 4434 batch.data_sizes = ins_sizes; 4435 batch.total_data_size = 0; 4436 batch.nr = 0; 4437 4438 dst_index = 0; 4439 for (i = 0; i < nr; i++) { 4440 const int src_slot = start_slot + i; 4441 struct btrfs_root *csum_root; 4442 struct btrfs_ordered_sum *sums; 4443 struct btrfs_ordered_sum *sums_next; 4444 LIST_HEAD(ordered_sums); 4445 u64 disk_bytenr; 4446 u64 disk_num_bytes; 4447 u64 extent_offset; 4448 u64 extent_num_bytes; 4449 bool is_old_extent; 4450 4451 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot); 4452 4453 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY) 4454 goto add_to_batch; 4455 4456 extent = btrfs_item_ptr(src, src_slot, 4457 struct btrfs_file_extent_item); 4458 4459 is_old_extent = (btrfs_file_extent_generation(src, extent) < 4460 trans->transid); 4461 4462 /* 4463 * Don't copy extents from past generations. That would make us 4464 * log a lot more metadata for common cases like doing only a 4465 * few random writes into a file and then fsync it for the first 4466 * time or after the full sync flag is set on the inode. We can 4467 * get leaves full of extent items, most of which are from past 4468 * generations, so we can skip them - as long as the inode has 4469 * not been the target of a reflink operation in this transaction, 4470 * as in that case it might have had file extent items with old 4471 * generations copied into it. 
We also must always log prealloc 4472 * extents that start at or beyond eof, otherwise we would lose 4473 * them on log replay. 4474 */ 4475 if (is_old_extent && 4476 ins_keys[dst_index].offset < i_size && 4477 inode->last_reflink_trans < trans->transid) 4478 continue; 4479 4480 if (skip_csum) 4481 goto add_to_batch; 4482 4483 /* Only regular extents have checksums. */ 4484 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG) 4485 goto add_to_batch; 4486 4487 /* 4488 * If it's an extent created in a past transaction, then its 4489 * checksums are already accessible from the committed csum tree, 4490 * no need to log them. 4491 */ 4492 if (is_old_extent) 4493 goto add_to_batch; 4494 4495 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent); 4496 /* If it's an explicit hole, there are no checksums. */ 4497 if (disk_bytenr == 0) 4498 goto add_to_batch; 4499 4500 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent); 4501 4502 if (btrfs_file_extent_compression(src, extent)) { 4503 extent_offset = 0; 4504 extent_num_bytes = disk_num_bytes; 4505 } else { 4506 extent_offset = btrfs_file_extent_offset(src, extent); 4507 extent_num_bytes = btrfs_file_extent_num_bytes(src, extent); 4508 } 4509 4510 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); 4511 disk_bytenr += extent_offset; 4512 ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, 4513 disk_bytenr + extent_num_bytes - 1, 4514 &ordered_sums, 0); 4515 if (ret) 4516 goto out; 4517 4518 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { 4519 if (!ret) 4520 ret = log_csums(trans, inode, log, sums); 4521 list_del(&sums->list); 4522 kfree(sums); 4523 } 4524 if (ret) 4525 goto out; 4526 4527 add_to_batch: 4528 ins_sizes[dst_index] = btrfs_item_size(src, src_slot); 4529 batch.total_data_size += ins_sizes[dst_index]; 4530 batch.nr++; 4531 dst_index++; 4532 } 4533 4534 /* 4535 * We have a leaf full of old extent items that don't need to be logged, 4536 * so we don't need to do anything. 4537 */ 4538 if (batch.nr == 0) 4539 goto out; 4540 4541 ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); 4542 if (ret) 4543 goto out; 4544 4545 dst_index = 0; 4546 for (i = 0; i < nr; i++) { 4547 const int src_slot = start_slot + i; 4548 const int dst_slot = dst_path->slots[0] + dst_index; 4549 struct btrfs_key key; 4550 unsigned long src_offset; 4551 unsigned long dst_offset; 4552 4553 /* 4554 * We're done, all the remaining items in the source leaf 4555 * correspond to old file extent items. 4556 */ 4557 if (dst_index >= batch.nr) 4558 break; 4559 4560 btrfs_item_key_to_cpu(src, &key, src_slot); 4561 4562 if (key.type != BTRFS_EXTENT_DATA_KEY) 4563 goto copy_item; 4564 4565 extent = btrfs_item_ptr(src, src_slot, 4566 struct btrfs_file_extent_item); 4567 4568 /* See the comment in the previous loop, same logic. 
*/ 4569 if (btrfs_file_extent_generation(src, extent) < trans->transid && 4570 key.offset < i_size && 4571 inode->last_reflink_trans < trans->transid) 4572 continue; 4573 4574 copy_item: 4575 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot); 4576 src_offset = btrfs_item_ptr_offset(src, src_slot); 4577 4578 if (key.type == BTRFS_INODE_ITEM_KEY) { 4579 struct btrfs_inode_item *inode_item; 4580 4581 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4582 struct btrfs_inode_item); 4583 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4584 &inode->vfs_inode, 4585 inode_only == LOG_INODE_EXISTS, 4586 logged_isize); 4587 } else { 4588 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 4589 src_offset, ins_sizes[dst_index]); 4590 } 4591 4592 dst_index++; 4593 } 4594 4595 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 4596 btrfs_release_path(dst_path); 4597 out: 4598 kfree(ins_data); 4599 4600 return ret; 4601 } 4602 4603 static int extent_cmp(void *priv, const struct list_head *a, 4604 const struct list_head *b) 4605 { 4606 const struct extent_map *em1, *em2; 4607 4608 em1 = list_entry(a, struct extent_map, list); 4609 em2 = list_entry(b, struct extent_map, list); 4610 4611 if (em1->start < em2->start) 4612 return -1; 4613 else if (em1->start > em2->start) 4614 return 1; 4615 return 0; 4616 } 4617 4618 static int log_extent_csums(struct btrfs_trans_handle *trans, 4619 struct btrfs_inode *inode, 4620 struct btrfs_root *log_root, 4621 const struct extent_map *em, 4622 struct btrfs_log_ctx *ctx) 4623 { 4624 struct btrfs_ordered_extent *ordered; 4625 struct btrfs_root *csum_root; 4626 u64 csum_offset; 4627 u64 csum_len; 4628 u64 mod_start = em->mod_start; 4629 u64 mod_len = em->mod_len; 4630 LIST_HEAD(ordered_sums); 4631 int ret = 0; 4632 4633 if (inode->flags & BTRFS_INODE_NODATASUM || 4634 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || 4635 em->block_start == EXTENT_MAP_HOLE) 4636 return 0; 4637 4638 list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { 4639 const u64 ordered_end = ordered->file_offset + ordered->num_bytes; 4640 const u64 mod_end = mod_start + mod_len; 4641 struct btrfs_ordered_sum *sums; 4642 4643 if (mod_len == 0) 4644 break; 4645 4646 if (ordered_end <= mod_start) 4647 continue; 4648 if (mod_end <= ordered->file_offset) 4649 break; 4650 4651 /* 4652 * We are going to copy all the csums on this ordered extent, so 4653 * go ahead and adjust mod_start and mod_len in case this ordered 4654 * extent has already been logged. 4655 */ 4656 if (ordered->file_offset > mod_start) { 4657 if (ordered_end >= mod_end) 4658 mod_len = ordered->file_offset - mod_start; 4659 /* 4660 * If we have this case 4661 * 4662 * |--------- logged extent ---------| 4663 * |----- ordered extent ----| 4664 * 4665 * Just don't mess with mod_start and mod_len, we'll 4666 * just end up logging more csums than we need and it 4667 * will be ok. 4668 */ 4669 } else { 4670 if (ordered_end < mod_end) { 4671 mod_len = mod_end - ordered_end; 4672 mod_start = ordered_end; 4673 } else { 4674 mod_len = 0; 4675 } 4676 } 4677 4678 /* 4679 * To keep us from looping for the above case of an ordered 4680 * extent that falls inside of the logged extent. 4681 */ 4682 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 4683 continue; 4684 4685 list_for_each_entry(sums, &ordered->list, list) { 4686 ret = log_csums(trans, inode, log_root, sums); 4687 if (ret) 4688 return ret; 4689 } 4690 } 4691 4692 /* We're done, found all csums in the ordered extents. 
*/ 4693 if (mod_len == 0) 4694 return 0; 4695 4696 /* If we're compressed we have to save the entire range of csums. */ 4697 if (em->compress_type) { 4698 csum_offset = 0; 4699 csum_len = max(em->block_len, em->orig_block_len); 4700 } else { 4701 csum_offset = mod_start - em->start; 4702 csum_len = mod_len; 4703 } 4704 4705 /* block start is already adjusted for the file extent offset. */ 4706 csum_root = btrfs_csum_root(trans->fs_info, em->block_start); 4707 ret = btrfs_lookup_csums_range(csum_root, 4708 em->block_start + csum_offset, 4709 em->block_start + csum_offset + 4710 csum_len - 1, &ordered_sums, 0); 4711 if (ret) 4712 return ret; 4713 4714 while (!list_empty(&ordered_sums)) { 4715 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 4716 struct btrfs_ordered_sum, 4717 list); 4718 if (!ret) 4719 ret = log_csums(trans, inode, log_root, sums); 4720 list_del(&sums->list); 4721 kfree(sums); 4722 } 4723 4724 return ret; 4725 } 4726 4727 static int log_one_extent(struct btrfs_trans_handle *trans, 4728 struct btrfs_inode *inode, 4729 const struct extent_map *em, 4730 struct btrfs_path *path, 4731 struct btrfs_log_ctx *ctx) 4732 { 4733 struct btrfs_drop_extents_args drop_args = { 0 }; 4734 struct btrfs_root *log = inode->root->log_root; 4735 struct btrfs_file_extent_item fi = { 0 }; 4736 struct extent_buffer *leaf; 4737 struct btrfs_key key; 4738 u64 extent_offset = em->start - em->orig_start; 4739 u64 block_len; 4740 int ret; 4741 4742 btrfs_set_stack_file_extent_generation(&fi, trans->transid); 4743 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4744 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); 4745 else 4746 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); 4747 4748 block_len = max(em->block_len, em->orig_block_len); 4749 if (em->compress_type != BTRFS_COMPRESS_NONE) { 4750 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); 4751 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4752 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { 4753 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start - 4754 extent_offset); 4755 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); 4756 } 4757 4758 btrfs_set_stack_file_extent_offset(&fi, extent_offset); 4759 btrfs_set_stack_file_extent_num_bytes(&fi, em->len); 4760 btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); 4761 btrfs_set_stack_file_extent_compression(&fi, em->compress_type); 4762 4763 ret = log_extent_csums(trans, inode, log, em, ctx); 4764 if (ret) 4765 return ret; 4766 4767 /* 4768 * If this is the first time we are logging the inode in the current 4769 * transaction, we can avoid btrfs_drop_extents(), which is expensive 4770 * because it does a deletion search, which always acquires write locks 4771 * for extent buffers at levels 2, 1 and 0. This not only wastes time 4772 * but also adds significant contention in a log tree, since log trees 4773 * are small, with a root at level 2 or 3 at most, due to their short 4774 * life span. 
4775 */ 4776 if (ctx->logged_before) { 4777 drop_args.path = path; 4778 drop_args.start = em->start; 4779 drop_args.end = em->start + em->len; 4780 drop_args.replace_extent = true; 4781 drop_args.extent_item_size = sizeof(fi); 4782 ret = btrfs_drop_extents(trans, log, inode, &drop_args); 4783 if (ret) 4784 return ret; 4785 } 4786 4787 if (!drop_args.extent_inserted) { 4788 key.objectid = btrfs_ino(inode); 4789 key.type = BTRFS_EXTENT_DATA_KEY; 4790 key.offset = em->start; 4791 4792 ret = btrfs_insert_empty_item(trans, log, path, &key, 4793 sizeof(fi)); 4794 if (ret) 4795 return ret; 4796 } 4797 leaf = path->nodes[0]; 4798 write_extent_buffer(leaf, &fi, 4799 btrfs_item_ptr_offset(leaf, path->slots[0]), 4800 sizeof(fi)); 4801 btrfs_mark_buffer_dirty(leaf); 4802 4803 btrfs_release_path(path); 4804 4805 return ret; 4806 } 4807 4808 /* 4809 * Log all prealloc extents beyond the inode's i_size to make sure we do not 4810 * lose them after doing a full/fast fsync and replaying the log. We scan the 4811 * subvolume's root instead of iterating the inode's extent map tree because 4812 * otherwise we can log incorrect extent items based on extent map conversion. 4813 * That can happen due to the fact that extent maps are merged when they 4814 * are not in the extent map tree's list of modified extents. 4815 */ 4816 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, 4817 struct btrfs_inode *inode, 4818 struct btrfs_path *path) 4819 { 4820 struct btrfs_root *root = inode->root; 4821 struct btrfs_key key; 4822 const u64 i_size = i_size_read(&inode->vfs_inode); 4823 const u64 ino = btrfs_ino(inode); 4824 struct btrfs_path *dst_path = NULL; 4825 bool dropped_extents = false; 4826 u64 truncate_offset = i_size; 4827 struct extent_buffer *leaf; 4828 int slot; 4829 int ins_nr = 0; 4830 int start_slot; 4831 int ret; 4832 4833 if (!(inode->flags & BTRFS_INODE_PREALLOC)) 4834 return 0; 4835 4836 key.objectid = ino; 4837 key.type = BTRFS_EXTENT_DATA_KEY; 4838 key.offset = i_size; 4839 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4840 if (ret < 0) 4841 goto out; 4842 4843 /* 4844 * We must check if there is a prealloc extent that starts before the 4845 * i_size and crosses the i_size boundary. This is to ensure later we 4846 * truncate down to the end of that extent and not to the i_size, as 4847 * otherwise we end up losing part of the prealloc extent after a log 4848 * replay and with an implicit hole if there is another prealloc extent 4849 * that starts at an offset beyond i_size. 
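 *
 * For example (sizes are illustrative): with an i_size of 90K, a prealloc
 * extent covering [80K, 120K) and another one at [200K, 256K), the truncate
 * offset must be 120K. Truncating the log at 90K would drop the [90K, 120K)
 * part of the first extent from the log and, with the extent at 200K still
 * logged, leave an implicit hole in that range after log replay.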
4850 */ 4851 ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY); 4852 if (ret < 0) 4853 goto out; 4854 4855 if (ret == 0) { 4856 struct btrfs_file_extent_item *ei; 4857 4858 leaf = path->nodes[0]; 4859 slot = path->slots[0]; 4860 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 4861 4862 if (btrfs_file_extent_type(leaf, ei) == 4863 BTRFS_FILE_EXTENT_PREALLOC) { 4864 u64 extent_end; 4865 4866 btrfs_item_key_to_cpu(leaf, &key, slot); 4867 extent_end = key.offset + 4868 btrfs_file_extent_num_bytes(leaf, ei); 4869 4870 if (extent_end > i_size) 4871 truncate_offset = extent_end; 4872 } 4873 } else { 4874 ret = 0; 4875 } 4876 4877 while (true) { 4878 leaf = path->nodes[0]; 4879 slot = path->slots[0]; 4880 4881 if (slot >= btrfs_header_nritems(leaf)) { 4882 if (ins_nr > 0) { 4883 ret = copy_items(trans, inode, dst_path, path, 4884 start_slot, ins_nr, 1, 0); 4885 if (ret < 0) 4886 goto out; 4887 ins_nr = 0; 4888 } 4889 ret = btrfs_next_leaf(root, path); 4890 if (ret < 0) 4891 goto out; 4892 if (ret > 0) { 4893 ret = 0; 4894 break; 4895 } 4896 continue; 4897 } 4898 4899 btrfs_item_key_to_cpu(leaf, &key, slot); 4900 if (key.objectid > ino) 4901 break; 4902 if (WARN_ON_ONCE(key.objectid < ino) || 4903 key.type < BTRFS_EXTENT_DATA_KEY || 4904 key.offset < i_size) { 4905 path->slots[0]++; 4906 continue; 4907 } 4908 if (!dropped_extents) { 4909 /* 4910 * Avoid logging extent items logged in past fsync calls 4911 * and leading to duplicate keys in the log tree. 4912 */ 4913 ret = truncate_inode_items(trans, root->log_root, inode, 4914 truncate_offset, 4915 BTRFS_EXTENT_DATA_KEY); 4916 if (ret) 4917 goto out; 4918 dropped_extents = true; 4919 } 4920 if (ins_nr == 0) 4921 start_slot = slot; 4922 ins_nr++; 4923 path->slots[0]++; 4924 if (!dst_path) { 4925 dst_path = btrfs_alloc_path(); 4926 if (!dst_path) { 4927 ret = -ENOMEM; 4928 goto out; 4929 } 4930 } 4931 } 4932 if (ins_nr > 0) 4933 ret = copy_items(trans, inode, dst_path, path, 4934 start_slot, ins_nr, 1, 0); 4935 out: 4936 btrfs_release_path(path); 4937 btrfs_free_path(dst_path); 4938 return ret; 4939 } 4940 4941 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 4942 struct btrfs_inode *inode, 4943 struct btrfs_path *path, 4944 struct btrfs_log_ctx *ctx) 4945 { 4946 struct btrfs_ordered_extent *ordered; 4947 struct btrfs_ordered_extent *tmp; 4948 struct extent_map *em, *n; 4949 struct list_head extents; 4950 struct extent_map_tree *tree = &inode->extent_tree; 4951 int ret = 0; 4952 int num = 0; 4953 4954 INIT_LIST_HEAD(&extents); 4955 4956 write_lock(&tree->lock); 4957 4958 list_for_each_entry_safe(em, n, &tree->modified_extents, list) { 4959 list_del_init(&em->list); 4960 /* 4961 * Just an arbitrary number, this can be really CPU intensive 4962 * once we start getting a lot of extents, and really once we 4963 * have a bunch of extents we just want to commit since it will 4964 * be faster. 4965 */ 4966 if (++num > 32768) { 4967 list_del_init(&tree->modified_extents); 4968 ret = -EFBIG; 4969 goto process; 4970 } 4971 4972 if (em->generation < trans->transid) 4973 continue; 4974 4975 /* We log prealloc extents beyond eof later. 
*/ 4976 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && 4977 em->start >= i_size_read(&inode->vfs_inode)) 4978 continue; 4979 4980 /* Need a ref to keep it from getting evicted from cache */ 4981 refcount_inc(&em->refs); 4982 set_bit(EXTENT_FLAG_LOGGING, &em->flags); 4983 list_add_tail(&em->list, &extents); 4984 num++; 4985 } 4986 4987 list_sort(NULL, &extents, extent_cmp); 4988 process: 4989 while (!list_empty(&extents)) { 4990 em = list_entry(extents.next, struct extent_map, list); 4991 4992 list_del_init(&em->list); 4993 4994 /* 4995 * If we had an error we just need to delete everybody from our 4996 * private list. 4997 */ 4998 if (ret) { 4999 clear_em_logging(tree, em); 5000 free_extent_map(em); 5001 continue; 5002 } 5003 5004 write_unlock(&tree->lock); 5005 5006 ret = log_one_extent(trans, inode, em, path, ctx); 5007 write_lock(&tree->lock); 5008 clear_em_logging(tree, em); 5009 free_extent_map(em); 5010 } 5011 WARN_ON(!list_empty(&extents)); 5012 write_unlock(&tree->lock); 5013 5014 if (!ret) 5015 ret = btrfs_log_prealloc_extents(trans, inode, path); 5016 if (ret) 5017 return ret; 5018 5019 /* 5020 * We have logged all extents successfully, now make sure the commit of 5021 * the current transaction waits for the ordered extents to complete 5022 * before it commits and wipes out the log trees, otherwise we would 5023 * lose data if an ordered extent completes after the transaction 5024 * commits and a power failure happens after the transaction commit. 5025 */ 5026 list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { 5027 list_del_init(&ordered->log_list); 5028 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); 5029 5030 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 5031 spin_lock_irq(&inode->ordered_tree.lock); 5032 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { 5033 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); 5034 atomic_inc(&trans->transaction->pending_ordered); 5035 } 5036 spin_unlock_irq(&inode->ordered_tree.lock); 5037 } 5038 btrfs_put_ordered_extent(ordered); 5039 } 5040 5041 return 0; 5042 } 5043 5044 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, 5045 struct btrfs_path *path, u64 *size_ret) 5046 { 5047 struct btrfs_key key; 5048 int ret; 5049 5050 key.objectid = btrfs_ino(inode); 5051 key.type = BTRFS_INODE_ITEM_KEY; 5052 key.offset = 0; 5053 5054 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); 5055 if (ret < 0) { 5056 return ret; 5057 } else if (ret > 0) { 5058 *size_ret = 0; 5059 } else { 5060 struct btrfs_inode_item *item; 5061 5062 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5063 struct btrfs_inode_item); 5064 *size_ret = btrfs_inode_size(path->nodes[0], item); 5065 /* 5066 * If the in-memory inode's i_size is smaller than the inode 5067 * size stored in the btree, return the inode's i_size, so 5068 * that we get a correct inode size after replaying the log 5069 * when before a power failure we had a shrinking truncate 5070 * followed by addition of a new name (rename / new hard link). 5071 * Otherwise return the inode size from the btree, to avoid 5072 * data loss when replaying a log due to previously doing a 5073 * write that expands the inode's size and logging a new name 5074 * immediately after. 5075 */ 5076 if (*size_ret > inode->vfs_inode.i_size) 5077 *size_ret = inode->vfs_inode.i_size; 5078 } 5079 5080 btrfs_release_path(path); 5081 return 0; 5082 } 5083 5084 /* 5085 * At the moment we always log all xattrs.
This is to figure out at log replay 5086 * time which xattrs must have their deletion replayed. If a xattr is missing 5087 * in the log tree and exists in the fs/subvol tree, we delete it. This is 5088 * because if a xattr is deleted, the inode is fsynced and a power failure 5089 * happens, causing the log to be replayed the next time the fs is mounted, 5090 * we want the xattr to not exist anymore (same behaviour as other filesystems 5091 * with a journal, ext3/4, xfs, f2fs, etc). 5092 */ 5093 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, 5094 struct btrfs_inode *inode, 5095 struct btrfs_path *path, 5096 struct btrfs_path *dst_path) 5097 { 5098 struct btrfs_root *root = inode->root; 5099 int ret; 5100 struct btrfs_key key; 5101 const u64 ino = btrfs_ino(inode); 5102 int ins_nr = 0; 5103 int start_slot = 0; 5104 bool found_xattrs = false; 5105 5106 if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags)) 5107 return 0; 5108 5109 key.objectid = ino; 5110 key.type = BTRFS_XATTR_ITEM_KEY; 5111 key.offset = 0; 5112 5113 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5114 if (ret < 0) 5115 return ret; 5116 5117 while (true) { 5118 int slot = path->slots[0]; 5119 struct extent_buffer *leaf = path->nodes[0]; 5120 int nritems = btrfs_header_nritems(leaf); 5121 5122 if (slot >= nritems) { 5123 if (ins_nr > 0) { 5124 ret = copy_items(trans, inode, dst_path, path, 5125 start_slot, ins_nr, 1, 0); 5126 if (ret < 0) 5127 return ret; 5128 ins_nr = 0; 5129 } 5130 ret = btrfs_next_leaf(root, path); 5131 if (ret < 0) 5132 return ret; 5133 else if (ret > 0) 5134 break; 5135 continue; 5136 } 5137 5138 btrfs_item_key_to_cpu(leaf, &key, slot); 5139 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) 5140 break; 5141 5142 if (ins_nr == 0) 5143 start_slot = slot; 5144 ins_nr++; 5145 path->slots[0]++; 5146 found_xattrs = true; 5147 cond_resched(); 5148 } 5149 if (ins_nr > 0) { 5150 ret = copy_items(trans, inode, dst_path, path, 5151 start_slot, ins_nr, 1, 0); 5152 if (ret < 0) 5153 return ret; 5154 } 5155 5156 if (!found_xattrs) 5157 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags); 5158 5159 return 0; 5160 } 5161 5162 /* 5163 * When using the NO_HOLES feature if we punched a hole that causes the 5164 * deletion of entire leafs or all the extent items of the first leaf (the one 5165 * that contains the inode item and references) we may end up not processing 5166 * any extents, because there are no leafs with a generation matching the 5167 * current transaction that have extent items for our inode. So we need to find 5168 * if any holes exist and then log them. We also need to log holes after any 5169 * truncate operation that changes the inode's size. 
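 *
 * For example (ranges are illustrative): a file with extents at [0, 256K)
 * and [512K, 768K) where the [256K, 512K) range was just punched out. With
 * NO_HOLES there is no file extent item describing that range, so we detect
 * the gap between the two extent items and insert an explicit file extent
 * item with a disk_bytenr of 0 for it in the log, making log replay
 * recreate the hole.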
5170 */ 5171 static int btrfs_log_holes(struct btrfs_trans_handle *trans, 5172 struct btrfs_inode *inode, 5173 struct btrfs_path *path) 5174 { 5175 struct btrfs_root *root = inode->root; 5176 struct btrfs_fs_info *fs_info = root->fs_info; 5177 struct btrfs_key key; 5178 const u64 ino = btrfs_ino(inode); 5179 const u64 i_size = i_size_read(&inode->vfs_inode); 5180 u64 prev_extent_end = 0; 5181 int ret; 5182 5183 if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) 5184 return 0; 5185 5186 key.objectid = ino; 5187 key.type = BTRFS_EXTENT_DATA_KEY; 5188 key.offset = 0; 5189 5190 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5191 if (ret < 0) 5192 return ret; 5193 5194 while (true) { 5195 struct extent_buffer *leaf = path->nodes[0]; 5196 5197 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 5198 ret = btrfs_next_leaf(root, path); 5199 if (ret < 0) 5200 return ret; 5201 if (ret > 0) { 5202 ret = 0; 5203 break; 5204 } 5205 leaf = path->nodes[0]; 5206 } 5207 5208 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 5209 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 5210 break; 5211 5212 /* We have a hole, log it. */ 5213 if (prev_extent_end < key.offset) { 5214 const u64 hole_len = key.offset - prev_extent_end; 5215 5216 /* 5217 * Release the path to avoid deadlocks with other code 5218 * paths that search the root while holding locks on 5219 * leafs from the log root. 5220 */ 5221 btrfs_release_path(path); 5222 ret = btrfs_insert_file_extent(trans, root->log_root, 5223 ino, prev_extent_end, 0, 5224 0, hole_len, 0, hole_len, 5225 0, 0, 0); 5226 if (ret < 0) 5227 return ret; 5228 5229 /* 5230 * Search for the same key again in the root. Since it's 5231 * an extent item and we are holding the inode lock, the 5232 * key must still exist. If it doesn't just emit warning 5233 * and return an error to fall back to a transaction 5234 * commit. 5235 */ 5236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5237 if (ret < 0) 5238 return ret; 5239 if (WARN_ON(ret > 0)) 5240 return -ENOENT; 5241 leaf = path->nodes[0]; 5242 } 5243 5244 prev_extent_end = btrfs_file_extent_end(path); 5245 path->slots[0]++; 5246 cond_resched(); 5247 } 5248 5249 if (prev_extent_end < i_size) { 5250 u64 hole_len; 5251 5252 btrfs_release_path(path); 5253 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); 5254 ret = btrfs_insert_file_extent(trans, root->log_root, 5255 ino, prev_extent_end, 0, 0, 5256 hole_len, 0, hole_len, 5257 0, 0, 0); 5258 if (ret < 0) 5259 return ret; 5260 } 5261 5262 return 0; 5263 } 5264 5265 /* 5266 * When we are logging a new inode X, check if it doesn't have a reference that 5267 * matches the reference from some other inode Y created in a past transaction 5268 * and that was renamed in the current transaction. If we don't do this, then at 5269 * log replay time we can lose inode Y (and all its files if it's a directory): 5270 * 5271 * mkdir /mnt/x 5272 * echo "hello world" > /mnt/x/foobar 5273 * sync 5274 * mv /mnt/x /mnt/y 5275 * mkdir /mnt/x # or touch /mnt/x 5276 * xfs_io -c fsync /mnt/x 5277 * <power fail> 5278 * mount fs, trigger log replay 5279 * 5280 * After the log replay procedure, we would lose the first directory and all its 5281 * files (file foobar). 
5282 * For the case where inode Y is not a directory we simply end up losing it: 5283 * 5284 * echo "123" > /mnt/foo 5285 * sync 5286 * mv /mnt/foo /mnt/bar 5287 * echo "abc" > /mnt/foo 5288 * xfs_io -c fsync /mnt/foo 5289 * <power fail> 5290 * 5291 * We also need this for cases where a snapshot entry is replaced by some other 5292 * entry (file or directory) otherwise we end up with an unreplayable log due to 5293 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as 5294 * if it were a regular entry: 5295 * 5296 * mkdir /mnt/x 5297 * btrfs subvolume snapshot /mnt /mnt/x/snap 5298 * btrfs subvolume delete /mnt/x/snap 5299 * rmdir /mnt/x 5300 * mkdir /mnt/x 5301 * fsync /mnt/x or fsync some new file inside it 5302 * <power fail> 5303 * 5304 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in 5305 * the same transaction. 5306 */ 5307 static int btrfs_check_ref_name_override(struct extent_buffer *eb, 5308 const int slot, 5309 const struct btrfs_key *key, 5310 struct btrfs_inode *inode, 5311 u64 *other_ino, u64 *other_parent) 5312 { 5313 int ret; 5314 struct btrfs_path *search_path; 5315 char *name = NULL; 5316 u32 name_len = 0; 5317 u32 item_size = btrfs_item_size(eb, slot); 5318 u32 cur_offset = 0; 5319 unsigned long ptr = btrfs_item_ptr_offset(eb, slot); 5320 5321 search_path = btrfs_alloc_path(); 5322 if (!search_path) 5323 return -ENOMEM; 5324 search_path->search_commit_root = 1; 5325 search_path->skip_locking = 1; 5326 5327 while (cur_offset < item_size) { 5328 u64 parent; 5329 u32 this_name_len; 5330 u32 this_len; 5331 unsigned long name_ptr; 5332 struct btrfs_dir_item *di; 5333 5334 if (key->type == BTRFS_INODE_REF_KEY) { 5335 struct btrfs_inode_ref *iref; 5336 5337 iref = (struct btrfs_inode_ref *)(ptr + cur_offset); 5338 parent = key->offset; 5339 this_name_len = btrfs_inode_ref_name_len(eb, iref); 5340 name_ptr = (unsigned long)(iref + 1); 5341 this_len = sizeof(*iref) + this_name_len; 5342 } else { 5343 struct btrfs_inode_extref *extref; 5344 5345 extref = (struct btrfs_inode_extref *)(ptr + 5346 cur_offset); 5347 parent = btrfs_inode_extref_parent(eb, extref); 5348 this_name_len = btrfs_inode_extref_name_len(eb, extref); 5349 name_ptr = (unsigned long)&extref->name; 5350 this_len = sizeof(*extref) + this_name_len; 5351 } 5352 5353 if (this_name_len > name_len) { 5354 char *new_name; 5355 5356 new_name = krealloc(name, this_name_len, GFP_NOFS); 5357 if (!new_name) { 5358 ret = -ENOMEM; 5359 goto out; 5360 } 5361 name_len = this_name_len; 5362 name = new_name; 5363 } 5364 5365 read_extent_buffer(eb, name, name_ptr, this_name_len); 5366 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5367 parent, name, this_name_len, 0); 5368 if (di && !IS_ERR(di)) { 5369 struct btrfs_key di_key; 5370 5371 btrfs_dir_item_key_to_cpu(search_path->nodes[0], 5372 di, &di_key); 5373 if (di_key.type == BTRFS_INODE_ITEM_KEY) { 5374 if (di_key.objectid != key->objectid) { 5375 ret = 1; 5376 *other_ino = di_key.objectid; 5377 *other_parent = parent; 5378 } else { 5379 ret = 0; 5380 } 5381 } else { 5382 ret = -EAGAIN; 5383 } 5384 goto out; 5385 } else if (IS_ERR(di)) { 5386 ret = PTR_ERR(di); 5387 goto out; 5388 } 5389 btrfs_release_path(search_path); 5390 5391 cur_offset += this_len; 5392 } 5393 ret = 0; 5394 out: 5395 btrfs_free_path(search_path); 5396 kfree(name); 5397 return ret; 5398 } 5399 5400 struct btrfs_ino_list { 5401 u64 ino; 5402 u64 parent; 5403 struct list_head list; 5404 }; 5405 5406 static int log_conflicting_inodes(struct 
btrfs_trans_handle *trans, 5407 struct btrfs_root *root, 5408 struct btrfs_path *path, 5409 struct btrfs_log_ctx *ctx, 5410 u64 ino, u64 parent) 5411 { 5412 struct btrfs_ino_list *ino_elem; 5413 LIST_HEAD(inode_list); 5414 int ret = 0; 5415 5416 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5417 if (!ino_elem) 5418 return -ENOMEM; 5419 ino_elem->ino = ino; 5420 ino_elem->parent = parent; 5421 list_add_tail(&ino_elem->list, &inode_list); 5422 5423 while (!list_empty(&inode_list)) { 5424 struct btrfs_fs_info *fs_info = root->fs_info; 5425 struct btrfs_key key; 5426 struct inode *inode; 5427 5428 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list, 5429 list); 5430 ino = ino_elem->ino; 5431 parent = ino_elem->parent; 5432 list_del(&ino_elem->list); 5433 kfree(ino_elem); 5434 if (ret) 5435 continue; 5436 5437 btrfs_release_path(path); 5438 5439 inode = btrfs_iget(fs_info->sb, ino, root); 5440 /* 5441 * If the other inode that had a conflicting dir entry was 5442 * deleted in the current transaction, we need to log its parent 5443 * directory. 5444 */ 5445 if (IS_ERR(inode)) { 5446 ret = PTR_ERR(inode); 5447 if (ret == -ENOENT) { 5448 inode = btrfs_iget(fs_info->sb, parent, root); 5449 if (IS_ERR(inode)) { 5450 ret = PTR_ERR(inode); 5451 } else { 5452 ret = btrfs_log_inode(trans, 5453 BTRFS_I(inode), 5454 LOG_OTHER_INODE_ALL, 5455 ctx); 5456 btrfs_add_delayed_iput(inode); 5457 } 5458 } 5459 continue; 5460 } 5461 /* 5462 * If the inode was already logged skip it - otherwise we can 5463 * hit an infinite loop. Example: 5464 * 5465 * From the commit root (previous transaction) we have the 5466 * following inodes: 5467 * 5468 * inode 257 a directory 5469 * inode 258 with references "zz" and "zz_link" on inode 257 5470 * inode 259 with reference "a" on inode 257 5471 * 5472 * And in the current (uncommitted) transaction we have: 5473 * 5474 * inode 257 a directory, unchanged 5475 * inode 258 with references "a" and "a2" on inode 257 5476 * inode 259 with reference "zz_link" on inode 257 5477 * inode 261 with reference "zz" on inode 257 5478 * 5479 * When logging inode 261 the following infinite loop could 5480 * happen if we don't skip already logged inodes: 5481 * 5482 * - we detect inode 258 as a conflicting inode, with inode 261 5483 * on reference "zz", and log it; 5484 * 5485 * - we detect inode 259 as a conflicting inode, with inode 258 5486 * on reference "a", and log it; 5487 * 5488 * - we detect inode 258 as a conflicting inode, with inode 259 5489 * on reference "zz_link", and log it - again! After this we 5490 * repeat the above steps forever. 5491 */ 5492 spin_lock(&BTRFS_I(inode)->lock); 5493 /* 5494 * Check the inode's logged_trans only instead of 5495 * btrfs_inode_in_log(). This is because the last_log_commit of 5496 * the inode is not updated when we only log that it exists (see 5497 * btrfs_log_inode()). 5498 */ 5499 if (BTRFS_I(inode)->logged_trans == trans->transid) { 5500 spin_unlock(&BTRFS_I(inode)->lock); 5501 btrfs_add_delayed_iput(inode); 5502 continue; 5503 } 5504 spin_unlock(&BTRFS_I(inode)->lock); 5505 /* 5506 * We are safe logging the other inode without acquiring its 5507 * lock as long as we log with the LOG_INODE_EXISTS mode. We 5508 * are safe against concurrent renames of the other inode as 5509 * well because during a rename we pin the log and update the 5510 * log with the new name before we unpin it. 
5511 */ 5512 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); 5513 if (ret) { 5514 btrfs_add_delayed_iput(inode); 5515 continue; 5516 } 5517 5518 key.objectid = ino; 5519 key.type = BTRFS_INODE_REF_KEY; 5520 key.offset = 0; 5521 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 5522 if (ret < 0) { 5523 btrfs_add_delayed_iput(inode); 5524 continue; 5525 } 5526 5527 while (true) { 5528 struct extent_buffer *leaf = path->nodes[0]; 5529 int slot = path->slots[0]; 5530 u64 other_ino = 0; 5531 u64 other_parent = 0; 5532 5533 if (slot >= btrfs_header_nritems(leaf)) { 5534 ret = btrfs_next_leaf(root, path); 5535 if (ret < 0) { 5536 break; 5537 } else if (ret > 0) { 5538 ret = 0; 5539 break; 5540 } 5541 continue; 5542 } 5543 5544 btrfs_item_key_to_cpu(leaf, &key, slot); 5545 if (key.objectid != ino || 5546 (key.type != BTRFS_INODE_REF_KEY && 5547 key.type != BTRFS_INODE_EXTREF_KEY)) { 5548 ret = 0; 5549 break; 5550 } 5551 5552 ret = btrfs_check_ref_name_override(leaf, slot, &key, 5553 BTRFS_I(inode), &other_ino, 5554 &other_parent); 5555 if (ret < 0) 5556 break; 5557 if (ret > 0) { 5558 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); 5559 if (!ino_elem) { 5560 ret = -ENOMEM; 5561 break; 5562 } 5563 ino_elem->ino = other_ino; 5564 ino_elem->parent = other_parent; 5565 list_add_tail(&ino_elem->list, &inode_list); 5566 ret = 0; 5567 } 5568 path->slots[0]++; 5569 } 5570 btrfs_add_delayed_iput(inode); 5571 } 5572 5573 return ret; 5574 } 5575 5576 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, 5577 struct btrfs_inode *inode, 5578 struct btrfs_key *min_key, 5579 const struct btrfs_key *max_key, 5580 struct btrfs_path *path, 5581 struct btrfs_path *dst_path, 5582 const u64 logged_isize, 5583 const bool recursive_logging, 5584 const int inode_only, 5585 struct btrfs_log_ctx *ctx, 5586 bool *need_log_inode_item) 5587 { 5588 const u64 i_size = i_size_read(&inode->vfs_inode); 5589 struct btrfs_root *root = inode->root; 5590 int ins_start_slot = 0; 5591 int ins_nr = 0; 5592 int ret; 5593 5594 while (1) { 5595 ret = btrfs_search_forward(root, min_key, path, trans->transid); 5596 if (ret < 0) 5597 return ret; 5598 if (ret > 0) { 5599 ret = 0; 5600 break; 5601 } 5602 again: 5603 /* Note, ins_nr might be > 0 here, cleanup outside the loop */ 5604 if (min_key->objectid != max_key->objectid) 5605 break; 5606 if (min_key->type > max_key->type) 5607 break; 5608 5609 if (min_key->type == BTRFS_INODE_ITEM_KEY) { 5610 *need_log_inode_item = false; 5611 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY && 5612 min_key->offset >= i_size) { 5613 /* 5614 * Extents at and beyond eof are logged with 5615 * btrfs_log_prealloc_extents(). 5616 * Only regular files have BTRFS_EXTENT_DATA_KEY keys, 5617 * and no keys greater than that, so bail out. 
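 *
 * (Key type values for reference: BTRFS_INODE_ITEM_KEY is 1,
 * BTRFS_INODE_REF_KEY 12, BTRFS_INODE_EXTREF_KEY 13, BTRFS_XATTR_ITEM_KEY 24
 * and BTRFS_EXTENT_DATA_KEY 108, so extent data items always sort last
 * among a regular file's items.)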
5618 */ 5619 break; 5620 } else if ((min_key->type == BTRFS_INODE_REF_KEY || 5621 min_key->type == BTRFS_INODE_EXTREF_KEY) && 5622 inode->generation == trans->transid && 5623 !recursive_logging) { 5624 u64 other_ino = 0; 5625 u64 other_parent = 0; 5626 5627 ret = btrfs_check_ref_name_override(path->nodes[0], 5628 path->slots[0], min_key, inode, 5629 &other_ino, &other_parent); 5630 if (ret < 0) { 5631 return ret; 5632 } else if (ret > 0 && 5633 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { 5634 if (ins_nr > 0) { 5635 ins_nr++; 5636 } else { 5637 ins_nr = 1; 5638 ins_start_slot = path->slots[0]; 5639 } 5640 ret = copy_items(trans, inode, dst_path, path, 5641 ins_start_slot, ins_nr, 5642 inode_only, logged_isize); 5643 if (ret < 0) 5644 return ret; 5645 ins_nr = 0; 5646 5647 ret = log_conflicting_inodes(trans, root, path, 5648 ctx, other_ino, other_parent); 5649 if (ret) 5650 return ret; 5651 btrfs_release_path(path); 5652 goto next_key; 5653 } 5654 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) { 5655 /* Skip xattrs, logged later with btrfs_log_all_xattrs() */ 5656 if (ins_nr == 0) 5657 goto next_slot; 5658 ret = copy_items(trans, inode, dst_path, path, 5659 ins_start_slot, 5660 ins_nr, inode_only, logged_isize); 5661 if (ret < 0) 5662 return ret; 5663 ins_nr = 0; 5664 goto next_slot; 5665 } 5666 5667 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 5668 ins_nr++; 5669 goto next_slot; 5670 } else if (!ins_nr) { 5671 ins_start_slot = path->slots[0]; 5672 ins_nr = 1; 5673 goto next_slot; 5674 } 5675 5676 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5677 ins_nr, inode_only, logged_isize); 5678 if (ret < 0) 5679 return ret; 5680 ins_nr = 1; 5681 ins_start_slot = path->slots[0]; 5682 next_slot: 5683 path->slots[0]++; 5684 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 5685 btrfs_item_key_to_cpu(path->nodes[0], min_key, 5686 path->slots[0]); 5687 goto again; 5688 } 5689 if (ins_nr) { 5690 ret = copy_items(trans, inode, dst_path, path, 5691 ins_start_slot, ins_nr, inode_only, 5692 logged_isize); 5693 if (ret < 0) 5694 return ret; 5695 ins_nr = 0; 5696 } 5697 btrfs_release_path(path); 5698 next_key: 5699 if (min_key->offset < (u64)-1) { 5700 min_key->offset++; 5701 } else if (min_key->type < max_key->type) { 5702 min_key->type++; 5703 min_key->offset = 0; 5704 } else { 5705 break; 5706 } 5707 5708 /* 5709 * We may process many leaves full of items for our inode, so 5710 * avoid monopolizing a cpu for too long by rescheduling while 5711 * not holding locks on any tree. 5712 */ 5713 cond_resched(); 5714 } 5715 if (ins_nr) { 5716 ret = copy_items(trans, inode, dst_path, path, ins_start_slot, 5717 ins_nr, inode_only, logged_isize); 5718 if (ret) 5719 return ret; 5720 } 5721 5722 if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) { 5723 /* 5724 * Release the path because otherwise we might attempt to double 5725 * lock the same leaf with btrfs_log_prealloc_extents() below. 5726 */ 5727 btrfs_release_path(path); 5728 ret = btrfs_log_prealloc_extents(trans, inode, dst_path); 5729 } 5730 5731 return ret; 5732 } 5733 5734 /* log a single inode in the tree log. 5735 * At least one parent directory for this inode must exist in the tree 5736 * or be logged already. 5737 * 5738 * Any items from this inode changed by the current transaction are copied 5739 * to the log tree. 
An extra reference is taken on any extents in this 5740 * file, allowing us to avoid a whole pile of corner cases around logging 5741 * blocks that have been removed from the tree. 5742 * 5743 * See LOG_INODE_ALL and related defines for a description of what inode_only 5744 * does. 5745 * 5746 * This handles both files and directories. 5747 */ 5748 static int btrfs_log_inode(struct btrfs_trans_handle *trans, 5749 struct btrfs_inode *inode, 5750 int inode_only, 5751 struct btrfs_log_ctx *ctx) 5752 { 5753 struct btrfs_path *path; 5754 struct btrfs_path *dst_path; 5755 struct btrfs_key min_key; 5756 struct btrfs_key max_key; 5757 struct btrfs_root *log = inode->root->log_root; 5758 int ret; 5759 bool fast_search = false; 5760 u64 ino = btrfs_ino(inode); 5761 struct extent_map_tree *em_tree = &inode->extent_tree; 5762 u64 logged_isize = 0; 5763 bool need_log_inode_item = true; 5764 bool xattrs_logged = false; 5765 bool recursive_logging = false; 5766 bool inode_item_dropped = true; 5767 const bool orig_logged_before = ctx->logged_before; 5768 5769 path = btrfs_alloc_path(); 5770 if (!path) 5771 return -ENOMEM; 5772 dst_path = btrfs_alloc_path(); 5773 if (!dst_path) { 5774 btrfs_free_path(path); 5775 return -ENOMEM; 5776 } 5777 5778 min_key.objectid = ino; 5779 min_key.type = BTRFS_INODE_ITEM_KEY; 5780 min_key.offset = 0; 5781 5782 max_key.objectid = ino; 5783 5784 5785 /* today the code can only do partial logging of directories */ 5786 if (S_ISDIR(inode->vfs_inode.i_mode) || 5787 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5788 &inode->runtime_flags) && 5789 inode_only >= LOG_INODE_EXISTS)) 5790 max_key.type = BTRFS_XATTR_ITEM_KEY; 5791 else 5792 max_key.type = (u8)-1; 5793 max_key.offset = (u64)-1; 5794 5795 /* 5796 * Only run delayed items if we are a directory. We want to make sure 5797 * all directory indexes hit the fs/subvolume tree so we can find them 5798 * and figure out which index ranges have to be logged. 5799 */ 5800 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5801 ret = btrfs_commit_inode_delayed_items(trans, inode); 5802 if (ret) 5803 goto out; 5804 } 5805 5806 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { 5807 recursive_logging = true; 5808 if (inode_only == LOG_OTHER_INODE) 5809 inode_only = LOG_INODE_EXISTS; 5810 else 5811 inode_only = LOG_INODE_ALL; 5812 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING); 5813 } else { 5814 mutex_lock(&inode->log_mutex); 5815 } 5816 5817 /* 5818 * For symlinks, we must always log their content, which is stored in an 5819 * inline extent, otherwise we could end up with an empty symlink after 5820 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if 5821 * one attempts to create an empty symlink). 5822 * We don't need to worry about flushing delalloc, because when we create 5823 * the inline extent when the symlink is created (we never have delalloc 5824 * for symlinks). 5825 */ 5826 if (S_ISLNK(inode->vfs_inode.i_mode)) 5827 inode_only = LOG_INODE_ALL; 5828 5829 /* 5830 * Before logging the inode item, cache the value returned by 5831 * inode_logged(), because after that we have the need to figure out if 5832 * the inode was previously logged in this transaction. 5833 */ 5834 ret = inode_logged(trans, inode, path); 5835 if (ret < 0) 5836 goto out_unlock; 5837 ctx->logged_before = (ret == 1); 5838 ret = 0; 5839 5840 /* 5841 * This is for cases where logging a directory could result in losing a 5842 * a file after replaying the log. 
For example, if we move a file from a 5843 * directory A to a directory B, then fsync directory A, we have no way 5844 * to know the file was moved from A to B, so logging just A would 5845 * result in losing the file after a log replay. 5846 */ 5847 if (S_ISDIR(inode->vfs_inode.i_mode) && 5848 inode_only == LOG_INODE_ALL && 5849 inode->last_unlink_trans >= trans->transid) { 5850 btrfs_set_log_full_commit(trans); 5851 ret = BTRFS_LOG_FORCE_COMMIT; 5852 goto out_unlock; 5853 } 5854 5855 /* 5856 * a brute force approach to making sure we get the most uptodate 5857 * copies of everything. 5858 */ 5859 if (S_ISDIR(inode->vfs_inode.i_mode)) { 5860 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 5861 5862 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); 5863 if (inode_only == LOG_INODE_EXISTS) 5864 max_key_type = BTRFS_XATTR_ITEM_KEY; 5865 if (ctx->logged_before) 5866 ret = drop_inode_items(trans, log, path, inode, 5867 max_key_type); 5868 } else { 5869 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { 5870 /* 5871 * Make sure the new inode item we write to the log has 5872 * the same isize as the current one (if it exists). 5873 * This is necessary to prevent data loss after log 5874 * replay, and also to prevent doing a wrong expanding 5875 * truncate - e.g. create file, write 4K into offset 5876 * 0, fsync, write 4K into offset 4096, add hard link, 5877 * fsync some other file (to sync log), power fail - if 5878 * we use the inode's current i_size, after log replay 5879 * we get an 8Kb file, with the last 4Kb extent as a hole 5880 * (zeroes), as if an expanding truncate happened, 5881 * instead of getting a file of 4Kb only. 5882 */ 5883 ret = logged_inode_size(log, inode, path, &logged_isize); 5884 if (ret) 5885 goto out_unlock; 5886 } 5887 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5888 &inode->runtime_flags)) { 5889 if (inode_only == LOG_INODE_EXISTS) { 5890 max_key.type = BTRFS_XATTR_ITEM_KEY; 5891 if (ctx->logged_before) 5892 ret = drop_inode_items(trans, log, path, 5893 inode, max_key.type); 5894 } else { 5895 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 5896 &inode->runtime_flags); 5897 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5898 &inode->runtime_flags); 5899 if (ctx->logged_before) 5900 ret = truncate_inode_items(trans, log, 5901 inode, 0, 0); 5902 } 5903 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 5904 &inode->runtime_flags) || 5905 inode_only == LOG_INODE_EXISTS) { 5906 if (inode_only == LOG_INODE_ALL) 5907 fast_search = true; 5908 max_key.type = BTRFS_XATTR_ITEM_KEY; 5909 if (ctx->logged_before) 5910 ret = drop_inode_items(trans, log, path, inode, 5911 max_key.type); 5912 } else { 5913 if (inode_only == LOG_INODE_ALL) 5914 fast_search = true; 5915 inode_item_dropped = false; 5916 goto log_extents; 5917 } 5918 5919 } 5920 if (ret) 5921 goto out_unlock; 5922 5923 ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, 5924 path, dst_path, logged_isize, 5925 recursive_logging, inode_only, ctx, 5926 &need_log_inode_item); 5927 if (ret) 5928 goto out_unlock; 5929 5930 btrfs_release_path(path); 5931 btrfs_release_path(dst_path); 5932 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5933 if (ret) 5934 goto out_unlock; 5935 xattrs_logged = true; 5936 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { 5937 btrfs_release_path(path); 5938 btrfs_release_path(dst_path); 5939 ret = btrfs_log_holes(trans, inode, path); 5940 if (ret) 5941 goto out_unlock; 5942 } 5943 log_extents: 5944 btrfs_release_path(path); 5945
btrfs_release_path(dst_path); 5946 if (need_log_inode_item) { 5947 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); 5948 if (ret) 5949 goto out_unlock; 5950 /* 5951 * If we are doing a fast fsync and the inode was logged before 5952 * in this transaction, we don't need to log the xattrs because 5953 * they were logged before. If xattrs were added, changed or 5954 * deleted since the last time we logged the inode, then we have 5955 * already logged them because the inode had the runtime flag 5956 * BTRFS_INODE_COPY_EVERYTHING set. 5957 */ 5958 if (!xattrs_logged && inode->logged_trans < trans->transid) { 5959 ret = btrfs_log_all_xattrs(trans, inode, path, dst_path); 5960 if (ret) 5961 goto out_unlock; 5962 btrfs_release_path(path); 5963 } 5964 } 5965 if (fast_search) { 5966 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); 5967 if (ret) 5968 goto out_unlock; 5969 } else if (inode_only == LOG_INODE_ALL) { 5970 struct extent_map *em, *n; 5971 5972 write_lock(&em_tree->lock); 5973 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) 5974 list_del_init(&em->list); 5975 write_unlock(&em_tree->lock); 5976 } 5977 5978 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { 5979 ret = log_directory_changes(trans, inode, path, dst_path, ctx); 5980 if (ret) 5981 goto out_unlock; 5982 } 5983 5984 spin_lock(&inode->lock); 5985 inode->logged_trans = trans->transid; 5986 /* 5987 * Don't update last_log_commit if we logged that an inode exists. 5988 * We do this for three reasons: 5989 * 5990 * 1) We might have had buffered writes to this inode that were 5991 * flushed and had their ordered extents completed in this 5992 * transaction, but we did not previously log the inode with 5993 * LOG_INODE_ALL. Later the inode was evicted and after that 5994 * it was loaded again and this LOG_INODE_EXISTS log operation 5995 * happened. We must make sure that if an explicit fsync against 5996 * the inode is performed later, it logs the new extents, an 5997 * updated inode item, etc, and syncs the log. The same logic 5998 * applies to direct IO writes instead of buffered writes. 5999 * 6000 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item 6001 * is logged with an i_size of 0 or whatever value was logged 6002 * before. If later the i_size of the inode is increased by a 6003 * truncate operation, the log is synced through an fsync of 6004 * some other inode and then finally an explicit fsync against 6005 * this inode is made, we must make sure this fsync logs the 6006 * inode with the new i_size, the hole between old i_size and 6007 * the new i_size, and syncs the log. 6008 * 6009 * 3) If we are logging that an ancestor inode exists as part of 6010 * logging a new name from a link or rename operation, don't update 6011 * its last_log_commit - otherwise if an explicit fsync is made 6012 * against an ancestor, the fsync considers the inode in the log 6013 * and doesn't sync the log, resulting in the ancestor missing after 6014 * a power failure unless the log was synced as part of an fsync 6015 * against any other unrelated inode. 6016 */ 6017 if (inode_only != LOG_INODE_EXISTS) 6018 inode->last_log_commit = inode->last_sub_trans; 6019 spin_unlock(&inode->lock); 6020 6021 /* 6022 * Reset the last_reflink_trans so that the next fsync does not need to 6023 * go through the slower path when logging extents and their checksums. 
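 *
 * A rough example where this matters (command names and sizes are only
 * illustrative; any reflink/clone-capable copy triggers the same path):
 *
 *	xfs_io -f -c "pwrite 0 64K" /mnt/src
 *	cp --reflink=always /mnt/src /mnt/dst
 *	xfs_io -c "fsync" /mnt/dst
 *	xfs_io -c "pwrite 64K 64K" -c "fsync" /mnt/dst
 *
 * The clone sets last_reflink_trans on the destination inode, so the first
 * fsync may have to look up checksums for shared extents the slow way. Since
 * we fully logged the inode above, we reset last_reflink_trans, and the
 * second fsync, in the same transaction, can take the faster path again.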
6024 */ 6025 if (inode_only == LOG_INODE_ALL) 6026 inode->last_reflink_trans = 0; 6027 6028 out_unlock: 6029 mutex_unlock(&inode->log_mutex); 6030 out: 6031 btrfs_free_path(path); 6032 btrfs_free_path(dst_path); 6033 6034 if (recursive_logging) 6035 ctx->logged_before = orig_logged_before; 6036 6037 return ret; 6038 } 6039 6040 /* 6041 * Check if we need to log an inode. This is used in contexts where while 6042 * logging an inode we need to log another inode (either that it exists or in 6043 * full mode). This is used instead of btrfs_inode_in_log() because the latter 6044 * requires the inode to be in the log and have the log transaction committed, 6045 * while here we do not care if the log transaction was already committed - our 6046 * caller will commit the log later - and we want to avoid logging an inode 6047 * multiple times when multiple tasks have joined the same log transaction. 6048 */ 6049 static bool need_log_inode(struct btrfs_trans_handle *trans, 6050 struct btrfs_inode *inode) 6051 { 6052 /* 6053 * If a directory was not modified, no dentries added or removed, we can 6054 * and should avoid logging it. 6055 */ 6056 if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) 6057 return false; 6058 6059 /* 6060 * If this inode does not have new/updated/deleted xattrs since the last 6061 * time it was logged and is flagged as logged in the current transaction, 6062 * we can skip logging it. As for new/deleted names, those are updated in 6063 * the log by link/unlink/rename operations. 6064 * In case the inode was logged and then evicted and reloaded, its 6065 * logged_trans will be 0, in which case we have to fully log it since 6066 * logged_trans is a transient field, not persisted. 6067 */ 6068 if (inode->logged_trans == trans->transid && 6069 !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) 6070 return false; 6071 6072 return true; 6073 } 6074 6075 struct btrfs_dir_list { 6076 u64 ino; 6077 struct list_head list; 6078 }; 6079 6080 /* 6081 * Log the inodes of the new dentries of a directory. See log_dir_items() for 6082 * details about why it is needed. 6083 * This is a recursive operation - if an existing dentry corresponds to a 6084 * directory, that directory's new entries are logged too (same behaviour as 6085 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes 6086 * the dentries point to we do not lock their i_mutex, otherwise lockdep 6087 * complains about the following circular lock dependency / possible deadlock: 6088 * 6089 * CPU0 CPU1 6090 * ---- ---- 6091 * lock(&type->i_mutex_dir_key#3/2); 6092 * lock(sb_internal#2); 6093 * lock(&type->i_mutex_dir_key#3/2); 6094 * lock(&sb->s_type->i_mutex_key#14); 6095 * 6096 * Where sb_internal is the lock (a counter that works as a lock) acquired by 6097 * sb_start_intwrite() in btrfs_start_transaction(). 6098 * Not locking i_mutex of the inodes is still safe because: 6099 * 6100 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible 6101 * that while logging the inode new references (names) are added or removed 6102 * from the inode, leaving the logged inode item with a link count that does 6103 * not match the number of logged inode reference items. This is fine because 6104 * at log replay time we compute the real number of links and correct the 6105 * link count in the inode item (see replay_one_buffer() and 6106 * link_to_fixup_dir()); 6107 * 6108 * 2) For directories we log with a mode of LOG_INODE_ALL.
It's possible that 6109 * while logging the inode's items new index items (key type 6110 * BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item 6111 * has a size that doesn't match the sum of the lengths of all the logged 6112 * names - this is ok, not a problem, because at log replay time we set the 6113 * directory's i_size to the correct value (see replay_one_name() and 6114 * do_overwrite_item()). 6115 */ 6116 static int log_new_dir_dentries(struct btrfs_trans_handle *trans, 6117 struct btrfs_root *root, 6118 struct btrfs_inode *start_inode, 6119 struct btrfs_log_ctx *ctx) 6120 { 6121 struct btrfs_fs_info *fs_info = root->fs_info; 6122 struct btrfs_path *path; 6123 LIST_HEAD(dir_list); 6124 struct btrfs_dir_list *dir_elem; 6125 int ret = 0; 6126 6127 /* 6128 * If we are logging a new name, as part of a link or rename operation, 6129 * don't bother logging new dentries, as we just want to log the names 6130 * of an inode and that any new parents exist. 6131 */ 6132 if (ctx->logging_new_name) 6133 return 0; 6134 6135 path = btrfs_alloc_path(); 6136 if (!path) 6137 return -ENOMEM; 6138 6139 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); 6140 if (!dir_elem) { 6141 btrfs_free_path(path); 6142 return -ENOMEM; 6143 } 6144 dir_elem->ino = btrfs_ino(start_inode); 6145 list_add_tail(&dir_elem->list, &dir_list); 6146 6147 while (!list_empty(&dir_list)) { 6148 struct extent_buffer *leaf; 6149 struct btrfs_key min_key; 6150 int nritems; 6151 int i; 6152 6153 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, 6154 list); 6155 if (ret) 6156 goto next_dir_inode; 6157 6158 min_key.objectid = dir_elem->ino; 6159 min_key.type = BTRFS_DIR_INDEX_KEY; 6160 min_key.offset = 0; 6161 again: 6162 btrfs_release_path(path); 6163 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 6164 if (ret < 0) { 6165 goto next_dir_inode; 6166 } else if (ret > 0) { 6167 ret = 0; 6168 goto next_dir_inode; 6169 } 6170 6171 leaf = path->nodes[0]; 6172 nritems = btrfs_header_nritems(leaf); 6173 for (i = path->slots[0]; i < nritems; i++) { 6174 struct btrfs_dir_item *di; 6175 struct btrfs_key di_key; 6176 struct inode *di_inode; 6177 struct btrfs_dir_list *new_dir_elem; 6178 int log_mode = LOG_INODE_EXISTS; 6179 int type; 6180 6181 btrfs_item_key_to_cpu(leaf, &min_key, i); 6182 if (min_key.objectid != dir_elem->ino || 6183 min_key.type != BTRFS_DIR_INDEX_KEY) 6184 goto next_dir_inode; 6185 6186 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); 6187 type = btrfs_dir_type(leaf, di); 6188 if (btrfs_dir_transid(leaf, di) < trans->transid) 6189 continue; 6190 btrfs_dir_item_key_to_cpu(leaf, di, &di_key); 6191 if (di_key.type == BTRFS_ROOT_ITEM_KEY) 6192 continue; 6193 6194 btrfs_release_path(path); 6195 di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); 6196 if (IS_ERR(di_inode)) { 6197 ret = PTR_ERR(di_inode); 6198 goto next_dir_inode; 6199 } 6200 6201 if (!need_log_inode(trans, BTRFS_I(di_inode))) { 6202 btrfs_add_delayed_iput(di_inode); 6203 break; 6204 } 6205 6206 ctx->log_new_dentries = false; 6207 if (type == BTRFS_FT_DIR) 6208 log_mode = LOG_INODE_ALL; 6209 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), 6210 log_mode, ctx); 6211 btrfs_add_delayed_iput(di_inode); 6212 if (ret) 6213 goto next_dir_inode; 6214 if (ctx->log_new_dentries) { 6215 new_dir_elem = kmalloc(sizeof(*new_dir_elem), 6216 GFP_NOFS); 6217 if (!new_dir_elem) { 6218 ret = -ENOMEM; 6219 goto next_dir_inode; 6220 } 6221 new_dir_elem->ino = di_key.objectid; 6222 list_add_tail(&new_dir_elem->list, &dir_list); 
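/* The queued directory will be picked up and logged by a later iteration of the outer while loop. */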
6223 } 6224 break; 6225 } 6226 if (min_key.offset < (u64)-1) { 6227 min_key.offset++; 6228 goto again; 6229 } 6230 next_dir_inode: 6231 list_del(&dir_elem->list); 6232 kfree(dir_elem); 6233 } 6234 6235 btrfs_free_path(path); 6236 return ret; 6237 } 6238 6239 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, 6240 struct btrfs_inode *inode, 6241 struct btrfs_log_ctx *ctx) 6242 { 6243 struct btrfs_fs_info *fs_info = trans->fs_info; 6244 int ret; 6245 struct btrfs_path *path; 6246 struct btrfs_key key; 6247 struct btrfs_root *root = inode->root; 6248 const u64 ino = btrfs_ino(inode); 6249 6250 path = btrfs_alloc_path(); 6251 if (!path) 6252 return -ENOMEM; 6253 path->skip_locking = 1; 6254 path->search_commit_root = 1; 6255 6256 key.objectid = ino; 6257 key.type = BTRFS_INODE_REF_KEY; 6258 key.offset = 0; 6259 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 6260 if (ret < 0) 6261 goto out; 6262 6263 while (true) { 6264 struct extent_buffer *leaf = path->nodes[0]; 6265 int slot = path->slots[0]; 6266 u32 cur_offset = 0; 6267 u32 item_size; 6268 unsigned long ptr; 6269 6270 if (slot >= btrfs_header_nritems(leaf)) { 6271 ret = btrfs_next_leaf(root, path); 6272 if (ret < 0) 6273 goto out; 6274 else if (ret > 0) 6275 break; 6276 continue; 6277 } 6278 6279 btrfs_item_key_to_cpu(leaf, &key, slot); 6280 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */ 6281 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY) 6282 break; 6283 6284 item_size = btrfs_item_size(leaf, slot); 6285 ptr = btrfs_item_ptr_offset(leaf, slot); 6286 while (cur_offset < item_size) { 6287 struct btrfs_key inode_key; 6288 struct inode *dir_inode; 6289 6290 inode_key.type = BTRFS_INODE_ITEM_KEY; 6291 inode_key.offset = 0; 6292 6293 if (key.type == BTRFS_INODE_EXTREF_KEY) { 6294 struct btrfs_inode_extref *extref; 6295 6296 extref = (struct btrfs_inode_extref *) 6297 (ptr + cur_offset); 6298 inode_key.objectid = btrfs_inode_extref_parent( 6299 leaf, extref); 6300 cur_offset += sizeof(*extref); 6301 cur_offset += btrfs_inode_extref_name_len(leaf, 6302 extref); 6303 } else { 6304 inode_key.objectid = key.offset; 6305 cur_offset = item_size; 6306 } 6307 6308 dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, 6309 root); 6310 /* 6311 * If the parent inode was deleted, return an error to 6312 * fallback to a transaction commit. This is to prevent 6313 * getting an inode that was moved from one parent A to 6314 * a parent B, got its former parent A deleted and then 6315 * it got fsync'ed, from existing at both parents after 6316 * a log replay (and the old parent still existing). 6317 * Example: 6318 * 6319 * mkdir /mnt/A 6320 * mkdir /mnt/B 6321 * touch /mnt/B/bar 6322 * sync 6323 * mv /mnt/B/bar /mnt/A/bar 6324 * mv -T /mnt/A /mnt/B 6325 * fsync /mnt/B/bar 6326 * <power fail> 6327 * 6328 * If we ignore the old parent B which got deleted, 6329 * after a log replay we would have file bar linked 6330 * at both parents and the old parent B would still 6331 * exist. 
6332 */ 6333 if (IS_ERR(dir_inode)) { 6334 ret = PTR_ERR(dir_inode); 6335 goto out; 6336 } 6337 6338 if (!need_log_inode(trans, BTRFS_I(dir_inode))) { 6339 btrfs_add_delayed_iput(dir_inode); 6340 continue; 6341 } 6342 6343 ctx->log_new_dentries = false; 6344 ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), 6345 LOG_INODE_ALL, ctx); 6346 if (!ret && ctx->log_new_dentries) 6347 ret = log_new_dir_dentries(trans, root, 6348 BTRFS_I(dir_inode), ctx); 6349 btrfs_add_delayed_iput(dir_inode); 6350 if (ret) 6351 goto out; 6352 } 6353 path->slots[0]++; 6354 } 6355 ret = 0; 6356 out: 6357 btrfs_free_path(path); 6358 return ret; 6359 } 6360 6361 static int log_new_ancestors(struct btrfs_trans_handle *trans, 6362 struct btrfs_root *root, 6363 struct btrfs_path *path, 6364 struct btrfs_log_ctx *ctx) 6365 { 6366 struct btrfs_key found_key; 6367 6368 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 6369 6370 while (true) { 6371 struct btrfs_fs_info *fs_info = root->fs_info; 6372 struct extent_buffer *leaf = path->nodes[0]; 6373 int slot = path->slots[0]; 6374 struct btrfs_key search_key; 6375 struct inode *inode; 6376 u64 ino; 6377 int ret = 0; 6378 6379 btrfs_release_path(path); 6380 6381 ino = found_key.offset; 6382 6383 search_key.objectid = found_key.offset; 6384 search_key.type = BTRFS_INODE_ITEM_KEY; 6385 search_key.offset = 0; 6386 inode = btrfs_iget(fs_info->sb, ino, root); 6387 if (IS_ERR(inode)) 6388 return PTR_ERR(inode); 6389 6390 if (BTRFS_I(inode)->generation >= trans->transid && 6391 need_log_inode(trans, BTRFS_I(inode))) 6392 ret = btrfs_log_inode(trans, BTRFS_I(inode), 6393 LOG_INODE_EXISTS, ctx); 6394 btrfs_add_delayed_iput(inode); 6395 if (ret) 6396 return ret; 6397 6398 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID) 6399 break; 6400 6401 search_key.type = BTRFS_INODE_REF_KEY; 6402 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6403 if (ret < 0) 6404 return ret; 6405 6406 leaf = path->nodes[0]; 6407 slot = path->slots[0]; 6408 if (slot >= btrfs_header_nritems(leaf)) { 6409 ret = btrfs_next_leaf(root, path); 6410 if (ret < 0) 6411 return ret; 6412 else if (ret > 0) 6413 return -ENOENT; 6414 leaf = path->nodes[0]; 6415 slot = path->slots[0]; 6416 } 6417 6418 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6419 if (found_key.objectid != search_key.objectid || 6420 found_key.type != BTRFS_INODE_REF_KEY) 6421 return -ENOENT; 6422 } 6423 return 0; 6424 } 6425 6426 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, 6427 struct btrfs_inode *inode, 6428 struct dentry *parent, 6429 struct btrfs_log_ctx *ctx) 6430 { 6431 struct btrfs_root *root = inode->root; 6432 struct dentry *old_parent = NULL; 6433 struct super_block *sb = inode->vfs_inode.i_sb; 6434 int ret = 0; 6435 6436 while (true) { 6437 if (!parent || d_really_is_negative(parent) || 6438 sb != parent->d_sb) 6439 break; 6440 6441 inode = BTRFS_I(d_inode(parent)); 6442 if (root != inode->root) 6443 break; 6444 6445 if (inode->generation >= trans->transid && 6446 need_log_inode(trans, inode)) { 6447 ret = btrfs_log_inode(trans, inode, 6448 LOG_INODE_EXISTS, ctx); 6449 if (ret) 6450 break; 6451 } 6452 if (IS_ROOT(parent)) 6453 break; 6454 6455 parent = dget_parent(parent); 6456 dput(old_parent); 6457 old_parent = parent; 6458 } 6459 dput(old_parent); 6460 6461 return ret; 6462 } 6463 6464 static int log_all_new_ancestors(struct btrfs_trans_handle *trans, 6465 struct btrfs_inode *inode, 6466 struct dentry *parent, 6467 struct btrfs_log_ctx *ctx) 6468 { 6469 struct btrfs_root *root = 
inode->root; 6470 const u64 ino = btrfs_ino(inode); 6471 struct btrfs_path *path; 6472 struct btrfs_key search_key; 6473 int ret; 6474 6475 /* 6476 * For a single hard link case, go through a fast path that does not 6477 * need to iterate the fs/subvolume tree. 6478 */ 6479 if (inode->vfs_inode.i_nlink < 2) 6480 return log_new_ancestors_fast(trans, inode, parent, ctx); 6481 6482 path = btrfs_alloc_path(); 6483 if (!path) 6484 return -ENOMEM; 6485 6486 search_key.objectid = ino; 6487 search_key.type = BTRFS_INODE_REF_KEY; 6488 search_key.offset = 0; 6489 again: 6490 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 6491 if (ret < 0) 6492 goto out; 6493 if (ret == 0) 6494 path->slots[0]++; 6495 6496 while (true) { 6497 struct extent_buffer *leaf = path->nodes[0]; 6498 int slot = path->slots[0]; 6499 struct btrfs_key found_key; 6500 6501 if (slot >= btrfs_header_nritems(leaf)) { 6502 ret = btrfs_next_leaf(root, path); 6503 if (ret < 0) 6504 goto out; 6505 else if (ret > 0) 6506 break; 6507 continue; 6508 } 6509 6510 btrfs_item_key_to_cpu(leaf, &found_key, slot); 6511 if (found_key.objectid != ino || 6512 found_key.type > BTRFS_INODE_EXTREF_KEY) 6513 break; 6514 6515 /* 6516 * Don't deal with extended references because they are rare 6517 * cases and too complex to deal with (we would need to keep 6518 * track of which subitem we are processing for each item in 6519 * this loop, etc). So just return some error to fallback to 6520 * a transaction commit. 6521 */ 6522 if (found_key.type == BTRFS_INODE_EXTREF_KEY) { 6523 ret = -EMLINK; 6524 goto out; 6525 } 6526 6527 /* 6528 * Logging ancestors needs to do more searches on the fs/subvol 6529 * tree, so it releases the path as needed to avoid deadlocks. 6530 * Keep track of the last inode ref key and resume from that key 6531 * after logging all new ancestors for the current hard link. 6532 */ 6533 memcpy(&search_key, &found_key, sizeof(search_key)); 6534 6535 ret = log_new_ancestors(trans, root, path, ctx); 6536 if (ret) 6537 goto out; 6538 btrfs_release_path(path); 6539 goto again; 6540 } 6541 ret = 0; 6542 out: 6543 btrfs_free_path(path); 6544 return ret; 6545 } 6546 6547 /* 6548 * helper function around btrfs_log_inode to make sure newly created 6549 * parent directories also end up in the log. A minimal inode and backref 6550 * only logging is done of any parent directories that are older than 6551 * the last committed transaction 6552 */ 6553 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 6554 struct btrfs_inode *inode, 6555 struct dentry *parent, 6556 int inode_only, 6557 struct btrfs_log_ctx *ctx) 6558 { 6559 struct btrfs_root *root = inode->root; 6560 struct btrfs_fs_info *fs_info = root->fs_info; 6561 int ret = 0; 6562 bool log_dentries = false; 6563 6564 if (btrfs_test_opt(fs_info, NOTREELOG)) { 6565 ret = BTRFS_LOG_FORCE_COMMIT; 6566 goto end_no_trans; 6567 } 6568 6569 if (btrfs_root_refs(&root->root_item) == 0) { 6570 ret = BTRFS_LOG_FORCE_COMMIT; 6571 goto end_no_trans; 6572 } 6573 6574 /* 6575 * Skip already logged inodes or inodes corresponding to tmpfiles 6576 * (since logging them is pointless, a link count of 0 means they 6577 * will never be accessible). 
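 * A typical example of the latter is a file created with O_TMPFILE that
 * was never linked into any directory.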
6578 */ 6579 if ((btrfs_inode_in_log(inode, trans->transid) && 6580 list_empty(&ctx->ordered_extents)) || 6581 inode->vfs_inode.i_nlink == 0) { 6582 ret = BTRFS_NO_LOG_SYNC; 6583 goto end_no_trans; 6584 } 6585 6586 ret = start_log_trans(trans, root, ctx); 6587 if (ret) 6588 goto end_no_trans; 6589 6590 ret = btrfs_log_inode(trans, inode, inode_only, ctx); 6591 if (ret) 6592 goto end_trans; 6593 6594 /* 6595 * for regular files, if the inode is already on disk, we don't 6596 * have to worry about the parents at all. This is because 6597 * we can use the last_unlink_trans field to record renames 6598 * and other fun in this file. 6599 */ 6600 if (S_ISREG(inode->vfs_inode.i_mode) && 6601 inode->generation < trans->transid && 6602 inode->last_unlink_trans < trans->transid) { 6603 ret = 0; 6604 goto end_trans; 6605 } 6606 6607 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) 6608 log_dentries = true; 6609 6610 /* 6611 * On unlink we must make sure all our current and old parent directory 6612 * inodes are fully logged. This is to prevent leaving dangling 6613 * directory index entries in directories that were our parents but are 6614 * not anymore. Not doing this results in the old parent directory being 6615 * impossible to delete after log replay (rmdir will always fail with 6616 * error -ENOTEMPTY). 6617 * 6618 * Example 1: 6619 * 6620 * mkdir testdir 6621 * touch testdir/foo 6622 * ln testdir/foo testdir/bar 6623 * sync 6624 * unlink testdir/bar 6625 * xfs_io -c fsync testdir/foo 6626 * <power failure> 6627 * mount fs, triggers log replay 6628 * 6629 * If we don't log the parent directory (testdir), after log replay the 6630 * directory still has an entry pointing to the file inode using the bar 6631 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and 6632 * the file inode has a link count of 1. 6633 * 6634 * Example 2: 6635 * 6636 * mkdir testdir 6637 * touch foo 6638 * ln foo testdir/foo2 6639 * ln foo testdir/foo3 6640 * sync 6641 * unlink testdir/foo3 6642 * xfs_io -c fsync foo 6643 * <power failure> 6644 * mount fs, triggers log replay 6645 * 6646 * Similar to the first example, after log replay the parent directory 6647 * testdir still has an entry pointing to the file inode with name foo3 6648 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item 6649 * and has a link count of 2. 6650 */ 6651 if (inode->last_unlink_trans >= trans->transid) { 6652 ret = btrfs_log_all_parents(trans, inode, ctx); 6653 if (ret) 6654 goto end_trans; 6655 } 6656 6657 ret = log_all_new_ancestors(trans, inode, parent, ctx); 6658 if (ret) 6659 goto end_trans; 6660 6661 if (log_dentries) 6662 ret = log_new_dir_dentries(trans, root, inode, ctx); 6663 else 6664 ret = 0; 6665 end_trans: 6666 if (ret < 0) { 6667 btrfs_set_log_full_commit(trans); 6668 ret = BTRFS_LOG_FORCE_COMMIT; 6669 } 6670 6671 if (ret) 6672 btrfs_remove_log_ctx(root, ctx); 6673 btrfs_end_log_trans(root); 6674 end_no_trans: 6675 return ret; 6676 } 6677 6678 /* 6679 * it is not safe to log a dentry if the chunk root has added new 6680 * chunks. This returns 0 if the dentry was logged, and 1 otherwise. 6681 * If this returns 1, you must commit the transaction to safely get your 6682 * data on disk.
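 *
 * A rough sketch of a caller (simplified from the fsync path; error handling
 * and the BTRFS_NO_LOG_SYNC case are omitted here):
 *
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *	else
 *		ret = btrfs_commit_transaction(trans);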
6683 */ 6684 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 6685 struct dentry *dentry, 6686 struct btrfs_log_ctx *ctx) 6687 { 6688 struct dentry *parent = dget_parent(dentry); 6689 int ret; 6690 6691 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, 6692 LOG_INODE_ALL, ctx); 6693 dput(parent); 6694 6695 return ret; 6696 } 6697 6698 /* 6699 * should be called during mount to recover and replay any log trees 6700 * from the FS 6701 */ 6702 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) 6703 { 6704 int ret; 6705 struct btrfs_path *path; 6706 struct btrfs_trans_handle *trans; 6707 struct btrfs_key key; 6708 struct btrfs_key found_key; 6709 struct btrfs_root *log; 6710 struct btrfs_fs_info *fs_info = log_root_tree->fs_info; 6711 struct walk_control wc = { 6712 .process_func = process_one_buffer, 6713 .stage = LOG_WALK_PIN_ONLY, 6714 }; 6715 6716 path = btrfs_alloc_path(); 6717 if (!path) 6718 return -ENOMEM; 6719 6720 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6721 6722 trans = btrfs_start_transaction(fs_info->tree_root, 0); 6723 if (IS_ERR(trans)) { 6724 ret = PTR_ERR(trans); 6725 goto error; 6726 } 6727 6728 wc.trans = trans; 6729 wc.pin = 1; 6730 6731 ret = walk_log_tree(trans, log_root_tree, &wc); 6732 if (ret) { 6733 btrfs_abort_transaction(trans, ret); 6734 goto error; 6735 } 6736 6737 again: 6738 key.objectid = BTRFS_TREE_LOG_OBJECTID; 6739 key.offset = (u64)-1; 6740 key.type = BTRFS_ROOT_ITEM_KEY; 6741 6742 while (1) { 6743 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 6744 6745 if (ret < 0) { 6746 btrfs_abort_transaction(trans, ret); 6747 goto error; 6748 } 6749 if (ret > 0) { 6750 if (path->slots[0] == 0) 6751 break; 6752 path->slots[0]--; 6753 } 6754 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 6755 path->slots[0]); 6756 btrfs_release_path(path); 6757 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 6758 break; 6759 6760 log = btrfs_read_tree_root(log_root_tree, &found_key); 6761 if (IS_ERR(log)) { 6762 ret = PTR_ERR(log); 6763 btrfs_abort_transaction(trans, ret); 6764 goto error; 6765 } 6766 6767 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, 6768 true); 6769 if (IS_ERR(wc.replay_dest)) { 6770 ret = PTR_ERR(wc.replay_dest); 6771 6772 /* 6773 * We didn't find the subvol, likely because it was 6774 * deleted. This is ok, simply skip this log and go to 6775 * the next one. 6776 * 6777 * We need to exclude the root because we can't have 6778 * other log replays overwriting this log as we'll read 6779 * it back in a few more times. This will keep our 6780 * block from being modified, and we'll just bail for 6781 * each subsequent pass.
6782 */ 6783 if (ret == -ENOENT) 6784 ret = btrfs_pin_extent_for_log_replay(trans, 6785 log->node->start, 6786 log->node->len); 6787 btrfs_put_root(log); 6788 6789 if (!ret) 6790 goto next; 6791 btrfs_abort_transaction(trans, ret); 6792 goto error; 6793 } 6794 6795 wc.replay_dest->log_root = log; 6796 ret = btrfs_record_root_in_trans(trans, wc.replay_dest); 6797 if (ret) 6798 /* The loop needs to continue due to the root refs */ 6799 btrfs_abort_transaction(trans, ret); 6800 else 6801 ret = walk_log_tree(trans, log, &wc); 6802 6803 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6804 ret = fixup_inode_link_counts(trans, wc.replay_dest, 6805 path); 6806 if (ret) 6807 btrfs_abort_transaction(trans, ret); 6808 } 6809 6810 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { 6811 struct btrfs_root *root = wc.replay_dest; 6812 6813 btrfs_release_path(path); 6814 6815 /* 6816 * We have just replayed everything, and the highest 6817 * objectid of fs roots probably has changed in case 6818 * some inode_item's got replayed. 6819 * 6820 * root->objectid_mutex is not acquired as log replay 6821 * could only happen during mount. 6822 */ 6823 ret = btrfs_init_root_free_objectid(root); 6824 if (ret) 6825 btrfs_abort_transaction(trans, ret); 6826 } 6827 6828 wc.replay_dest->log_root = NULL; 6829 btrfs_put_root(wc.replay_dest); 6830 btrfs_put_root(log); 6831 6832 if (ret) 6833 goto error; 6834 next: 6835 if (found_key.offset == 0) 6836 break; 6837 key.offset = found_key.offset - 1; 6838 } 6839 btrfs_release_path(path); 6840 6841 /* step one is to pin it all, step two is to replay just inodes */ 6842 if (wc.pin) { 6843 wc.pin = 0; 6844 wc.process_func = replay_one_buffer; 6845 wc.stage = LOG_WALK_REPLAY_INODES; 6846 goto again; 6847 } 6848 /* step three is to replay everything */ 6849 if (wc.stage < LOG_WALK_REPLAY_ALL) { 6850 wc.stage++; 6851 goto again; 6852 } 6853 6854 btrfs_free_path(path); 6855 6856 /* step 4: commit the transaction, which also unpins the blocks */ 6857 ret = btrfs_commit_transaction(trans); 6858 if (ret) 6859 return ret; 6860 6861 log_root_tree->log_root = NULL; 6862 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6863 btrfs_put_root(log_root_tree); 6864 6865 return 0; 6866 error: 6867 if (wc.trans) 6868 btrfs_end_transaction(wc.trans); 6869 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6870 btrfs_free_path(path); 6871 return ret; 6872 } 6873 6874 /* 6875 * there are some corner cases where we want to force a full 6876 * commit instead of allowing a directory to be logged. 6877 * 6878 * They revolve around files that were unlinked from the directory, and 6879 * this function updates the parent directory so that a full commit is 6880 * properly done if it is fsync'd later after the unlinks are done. 6881 * 6882 * Must be called before the unlink operations (updates to the subvolume tree, 6883 * inodes, etc) are done. 6884 */ 6885 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, 6886 struct btrfs_inode *dir, struct btrfs_inode *inode, 6887 int for_rename) 6888 { 6889 /* 6890 * when we're logging a file, if it hasn't been renamed 6891 * or unlinked, and its inode is fully committed on disk, 6892 * we don't have to worry about walking up the directory chain 6893 * to log its parents. 6894 * 6895 * So, we use the last_unlink_trans field to put this transid 6896 * into the file. When the file is logged we check it and 6897 * don't log the parents if the file is fully on disk.
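 *
 * One consumer of this field is btrfs_log_inode_parent(), which skips
 * logging any parent directories roughly like this:
 *
 *	if (S_ISREG(inode->vfs_inode.i_mode) &&
 *	    inode->generation < trans->transid &&
 *	    inode->last_unlink_trans < trans->transid)
 *		return 0;   <- no rename/unlink in this transaction, so the
 *			       parent directories don't need to be logged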
6898 */ 6899 mutex_lock(&inode->log_mutex); 6900 inode->last_unlink_trans = trans->transid; 6901 mutex_unlock(&inode->log_mutex); 6902 6903 /* 6904 * if this directory was already logged any new 6905 * names for this file/dir will get recorded 6906 */ 6907 if (dir->logged_trans == trans->transid) 6908 return; 6909 6910 /* 6911 * if the inode we're about to unlink was logged, 6912 * the log will be properly updated for any new names 6913 */ 6914 if (inode->logged_trans == trans->transid) 6915 return; 6916 6917 /* 6918 * when renaming files across directories, if the directory 6919 * we're unlinking from gets fsync'd later on, there's 6920 * no way to find the destination directory later and fsync it 6921 * properly. So, we have to be conservative and force commits 6922 * so the new name gets discovered. 6923 */ 6924 if (for_rename) 6925 goto record; 6926 6927 /* we can safely do the unlink without any special recording */ 6928 return; 6929 6930 record: 6931 mutex_lock(&dir->log_mutex); 6932 dir->last_unlink_trans = trans->transid; 6933 mutex_unlock(&dir->log_mutex); 6934 } 6935 6936 /* 6937 * Make sure that if someone attempts to fsync the parent directory of a deleted 6938 * snapshot, it ends up triggering a transaction commit. This is to guarantee 6939 * that after replaying the log tree of the parent directory's root we will not 6940 * see the snapshot anymore and at log replay time we will not see any log tree 6941 * corresponding to the deleted snapshot's root, which could lead to replaying 6942 * it after replaying the log tree of the parent directory (which would replay 6943 * the snapshot delete operation). 6944 * 6945 * Must be called before the actual snapshot destroy operation (updates to the 6946 * parent root and the tree of tree roots, etc) is done. 6947 */ 6948 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, 6949 struct btrfs_inode *dir) 6950 { 6951 mutex_lock(&dir->log_mutex); 6952 dir->last_unlink_trans = trans->transid; 6953 mutex_unlock(&dir->log_mutex); 6954 } 6955 6956 /** 6957 * Update the log after adding a new name for an inode. 6958 * 6959 * @trans: Transaction handle. 6960 * @old_dentry: The dentry associated with the old name and the old 6961 * parent directory. 6962 * @old_dir: The inode of the previous parent directory for the case 6963 * of a rename. For a link operation, it must be NULL. 6964 * @old_dir_index: The index number associated with the old name, meaningful 6965 * only for rename operations (when @old_dir is not NULL). 6966 * Ignored for link operations. 6967 * @parent: The dentry associated with the directory under which the 6968 * new name is located. 6969 * 6970 * Call this after adding a new name for an inode, as a result of a link or 6971 * rename operation, and it will properly update the log to reflect the new name.
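 *
 * For example, a link operation ends up calling this along the lines of
 * (for links @old_dir must be NULL and @old_dir_index is ignored):
 *
 *	btrfs_log_new_name(trans, dentry, NULL, 0, dentry->d_parent);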
6972 */ 6973 void btrfs_log_new_name(struct btrfs_trans_handle *trans, 6974 struct dentry *old_dentry, struct btrfs_inode *old_dir, 6975 u64 old_dir_index, struct dentry *parent) 6976 { 6977 struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); 6978 struct btrfs_root *root = inode->root; 6979 struct btrfs_log_ctx ctx; 6980 bool log_pinned = false; 6981 int ret; 6982 6983 /* 6984 * this will force the logging code to walk the dentry chain 6985 * up for the file 6986 */ 6987 if (!S_ISDIR(inode->vfs_inode.i_mode)) 6988 inode->last_unlink_trans = trans->transid; 6989 6990 /* 6991 * if this inode hasn't been logged and directory we're renaming it 6992 * from hasn't been logged, we don't need to log it 6993 */ 6994 ret = inode_logged(trans, inode, NULL); 6995 if (ret < 0) { 6996 goto out; 6997 } else if (ret == 0) { 6998 if (!old_dir) 6999 return; 7000 /* 7001 * If the inode was not logged and we are doing a rename (old_dir is not 7002 * NULL), check if old_dir was logged - if it was not we can return and 7003 * do nothing. 7004 */ 7005 ret = inode_logged(trans, old_dir, NULL); 7006 if (ret < 0) 7007 goto out; 7008 else if (ret == 0) 7009 return; 7010 } 7011 ret = 0; 7012 7013 /* 7014 * If we are doing a rename (old_dir is not NULL) from a directory that 7015 * was previously logged, make sure that on log replay we get the old 7016 * dir entry deleted. This is needed because we will also log the new 7017 * name of the renamed inode, so we need to make sure that after log 7018 * replay we don't end up with both the new and old dir entries existing. 7019 */ 7020 if (old_dir && old_dir->logged_trans == trans->transid) { 7021 struct btrfs_root *log = old_dir->root->log_root; 7022 struct btrfs_path *path; 7023 7024 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); 7025 7026 /* 7027 * We have two inodes to update in the log, the old directory and 7028 * the inode that got renamed, so we must pin the log to prevent 7029 * anyone from syncing the log until we have updated both inodes 7030 * in the log. 7031 */ 7032 ret = join_running_log_trans(root); 7033 /* 7034 * At least one of the inodes was logged before, so this should 7035 * not fail, but if it does, it's not serious, just bail out and 7036 * mark the log for a full commit. 7037 */ 7038 if (WARN_ON_ONCE(ret < 0)) 7039 goto out; 7040 log_pinned = true; 7041 7042 path = btrfs_alloc_path(); 7043 if (!path) { 7044 ret = -ENOMEM; 7045 goto out; 7046 } 7047 7048 /* 7049 * Other concurrent task might be logging the old directory, 7050 * as it can be triggered when logging other inode that had or 7051 * still has a dentry in the old directory. We lock the old 7052 * directory's log_mutex to ensure the deletion of the old 7053 * name is persisted, because during directory logging we 7054 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of 7055 * the old name's dir index item is in the delayed items, so 7056 * it could be missed by an in progress directory logging. 7057 */ 7058 mutex_lock(&old_dir->log_mutex); 7059 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), 7060 old_dentry->d_name.name, 7061 old_dentry->d_name.len, old_dir_index); 7062 if (ret > 0) { 7063 /* 7064 * The dentry does not exist in the log, so record its 7065 * deletion. 
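 *
 * The deletion is recorded by inserting a BTRFS_DIR_LOG_INDEX_KEY item
 * covering only the old index number: on log replay, any dir index entry
 * that falls inside a logged range but has no matching entry in the log
 * tree is removed from the directory (see replay_dir_deletes()).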
7066 */ 7067 btrfs_release_path(path); 7068 ret = insert_dir_log_key(trans, log, path, 7069 btrfs_ino(old_dir), 7070 old_dir_index, old_dir_index); 7071 } 7072 mutex_unlock(&old_dir->log_mutex); 7073 7074 btrfs_free_path(path); 7075 if (ret < 0) 7076 goto out; 7077 } 7078 7079 btrfs_init_log_ctx(&ctx, &inode->vfs_inode); 7080 ctx.logging_new_name = true; 7081 /* 7082 * We don't care about the return value. If we fail to log the new name 7083 * then we know the next attempt to sync the log will fallback to a full 7084 * transaction commit (due to a call to btrfs_set_log_full_commit()), so 7085 * we don't need to worry about getting a log committed that has an 7086 * inconsistent state after a rename operation. 7087 */ 7088 btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); 7089 out: 7090 /* 7091 * If an error happened mark the log for a full commit because it's not 7092 * consistent and up to date or we couldn't find out if one of the 7093 * inodes was logged before in this transaction. Do it before unpinning 7094 * the log, to avoid any races with someone else trying to commit it. 7095 */ 7096 if (ret < 0) 7097 btrfs_set_log_full_commit(trans); 7098 if (log_pinned) 7099 btrfs_end_log_trans(root); 7100 } 7101 7102